Diffstat (limited to '')
-rw-r--r--    src/backend/access/hash/hashovfl.c    1083
1 file changed, 1083 insertions, 0 deletions
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
new file mode 100644
index 0000000..1ff2e0c
--- /dev/null
+++ b/src/backend/access/hash/hashovfl.c
@@ -0,0 +1,1083 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashovfl.c
+ *    Overflow page management code for the Postgres hash access method
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *    src/backend/access/hash/hashovfl.c
+ *
+ * NOTES
+ *    Overflow pages look like ordinary relation pages.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/hash.h"
+#include "access/hash_xlog.h"
+#include "miscadmin.h"
+#include "utils/rel.h"
+
+
+static uint32 _hash_firstfreebit(uint32 map);
+
+
+/*
+ * Convert overflow page bit number (its index in the free-page bitmaps)
+ * to block number within the index.
+ */
+static BlockNumber
+bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
+{
+    uint32      splitnum = metap->hashm_ovflpoint;
+    uint32      i;
+
+    /* Convert zero-based bitnumber to 1-based page number */
+    ovflbitnum += 1;
+
+    /* Determine the split number for this page (must be >= 1) */
+    for (i = 1;
+         i < splitnum && ovflbitnum > metap->hashm_spares[i];
+         i++)
+        /* loop */ ;
+
+    /*
+     * Convert to absolute page number by adding the number of bucket pages
+     * that exist before this split point.
+     */
+    return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum);
+}
+
+/*
+ * _hash_ovflblkno_to_bitno
+ *
+ * Convert overflow page block number to bit number for free-page bitmap.
+ */
+uint32
+_hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
+{
+    uint32      splitnum = metap->hashm_ovflpoint;
+    uint32      i;
+    uint32      bitnum;
+
+    /* Determine the split number containing this page */
+    for (i = 1; i <= splitnum; i++)
+    {
+        if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
+            break;              /* oops */
+        bitnum = ovflblkno - _hash_get_totalbuckets(i);
+
+        /*
+         * bitnum has to be greater than number of overflow page added in
+         * previous split point. The overflow page at this splitnum (i) if any
+         * should start from (_hash_get_totalbuckets(i) +
+         * metap->hashm_spares[i - 1] + 1).
+         */
+        if (bitnum > metap->hashm_spares[i - 1] &&
+            bitnum <= metap->hashm_spares[i])
+            return bitnum - 1;  /* -1 to convert 1-based to 0-based */
+    }
+
+    ereport(ERROR,
+            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+             errmsg("invalid overflow block number %u", ovflblkno)));
+    return 0;                   /* keep compiler quiet */
+}
+
+/*
+ * _hash_addovflpage
+ *
+ * Add an overflow page to the bucket whose last page is pointed to by 'buf'.
+ *
+ * On entry, the caller must hold a pin but no lock on 'buf'. The pin is
+ * dropped before exiting (we assume the caller is not interested in 'buf'
+ * anymore) if not asked to retain. The pin will be retained only for the
+ * primary bucket. The returned overflow page will be pinned and
+ * write-locked; it is guaranteed to be empty.
+ *
+ * The caller must hold a pin, but no lock, on the metapage buffer.
+ * That buffer is returned in the same state.
+ *
+ * NB: since this could be executed concurrently by multiple processes,
+ * one should not assume that the returned overflow page will be the
+ * immediate successor of the originally passed 'buf'. Additional overflow
+ * pages might have been added to the bucket chain in between.
+ */ +Buffer +_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin) +{ + Buffer ovflbuf; + Page page; + Page ovflpage; + HashPageOpaque pageopaque; + HashPageOpaque ovflopaque; + HashMetaPage metap; + Buffer mapbuf = InvalidBuffer; + Buffer newmapbuf = InvalidBuffer; + BlockNumber blkno; + uint32 orig_firstfree; + uint32 splitnum; + uint32 *freep = NULL; + uint32 max_ovflpg; + uint32 bit; + uint32 bitmap_page_bit; + uint32 first_page; + uint32 last_bit; + uint32 last_page; + uint32 i, + j; + bool page_found = false; + + /* + * Write-lock the tail page. Here, we need to maintain locking order such + * that, first acquire the lock on tail page of bucket, then on meta page + * to find and lock the bitmap page and if it is found, then lock on meta + * page is released, then finally acquire the lock on new overflow buffer. + * We need this locking order to avoid deadlock with backends that are + * doing inserts. + * + * Note: We could have avoided locking many buffers here if we made two + * WAL records for acquiring an overflow page (one to allocate an overflow + * page and another to add it to overflow bucket chain). However, doing + * so can leak an overflow page, if the system crashes after allocation. + * Needless to say, it is better to have a single record from a + * performance point of view as well. + */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* probably redundant... */ + _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + + /* loop to find current tail page, in case someone else inserted too */ + for (;;) + { + BlockNumber nextblkno; + + page = BufferGetPage(buf); + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + nextblkno = pageopaque->hasho_nextblkno; + + if (!BlockNumberIsValid(nextblkno)) + break; + + /* we assume we do not need to write the unmodified page */ + if (retain_pin) + { + /* pin will be retained only for the primary bucket page */ + Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + else + _hash_relbuf(rel, buf); + + retain_pin = false; + + buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); + } + + /* Get exclusive lock on the meta page */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + _hash_checkpage(rel, metabuf, LH_META_PAGE); + metap = HashPageGetMeta(BufferGetPage(metabuf)); + + /* start search at hashm_firstfree */ + orig_firstfree = metap->hashm_firstfree; + first_page = orig_firstfree >> BMPG_SHIFT(metap); + bit = orig_firstfree & BMPG_MASK(metap); + i = first_page; + j = bit / BITS_PER_MAP; + bit &= ~(BITS_PER_MAP - 1); + + /* outer loop iterates once per bitmap page */ + for (;;) + { + BlockNumber mapblkno; + Page mappage; + uint32 last_inpage; + + /* want to end search with the last existing overflow page */ + splitnum = metap->hashm_ovflpoint; + max_ovflpg = metap->hashm_spares[splitnum] - 1; + last_page = max_ovflpg >> BMPG_SHIFT(metap); + last_bit = max_ovflpg & BMPG_MASK(metap); + + if (i > last_page) + break; + + Assert(i < metap->hashm_nmaps); + mapblkno = metap->hashm_mapp[i]; + + if (i == last_page) + last_inpage = last_bit; + else + last_inpage = BMPGSZ_BIT(metap) - 1; + + /* Release exclusive lock on metapage while reading bitmap page */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE); + mappage = BufferGetPage(mapbuf); + freep = HashPageGetBitmap(mappage); + + for (; bit <= last_inpage; j++, bit += BITS_PER_MAP) + { + if (freep[j] != ALL_SET) + { + 
page_found = true; + + /* Reacquire exclusive lock on the meta page */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* convert bit to bit number within page */ + bit += _hash_firstfreebit(freep[j]); + bitmap_page_bit = bit; + + /* convert bit to absolute bit number */ + bit += (i << BMPG_SHIFT(metap)); + /* Calculate address of the recycled overflow page */ + blkno = bitno_to_blkno(metap, bit); + + /* Fetch and init the recycled page */ + ovflbuf = _hash_getinitbuf(rel, blkno); + + goto found; + } + } + + /* No free space here, try to advance to next map page */ + _hash_relbuf(rel, mapbuf); + mapbuf = InvalidBuffer; + i++; + j = 0; /* scan from start of next map page */ + bit = 0; + + /* Reacquire exclusive lock on the meta page */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + } + + /* + * No free pages --- have to extend the relation to add an overflow page. + * First, check to see if we have to add a new bitmap page too. + */ + if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1)) + { + /* + * We create the new bitmap page with all pages marked "in use". + * Actually two pages in the new bitmap's range will exist + * immediately: the bitmap page itself, and the following page which + * is the one we return to the caller. Both of these are correctly + * marked "in use". Subsequent pages do not exist yet, but it is + * convenient to pre-mark them as "in use" too. + */ + bit = metap->hashm_spares[splitnum]; + + /* metapage already has a write lock */ + if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("out of overflow pages in hash index \"%s\"", + RelationGetRelationName(rel)))); + + newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM); + } + else + { + /* + * Nothing to do here; since the page will be past the last used page, + * we know its bitmap bit was preinitialized to "in use". + */ + } + + /* Calculate address of the new overflow page */ + bit = BufferIsValid(newmapbuf) ? + metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum]; + blkno = bitno_to_blkno(metap, bit); + + /* + * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the + * relation length stays in sync with ours. XXX It's annoying to do this + * with metapage write lock held; would be better to use a lock that + * doesn't block incoming searches. + * + * It is okay to hold two buffer locks here (one on tail page of bucket + * and other on new overflow page) since there cannot be anyone else + * contending for access to ovflbuf. + */ + ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM); + +found: + + /* + * Do the update. No ereport(ERROR) until changes are logged. We want to + * log the changes for bitmap page and overflow page together to avoid + * loss of pages in case the new page is added. 
+ */ + START_CRIT_SECTION(); + + if (page_found) + { + Assert(BufferIsValid(mapbuf)); + + /* mark page "in use" in the bitmap */ + SETBIT(freep, bitmap_page_bit); + MarkBufferDirty(mapbuf); + } + else + { + /* update the count to indicate new overflow page is added */ + metap->hashm_spares[splitnum]++; + + if (BufferIsValid(newmapbuf)) + { + _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false); + MarkBufferDirty(newmapbuf); + + /* add the new bitmap page to the metapage's list of bitmaps */ + metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf); + metap->hashm_nmaps++; + metap->hashm_spares[splitnum]++; + } + + MarkBufferDirty(metabuf); + + /* + * for new overflow page, we don't need to explicitly set the bit in + * bitmap page, as by default that will be set to "in use". + */ + } + + /* + * Adjust hashm_firstfree to avoid redundant searches. But don't risk + * changing it if someone moved it while we were searching bitmap pages. + */ + if (metap->hashm_firstfree == orig_firstfree) + { + metap->hashm_firstfree = bit + 1; + MarkBufferDirty(metabuf); + } + + /* initialize new overflow page */ + ovflpage = BufferGetPage(ovflbuf); + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf); + ovflopaque->hasho_nextblkno = InvalidBlockNumber; + ovflopaque->hasho_bucket = pageopaque->hasho_bucket; + ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; + ovflopaque->hasho_page_id = HASHO_PAGE_ID; + + MarkBufferDirty(ovflbuf); + + /* logically chain overflow page to previous page */ + pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf); + + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + xl_hash_add_ovfl_page xlrec; + + xlrec.bmpage_found = page_found; + xlrec.bmsize = metap->hashm_bmsize; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage); + + XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT); + XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket)); + + XLogRegisterBuffer(1, buf, REGBUF_STANDARD); + + if (BufferIsValid(mapbuf)) + { + XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD); + XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32)); + } + + if (BufferIsValid(newmapbuf)) + XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT); + + XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD); + XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE); + + PageSetLSN(BufferGetPage(ovflbuf), recptr); + PageSetLSN(BufferGetPage(buf), recptr); + + if (BufferIsValid(mapbuf)) + PageSetLSN(BufferGetPage(mapbuf), recptr); + + if (BufferIsValid(newmapbuf)) + PageSetLSN(BufferGetPage(newmapbuf), recptr); + + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + if (retain_pin) + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, buf); + + if (BufferIsValid(mapbuf)) + _hash_relbuf(rel, mapbuf); + + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + if (BufferIsValid(newmapbuf)) + _hash_relbuf(rel, newmapbuf); + + return ovflbuf; +} + +/* + * _hash_firstfreebit() + * + * Return the number of the first bit that is not set in the word 'map'. 
+ */ +static uint32 +_hash_firstfreebit(uint32 map) +{ + uint32 i, + mask; + + mask = 0x1; + for (i = 0; i < BITS_PER_MAP; i++) + { + if (!(mask & map)) + return i; + mask <<= 1; + } + + elog(ERROR, "firstfreebit found no free bit"); + + return 0; /* keep compiler quiet */ +} + +/* + * _hash_freeovflpage() - + * + * Remove this overflow page from its bucket's chain, and mark the page as + * free. On entry, ovflbuf is write-locked; it is released before exiting. + * + * Add the tuples (itups) to wbuf in this function. We could do that in the + * caller as well, but the advantage of doing it here is we can easily write + * the WAL for XLOG_HASH_SQUEEZE_PAGE operation. Addition of tuples and + * removal of overflow page has to done as an atomic operation, otherwise + * during replay on standby users might find duplicate records. + * + * Since this function is invoked in VACUUM, we provide an access strategy + * parameter that controls fetches of the bucket pages. + * + * Returns the block number of the page that followed the given page + * in the bucket, or InvalidBlockNumber if no following page. + * + * NB: caller must not hold lock on metapage, nor on page, that's next to + * ovflbuf in the bucket chain. We don't acquire the lock on page that's + * prior to ovflbuf in chain if it is same as wbuf because the caller already + * has a lock on same. + */ +BlockNumber +_hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, + Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets, + Size *tups_size, uint16 nitups, + BufferAccessStrategy bstrategy) +{ + HashMetaPage metap; + Buffer metabuf; + Buffer mapbuf; + BlockNumber ovflblkno; + BlockNumber prevblkno; + BlockNumber blkno; + BlockNumber nextblkno; + BlockNumber writeblkno; + HashPageOpaque ovflopaque; + Page ovflpage; + Page mappage; + uint32 *freep; + uint32 ovflbitno; + int32 bitmappage, + bitmapbit; + Bucket bucket PG_USED_FOR_ASSERTS_ONLY; + Buffer prevbuf = InvalidBuffer; + Buffer nextbuf = InvalidBuffer; + bool update_metap = false; + + /* Get information from the doomed page */ + _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE); + ovflblkno = BufferGetBlockNumber(ovflbuf); + ovflpage = BufferGetPage(ovflbuf); + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + nextblkno = ovflopaque->hasho_nextblkno; + prevblkno = ovflopaque->hasho_prevblkno; + writeblkno = BufferGetBlockNumber(wbuf); + bucket = ovflopaque->hasho_bucket; + + /* + * Fix up the bucket chain. this is a doubly-linked list, so we must fix + * up the bucket chain members behind and ahead of the overflow page being + * deleted. Concurrency issues are avoided by using lock chaining as + * described atop hashbucketcleanup. 
+ */ + if (BlockNumberIsValid(prevblkno)) + { + if (prevblkno == writeblkno) + prevbuf = wbuf; + else + prevbuf = _hash_getbuf_with_strategy(rel, + prevblkno, + HASH_WRITE, + LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, + bstrategy); + } + if (BlockNumberIsValid(nextblkno)) + nextbuf = _hash_getbuf_with_strategy(rel, + nextblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); + + /* Note: bstrategy is intentionally not used for metapage and bitmap */ + + /* Read the metapage so we can determine which bitmap page to use */ + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); + metap = HashPageGetMeta(BufferGetPage(metabuf)); + + /* Identify which bit to set */ + ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno); + + bitmappage = ovflbitno >> BMPG_SHIFT(metap); + bitmapbit = ovflbitno & BMPG_MASK(metap); + + if (bitmappage >= metap->hashm_nmaps) + elog(ERROR, "invalid overflow bit number %u", ovflbitno); + blkno = metap->hashm_mapp[bitmappage]; + + /* Release metapage lock while we access the bitmap page */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + /* read the bitmap page to clear the bitmap bit */ + mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE); + mappage = BufferGetPage(mapbuf); + freep = HashPageGetBitmap(mappage); + Assert(ISSET(freep, bitmapbit)); + + /* Get write-lock on metapage to update firstfree */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* This operation needs to log multiple tuples, prepare WAL for that */ + if (RelationNeedsWAL(rel)) + XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups); + + START_CRIT_SECTION(); + + /* + * we have to insert tuples on the "write" page, being careful to preserve + * hashkey ordering. (If we insert many tuples into the same "write" page + * it would be worth qsort'ing them). + */ + if (nitups > 0) + { + _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups); + MarkBufferDirty(wbuf); + } + + /* + * Reinitialize the freed overflow page. Just zeroing the page won't + * work, because WAL replay routines expect pages to be initialized. See + * explanation of RBM_NORMAL mode atop XLogReadBufferExtended. We are + * careful to make the special space valid here so that tools like + * pageinspect won't get confused. 
+ */ + _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); + + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + + ovflopaque->hasho_prevblkno = InvalidBlockNumber; + ovflopaque->hasho_nextblkno = InvalidBlockNumber; + ovflopaque->hasho_bucket = -1; + ovflopaque->hasho_flag = LH_UNUSED_PAGE; + ovflopaque->hasho_page_id = HASHO_PAGE_ID; + + MarkBufferDirty(ovflbuf); + + if (BufferIsValid(prevbuf)) + { + Page prevpage = BufferGetPage(prevbuf); + HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); + + Assert(prevopaque->hasho_bucket == bucket); + prevopaque->hasho_nextblkno = nextblkno; + MarkBufferDirty(prevbuf); + } + if (BufferIsValid(nextbuf)) + { + Page nextpage = BufferGetPage(nextbuf); + HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); + + Assert(nextopaque->hasho_bucket == bucket); + nextopaque->hasho_prevblkno = prevblkno; + MarkBufferDirty(nextbuf); + } + + /* Clear the bitmap bit to indicate that this overflow page is free */ + CLRBIT(freep, bitmapbit); + MarkBufferDirty(mapbuf); + + /* if this is now the first free page, update hashm_firstfree */ + if (ovflbitno < metap->hashm_firstfree) + { + metap->hashm_firstfree = ovflbitno; + update_metap = true; + MarkBufferDirty(metabuf); + } + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_squeeze_page xlrec; + XLogRecPtr recptr; + int i; + + xlrec.prevblkno = prevblkno; + xlrec.nextblkno = nextblkno; + xlrec.ntups = nitups; + xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf); + xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage); + + /* + * bucket buffer needs to be registered to ensure that we can acquire + * a cleanup lock on it during replay. + */ + if (!xlrec.is_prim_bucket_same_wrt) + XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD | REGBUF_NO_IMAGE); + + XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); + if (xlrec.ntups > 0) + { + XLogRegisterBufData(1, (char *) itup_offsets, + nitups * sizeof(OffsetNumber)); + for (i = 0; i < nitups; i++) + XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); + } + + XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD); + + /* + * If prevpage and the writepage (block in which we are moving tuples + * from overflow) are same, then no need to separately register + * prevpage. During replay, we can directly update the nextblock in + * writepage. 
+ */ + if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) + XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD); + + if (BufferIsValid(nextbuf)) + XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD); + + XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD); + XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32)); + + if (update_metap) + { + XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD); + XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32)); + } + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE); + + PageSetLSN(BufferGetPage(wbuf), recptr); + PageSetLSN(BufferGetPage(ovflbuf), recptr); + + if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) + PageSetLSN(BufferGetPage(prevbuf), recptr); + if (BufferIsValid(nextbuf)) + PageSetLSN(BufferGetPage(nextbuf), recptr); + + PageSetLSN(BufferGetPage(mapbuf), recptr); + + if (update_metap) + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + /* release previous bucket if it is not same as write bucket */ + if (BufferIsValid(prevbuf) && prevblkno != writeblkno) + _hash_relbuf(rel, prevbuf); + + if (BufferIsValid(ovflbuf)) + _hash_relbuf(rel, ovflbuf); + + if (BufferIsValid(nextbuf)) + _hash_relbuf(rel, nextbuf); + + _hash_relbuf(rel, mapbuf); + _hash_relbuf(rel, metabuf); + + return nextblkno; +} + + +/* + * _hash_initbitmapbuffer() + * + * Initialize a new bitmap page. All bits in the new bitmap page are set to + * "1", indicating "in use". + */ +void +_hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage) +{ + Page pg; + HashPageOpaque op; + uint32 *freep; + + pg = BufferGetPage(buf); + + /* initialize the page */ + if (initpage) + _hash_pageinit(pg, BufferGetPageSize(buf)); + + /* initialize the page's special space */ + op = (HashPageOpaque) PageGetSpecialPointer(pg); + op->hasho_prevblkno = InvalidBlockNumber; + op->hasho_nextblkno = InvalidBlockNumber; + op->hasho_bucket = -1; + op->hasho_flag = LH_BITMAP_PAGE; + op->hasho_page_id = HASHO_PAGE_ID; + + /* set all of the bits to 1 */ + freep = HashPageGetBitmap(pg); + MemSet(freep, 0xFF, bmsize); + + /* + * Set pd_lower just past the end of the bitmap page data. We could even + * set pd_lower equal to pd_upper, but this is more precise and makes the + * page look compressible to xlog.c. + */ + ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg; +} + + +/* + * _hash_squeezebucket(rel, bucket) + * + * Try to squeeze the tuples onto pages occurring earlier in the + * bucket chain in an attempt to free overflow pages. When we start + * the "squeezing", the page from which we start taking tuples (the + * "read" page) is the last bucket in the bucket chain and the page + * onto which we start squeezing tuples (the "write" page) is the + * first page in the bucket chain. The read page works backward and + * the write page works forward; the procedure terminates when the + * read page and write page are the same page. + * + * At completion of this procedure, it is guaranteed that all pages in + * the bucket are nonempty, unless the bucket is totally empty (in + * which case all overflow pages will be freed). The original implementation + * required that to be true on entry as well, but it's a lot easier for + * callers to leave empty overflow pages and let this guy clean it up. 
+ * + * Caller must acquire cleanup lock on the primary page of the target + * bucket to exclude any scans that are in progress, which could easily + * be confused into returning the same tuple more than once or some tuples + * not at all by the rearrangement we are performing here. To prevent + * any concurrent scan to cross the squeeze scan we use lock chaining + * similar to hashbucketcleanup. Refer comments atop hashbucketcleanup. + * + * We need to retain a pin on the primary bucket to ensure that no concurrent + * split can start. + * + * Since this function is invoked in VACUUM, we provide an access strategy + * parameter that controls fetches of the bucket pages. + */ +void +_hash_squeezebucket(Relation rel, + Bucket bucket, + BlockNumber bucket_blkno, + Buffer bucket_buf, + BufferAccessStrategy bstrategy) +{ + BlockNumber wblkno; + BlockNumber rblkno; + Buffer wbuf; + Buffer rbuf; + Page wpage; + Page rpage; + HashPageOpaque wopaque; + HashPageOpaque ropaque; + + /* + * start squeezing into the primary bucket page. + */ + wblkno = bucket_blkno; + wbuf = bucket_buf; + wpage = BufferGetPage(wbuf); + wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); + + /* + * if there aren't any overflow pages, there's nothing to squeeze. caller + * is responsible for releasing the pin on primary bucket page. + */ + if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) + { + LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); + return; + } + + /* + * Find the last page in the bucket chain by starting at the base bucket + * page and working forward. Note: we assume that a hash bucket chain is + * usually smaller than the buffer ring being used by VACUUM, else using + * the access strategy here would be counterproductive. + */ + rbuf = InvalidBuffer; + ropaque = wopaque; + do + { + rblkno = ropaque->hasho_nextblkno; + if (rbuf != InvalidBuffer) + _hash_relbuf(rel, rbuf); + rbuf = _hash_getbuf_with_strategy(rel, + rblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); + rpage = BufferGetPage(rbuf); + ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); + Assert(ropaque->hasho_bucket == bucket); + } while (BlockNumberIsValid(ropaque->hasho_nextblkno)); + + /* + * squeeze the tuples. + */ + for (;;) + { + OffsetNumber roffnum; + OffsetNumber maxroffnum; + OffsetNumber deletable[MaxOffsetNumber]; + IndexTuple itups[MaxIndexTuplesPerPage]; + Size tups_size[MaxIndexTuplesPerPage]; + OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; + uint16 ndeletable = 0; + uint16 nitups = 0; + Size all_tups_size = 0; + int i; + bool retain_pin = false; + +readpage: + /* Scan each tuple in "read" page */ + maxroffnum = PageGetMaxOffsetNumber(rpage); + for (roffnum = FirstOffsetNumber; + roffnum <= maxroffnum; + roffnum = OffsetNumberNext(roffnum)) + { + IndexTuple itup; + Size itemsz; + + /* skip dead tuples */ + if (ItemIdIsDead(PageGetItemId(rpage, roffnum))) + continue; + + itup = (IndexTuple) PageGetItem(rpage, + PageGetItemId(rpage, roffnum)); + itemsz = IndexTupleSize(itup); + itemsz = MAXALIGN(itemsz); + + /* + * Walk up the bucket chain, looking for a page big enough for + * this item and all other accumulated items. Exit if we reach + * the read page. 
+ */ + while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz)) + { + Buffer next_wbuf = InvalidBuffer; + bool tups_moved = false; + + Assert(!PageIsEmpty(wpage)); + + if (wblkno == bucket_blkno) + retain_pin = true; + + wblkno = wopaque->hasho_nextblkno; + Assert(BlockNumberIsValid(wblkno)); + + /* don't need to move to next page if we reached the read page */ + if (wblkno != rblkno) + next_wbuf = _hash_getbuf_with_strategy(rel, + wblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); + + if (nitups > 0) + { + Assert(nitups == ndeletable); + + /* + * This operation needs to log multiple tuples, prepare + * WAL for that. + */ + if (RelationNeedsWAL(rel)) + XLogEnsureRecordSpace(0, 3 + nitups); + + START_CRIT_SECTION(); + + /* + * we have to insert tuples on the "write" page, being + * careful to preserve hashkey ordering. (If we insert + * many tuples into the same "write" page it would be + * worth qsort'ing them). + */ + _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups); + MarkBufferDirty(wbuf); + + /* Delete tuples we already moved off read page */ + PageIndexMultiDelete(rpage, deletable, ndeletable); + MarkBufferDirty(rbuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + xl_hash_move_page_contents xlrec; + + xlrec.ntups = nitups; + xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf) ? true : false; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents); + + /* + * bucket buffer needs to be registered to ensure that + * we can acquire a cleanup lock on it during replay. + */ + if (!xlrec.is_prim_bucket_same_wrt) + XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); + + XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); + XLogRegisterBufData(1, (char *) itup_offsets, + nitups * sizeof(OffsetNumber)); + for (i = 0; i < nitups; i++) + XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); + + XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD); + XLogRegisterBufData(2, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS); + + PageSetLSN(BufferGetPage(wbuf), recptr); + PageSetLSN(BufferGetPage(rbuf), recptr); + } + + END_CRIT_SECTION(); + + tups_moved = true; + } + + /* + * release the lock on previous page after acquiring the lock + * on next page + */ + if (retain_pin) + LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, wbuf); + + /* nothing more to do if we reached the read page */ + if (rblkno == wblkno) + { + _hash_relbuf(rel, rbuf); + return; + } + + wbuf = next_wbuf; + wpage = BufferGetPage(wbuf); + wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); + Assert(wopaque->hasho_bucket == bucket); + retain_pin = false; + + /* be tidy */ + for (i = 0; i < nitups; i++) + pfree(itups[i]); + nitups = 0; + all_tups_size = 0; + ndeletable = 0; + + /* + * after moving the tuples, rpage would have been compacted, + * so we need to rescan it. + */ + if (tups_moved) + goto readpage; + } + + /* remember tuple for deletion from "read" page */ + deletable[ndeletable++] = roffnum; + + /* + * we need a copy of index tuples as they can be freed as part of + * overflow page, however we need them to write a WAL record in + * _hash_freeovflpage. + */ + itups[nitups] = CopyIndexTuple(itup); + tups_size[nitups++] = itemsz; + all_tups_size += itemsz; + } + + /* + * If we reach here, there are no live tuples on the "read" page --- + * it was empty when we got to it, or we moved them all. 
So we can
+         * just free the page without bothering with deleting tuples
+         * individually. Then advance to the previous "read" page.
+         *
+         * Tricky point here: if our read and write pages are adjacent in the
+         * bucket chain, our write lock on wbuf will conflict with
+         * _hash_freeovflpage's attempt to update the sibling links of the
+         * removed page. In that case, we don't need to lock it again.
+         */
+        rblkno = ropaque->hasho_prevblkno;
+        Assert(BlockNumberIsValid(rblkno));
+
+        /* free this overflow page (releases rbuf) */
+        _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets,
+                           tups_size, nitups, bstrategy);
+
+        /* be tidy */
+        for (i = 0; i < nitups; i++)
+            pfree(itups[i]);
+
+        /* are we freeing the page adjacent to wbuf? */
+        if (rblkno == wblkno)
+        {
+            /* retain the pin on primary bucket page till end of bucket scan */
+            if (wblkno == bucket_blkno)
+                LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
+            else
+                _hash_relbuf(rel, wbuf);
+            return;
+        }
+
+        rbuf = _hash_getbuf_with_strategy(rel,
+                                          rblkno,
+                                          HASH_WRITE,
+                                          LH_OVERFLOW_PAGE,
+                                          bstrategy);
+        rpage = BufferGetPage(rbuf);
+        ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
+        Assert(ropaque->hasho_bucket == bucket);
+    }
+
+    /* NOTREACHED */
+}
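
The two conversions near the top of the file, bitno_to_blkno() and _hash_ovflblkno_to_bitno(), are easiest to follow with concrete numbers. The standalone sketch below mirrors their loops on an invented totalbuckets[]/spares[] layout and checks that the mapping round-trips. It is illustrative only: the array values are made up, and none of the real PostgreSQL metapage structures are used.

/*
 * Toy model (not PostgreSQL code) of the overflow-bit <-> block-number
 * mapping.  "totalbuckets" stands in for _hash_get_totalbuckets() and
 * "spares" for hashm_spares[]; both hold invented values.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SPLITPOINTS 4

/* bucket pages that exist before each split point (hypothetical) */
static const uint32_t totalbuckets[SPLITPOINTS] = {1, 2, 4, 8};
/* cumulative overflow/bitmap pages allocated before each split point */
static const uint32_t spares[SPLITPOINTS] = {0, 1, 3, 6};

static uint32_t ovflpoint = SPLITPOINTS - 1;   /* current split point */

/* mirrors bitno_to_blkno(): 0-based bit number -> block number */
static uint32_t
toy_bitno_to_blkno(uint32_t bitnum)
{
    uint32_t    i;

    bitnum += 1;                /* convert to 1-based page number */
    for (i = 1; i < ovflpoint && bitnum > spares[i]; i++)
        /* skip to the split point that owns this page */ ;
    return totalbuckets[i] + bitnum;
}

/* mirrors _hash_ovflblkno_to_bitno(): block number -> 0-based bit number */
static uint32_t
toy_blkno_to_bitno(uint32_t blkno)
{
    uint32_t    i;

    for (i = 1; i <= ovflpoint; i++)
    {
        uint32_t    bitnum;

        if (blkno <= totalbuckets[i])
            break;
        bitnum = blkno - totalbuckets[i];
        if (bitnum > spares[i - 1] && bitnum <= spares[i])
            return bitnum - 1;
    }
    assert(!"invalid overflow block number");
    return 0;
}

int
main(void)
{
    /* round-trip every overflow bit that exists at this split point */
    for (uint32_t bit = 0; bit < spares[ovflpoint]; bit++)
    {
        uint32_t    blkno = toy_bitno_to_blkno(bit);

        printf("bit %u -> block %u\n", bit, blkno);
        assert(toy_blkno_to_bitno(blkno) == bit);
    }
    return 0;
}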
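
_hash_addovflpage() and _hash_freeovflpage() both revolve around the free-page bitmaps: a set bit marks an overflow page as in use, allocation looks for the first bitmap word that is not ALL_SET and then the first clear bit inside it, and freeing clears the bit again. The sketch below is a minimal, self-contained approximation of that bookkeeping; the helper names and the 32-bit word size are assumptions made for illustration, not the actual SETBIT/CLRBIT/ISSET macros or bitmap-page layout.

/*
 * Minimal sketch (not PostgreSQL code) of the free-space bitmap idea:
 * one bit per overflow page, set = "in use".  first_free_bit() mirrors
 * the loop in _hash_firstfreebit(); the set/clear/test helpers only
 * approximate what SETBIT/CLRBIT/ISSET do on a real bitmap page.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BITS_PER_WORD   32
#define ALL_SET         0xFFFFFFFFu

static void set_bit(uint32_t *map, uint32_t bit)   { map[bit / BITS_PER_WORD] |= 1u << (bit % BITS_PER_WORD); }
static void clear_bit(uint32_t *map, uint32_t bit) { map[bit / BITS_PER_WORD] &= ~(1u << (bit % BITS_PER_WORD)); }
static int  bit_is_set(const uint32_t *map, uint32_t bit) { return (map[bit / BITS_PER_WORD] >> (bit % BITS_PER_WORD)) & 1; }

/* position of the first clear bit in 'word' (cf. _hash_firstfreebit) */
static uint32_t
first_free_bit(uint32_t word)
{
    uint32_t    mask = 0x1;

    for (uint32_t i = 0; i < BITS_PER_WORD; i++)
    {
        if (!(mask & word))
            return i;
        mask <<= 1;
    }
    assert(!"no free bit in word");
    return 0;
}

int
main(void)
{
    uint32_t    map[2] = {0, 0};

    /* allocate three "pages": bits 0, 1, 2 */
    for (int n = 0; n < 3; n++)
    {
        uint32_t    bit = 0;

        /* find the first word with a free bit, then the bit inside it */
        for (uint32_t w = 0; w < 2; w++)
        {
            if (map[w] != ALL_SET)
            {
                bit = w * BITS_PER_WORD + first_free_bit(map[w]);
                break;
            }
        }
        set_bit(map, bit);
        printf("allocated bit %u\n", bit);
    }

    /* free bit 1 again, as _hash_freeovflpage() does with CLRBIT */
    clear_bit(map, 1);
    assert(!bit_is_set(map, 1) && bit_is_set(map, 0) && bit_is_set(map, 2));
    return 0;
}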
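
_hash_squeezebucket() packs a bucket chain by walking a read pointer backward from the tail and a write pointer forward from the head until they meet, freeing each read page once its tuples have been moved. The toy loop below models only that two-pointer shape, with invented page capacities; locking, WAL logging, hashkey ordering, and the real tuple movement performed via _hash_pgaddmultitup() and _hash_freeovflpage() are all omitted.

/*
 * Toy illustration (not PostgreSQL code) of the squeeze strategy: read
 * from the last page of the chain, pack into the earliest page with room,
 * free emptied tail pages, stop when read and write positions meet.
 */
#include <stdio.h>

#define NPAGES      5
#define CAPACITY    4

int
main(void)
{
    /* hypothetical bucket chain: tuples currently on each page */
    int     ntups[NPAGES] = {4, 1, 0, 2, 1};
    int     write = 0;              /* front of chain */
    int     read = NPAGES - 1;      /* tail of chain */

    while (read > write)
    {
        /* move tuples from the read page into pages with free space */
        while (ntups[read] > 0 && write < read)
        {
            if (ntups[write] < CAPACITY)
            {
                ntups[write]++;
                ntups[read]--;
            }
            else
                write++;            /* advance to the next page in the chain */
        }

        if (ntups[read] == 0)
        {
            printf("page %d is empty and would be freed\n", read);
            read--;                 /* advance to the previous page */
        }
        else
            break;                  /* read page still has tuples: chain is packed */
    }

    for (int i = 0; i <= read; i++)
        printf("page %d keeps %d tuples\n", i, ntups[i]);
    return 0;
}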