author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:15:05 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:15:05 +0000
commit     46651ce6fe013220ed397add242004d764fc0153 (patch)
tree       6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/access/brin/brin_pageops.c
parent     Initial commit. (diff)
Adding upstream version 14.5.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/access/brin/brin_pageops.c')
-rw-r--r--  src/backend/access/brin/brin_pageops.c  920
1 file changed, 920 insertions(+), 0 deletions(-)
diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c
new file mode 100644
index 0000000..992b33a
--- /dev/null
+++ b/src/backend/access/brin/brin_pageops.c
@@ -0,0 +1,920 @@
+/*
+ * brin_pageops.c
+ *		Page-handling routines for BRIN indexes
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *		src/backend/access/brin/brin_pageops.c
+ */
+#include "postgres.h"
+
+#include "access/brin_page.h"
+#include "access/brin_pageops.h"
+#include "access/brin_revmap.h"
+#include "access/brin_xlog.h"
+#include "access/xloginsert.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/freespace.h"
+#include "storage/lmgr.h"
+#include "storage/smgr.h"
+#include "utils/rel.h"
+
+/*
+ * Maximum size of an entry in a BRIN_PAGETYPE_REGULAR page. We can tolerate
+ * a single item per page, unlike other index AMs.
+ */
+#define BrinMaxItemSize \
+	MAXALIGN_DOWN(BLCKSZ - \
+				  (MAXALIGN(SizeOfPageHeaderData + \
+							sizeof(ItemIdData)) + \
+				   MAXALIGN(sizeof(BrinSpecialSpace))))
+
+static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
+								   bool *extended);
+static Size br_page_get_freespace(Page page);
+static void brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer);
+
+
+/*
+ * Update tuple origtup (size origsz), located in offset oldoff of buffer
+ * oldbuf, to newtup (size newsz) as summary tuple for the page range starting
+ * at heapBlk. oldbuf must not be locked on entry, and is not locked at exit.
+ *
+ * If samepage is true, attempt to put the new tuple in the same page, but if
+ * there's no room, use some other one.
+ *
+ * If the update is successful, return true; the revmap is updated to point to
+ * the new tuple. If the update is not done for whatever reason, return false.
+ * Caller may retry the update if this happens.
+ */
+bool
+brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
+			  BrinRevmap *revmap, BlockNumber heapBlk,
+			  Buffer oldbuf, OffsetNumber oldoff,
+			  const BrinTuple *origtup, Size origsz,
+			  const BrinTuple *newtup, Size newsz,
+			  bool samepage)
+{
+	Page		oldpage;
+	ItemId		oldlp;
+	BrinTuple  *oldtup;
+	Size		oldsz;
+	Buffer		newbuf;
+	BlockNumber newblk = InvalidBlockNumber;
+	bool		extended;
+
+	Assert(newsz == MAXALIGN(newsz));
+
+	/* If the item is oversized, don't bother. */
+	if (newsz > BrinMaxItemSize)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+						newsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
+		return false;			/* keep compiler quiet */
+	}
+
+	/* make sure the revmap is long enough to contain the entry we need */
+	brinRevmapExtend(revmap, heapBlk);
+
+	if (!samepage)
+	{
+		/* need a page on which to put the item */
+		newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
+		if (!BufferIsValid(newbuf))
+		{
+			Assert(!extended);
+			return false;
+		}
+
+		/*
+		 * Note: it's possible (though unlikely) that the returned newbuf is
+		 * the same as oldbuf, if brin_getinsertbuffer determined that the
+		 * old buffer does in fact have enough space.
+		 */
+		if (newbuf == oldbuf)
+		{
+			Assert(!extended);
+			newbuf = InvalidBuffer;
+		}
+		else
+			newblk = BufferGetBlockNumber(newbuf);
+	}
+	else
+	{
+		LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+		newbuf = InvalidBuffer;
+		extended = false;
+	}
+	oldpage = BufferGetPage(oldbuf);
+	oldlp = PageGetItemId(oldpage, oldoff);
+
+	/*
+	 * Check that the old tuple wasn't updated concurrently: it might have
+	 * moved someplace else entirely, and for that matter the whole page
+	 * might've become a revmap page. Note that in the first two cases
+	 * checked here, the "oldlp" we just calculated is garbage; but
+	 * PageGetItemId() is simple enough that it was safe to do that
+	 * calculation anyway.
+	 */
+	if (!BRIN_IS_REGULAR_PAGE(oldpage) ||
+		oldoff > PageGetMaxOffsetNumber(oldpage) ||
+		!ItemIdIsNormal(oldlp))
+	{
+		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+
+		/*
+		 * If this happens, and the new buffer was obtained by extending the
+		 * relation, then we need to ensure we don't leave it uninitialized
+		 * or forget about it.
+		 */
+		if (BufferIsValid(newbuf))
+		{
+			if (extended)
+				brin_initialize_empty_new_buffer(idxrel, newbuf);
+			UnlockReleaseBuffer(newbuf);
+			if (extended)
+				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
+		}
+		return false;
+	}
+
+	oldsz = ItemIdGetLength(oldlp);
+	oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp);
+
+	/*
+	 * ... or it might have been updated in place to different contents.
+	 */
+	if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
+	{
+		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+		if (BufferIsValid(newbuf))
+		{
+			/* As above, initialize and record new page if we got one */
+			if (extended)
+				brin_initialize_empty_new_buffer(idxrel, newbuf);
+			UnlockReleaseBuffer(newbuf);
+			if (extended)
+				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
+		}
+		return false;
+	}
+
+	/*
+	 * Great, the old tuple is intact. We can proceed with the update.
+	 *
+	 * If there's enough room in the old page for the new tuple, replace it.
+	 *
+	 * Note that there might now be enough space on the page even though the
+	 * caller told us there isn't, if a concurrent update moved another tuple
+	 * elsewhere or replaced a tuple with a smaller one.
+	 */
+	if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) &&
+		brin_can_do_samepage_update(oldbuf, origsz, newsz))
+	{
+		START_CRIT_SECTION();
+		if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) unconstify(BrinTuple *, newtup), newsz))
+			elog(ERROR, "failed to replace BRIN tuple");
+		MarkBufferDirty(oldbuf);
+
+		/* XLOG stuff */
+		if (RelationNeedsWAL(idxrel))
+		{
+			xl_brin_samepage_update xlrec;
+			XLogRecPtr	recptr;
+			uint8		info = XLOG_BRIN_SAMEPAGE_UPDATE;
+
+			xlrec.offnum = oldoff;
+
+			XLogBeginInsert();
+			XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate);
+
+			XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD);
+			XLogRegisterBufData(0, (char *) unconstify(BrinTuple *, newtup), newsz);
+
+			recptr = XLogInsert(RM_BRIN_ID, info);
+
+			PageSetLSN(oldpage, recptr);
+		}
+
+		END_CRIT_SECTION();
+
+		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+
+		if (BufferIsValid(newbuf))
+		{
+			/* As above, initialize and record new page if we got one */
+			if (extended)
+				brin_initialize_empty_new_buffer(idxrel, newbuf);
+			UnlockReleaseBuffer(newbuf);
+			if (extended)
+				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
+		}
+
+		return true;
+	}
+	else if (newbuf == InvalidBuffer)
+	{
+		/*
+		 * Not enough space, but caller said that there was. Tell them to
+		 * start over.
+		 */
+		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+		return false;
+	}
+	else
+	{
+		/*
+		 * Not enough free space on the oldpage. Put the new tuple on the
+		 * new page, and update the revmap.
+		 */
+		Page		newpage = BufferGetPage(newbuf);
+		Buffer		revmapbuf;
+		ItemPointerData newtid;
+		OffsetNumber newoff;
+		Size		freespace = 0;
+
+		revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
+
+		START_CRIT_SECTION();
+
+		/*
+		 * We need to initialize the page if it's newly obtained. Note we
+		 * will WAL-log the initialization as part of the update, so we don't
+		 * need to do that here.
+		 */
+		if (extended)
+			brin_page_init(newpage, BRIN_PAGETYPE_REGULAR);
+
+		PageIndexTupleDeleteNoCompact(oldpage, oldoff);
+		newoff = PageAddItem(newpage, (Item) unconstify(BrinTuple *, newtup), newsz,
+							 InvalidOffsetNumber, false, false);
+		if (newoff == InvalidOffsetNumber)
+			elog(ERROR, "failed to add BRIN tuple to new page");
+		MarkBufferDirty(oldbuf);
+		MarkBufferDirty(newbuf);
+
+		/* needed to update FSM below */
+		if (extended)
+			freespace = br_page_get_freespace(newpage);
+
+		ItemPointerSet(&newtid, newblk, newoff);
+		brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
+		MarkBufferDirty(revmapbuf);
+
+		/* XLOG stuff */
+		if (RelationNeedsWAL(idxrel))
+		{
+			xl_brin_update xlrec;
+			XLogRecPtr	recptr;
+			uint8		info;
+
+			info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0);
+
+			xlrec.insert.offnum = newoff;
+			xlrec.insert.heapBlk = heapBlk;
+			xlrec.insert.pagesPerRange = pagesPerRange;
+			xlrec.oldOffnum = oldoff;
+
+			XLogBeginInsert();
+
+			/* new page */
+			XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate);
+
+			XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
+			XLogRegisterBufData(0, (char *) unconstify(BrinTuple *, newtup), newsz);
+
+			/* revmap page */
+			XLogRegisterBuffer(1, revmapbuf, 0);
+
+			/* old page */
+			XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD);
+
+			recptr = XLogInsert(RM_BRIN_ID, info);
+
+			PageSetLSN(oldpage, recptr);
+			PageSetLSN(newpage, recptr);
+			PageSetLSN(BufferGetPage(revmapbuf), recptr);
+		}
+
+		END_CRIT_SECTION();
+
+		LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
+		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+		UnlockReleaseBuffer(newbuf);
+
+		if (extended)
+		{
+			RecordPageWithFreeSpace(idxrel, newblk, freespace);
+			FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
+		}
+
+		return true;
+	}
+}
+
+/*
+ * Return whether brin_doupdate can do a samepage update.
+ */
+bool
+brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
+{
+	return
+		((newsz <= origsz) ||
+		 PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz));
+}
+
+/*
+ * Insert an index tuple into the index relation. The revmap is updated to
+ * mark the range containing the given page as pointing to the inserted entry.
+ * A WAL record is written.
+ *
+ * The buffer, if valid, is first checked for free space to insert the new
+ * entry; if there isn't enough, a new buffer is obtained and pinned. No
+ * buffer lock must be held on entry, no buffer lock is held on exit.
+ *
+ * Return value is the offset number where the tuple was inserted.
+ */
+OffsetNumber
+brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
+			  BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
+			  BrinTuple *tup, Size itemsz)
+{
+	Page		page;
+	BlockNumber blk;
+	OffsetNumber off;
+	Size		freespace = 0;
+	Buffer		revmapbuf;
+	ItemPointerData tid;
+	bool		extended;
+
+	Assert(itemsz == MAXALIGN(itemsz));
+
+	/* If the item is oversized, don't even bother. */
+	if (itemsz > BrinMaxItemSize)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+						itemsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
+		return InvalidOffsetNumber; /* keep compiler quiet */
+	}
+
+	/* Make sure the revmap is long enough to contain the entry we need */
+	brinRevmapExtend(revmap, heapBlk);
+
+	/*
+	 * Acquire lock on buffer supplied by caller, if any. If it doesn't have
+	 * enough space, unpin it to obtain a new one below.
+	 */
+	if (BufferIsValid(*buffer))
+	{
+		/*
+		 * It's possible that another backend (or ourselves!) extended the
+		 * revmap over the page we held a pin on, so we cannot assume that
+		 * it's still a regular page.
+		 */
+		LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+		if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
+		{
+			UnlockReleaseBuffer(*buffer);
+			*buffer = InvalidBuffer;
+		}
+	}
+
+	/*
+	 * If we still don't have a usable buffer, have brin_getinsertbuffer
+	 * obtain one for us.
+	 */
+	if (!BufferIsValid(*buffer))
+	{
+		do
+			*buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);
+		while (!BufferIsValid(*buffer));
+	}
+	else
+		extended = false;
+
+	/* Now obtain lock on revmap buffer */
+	revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
+
+	page = BufferGetPage(*buffer);
+	blk = BufferGetBlockNumber(*buffer);
+
+	/* Execute the actual insertion */
+	START_CRIT_SECTION();
+	if (extended)
+		brin_page_init(page, BRIN_PAGETYPE_REGULAR);
+	off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
+					  false, false);
+	if (off == InvalidOffsetNumber)
+		elog(ERROR, "failed to add BRIN tuple to new page");
+	MarkBufferDirty(*buffer);
+
+	/* needed to update FSM below */
+	if (extended)
+		freespace = br_page_get_freespace(page);
+
+	ItemPointerSet(&tid, blk, off);
+	brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
+	MarkBufferDirty(revmapbuf);
+
+	/* XLOG stuff */
+	if (RelationNeedsWAL(idxrel))
+	{
+		xl_brin_insert xlrec;
+		XLogRecPtr	recptr;
+		uint8		info;
+
+		info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0);
+		xlrec.heapBlk = heapBlk;
+		xlrec.pagesPerRange = pagesPerRange;
+		xlrec.offnum = off;
+
+		XLogBeginInsert();
+		XLogRegisterData((char *) &xlrec, SizeOfBrinInsert);
+
+		XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
+		XLogRegisterBufData(0, (char *) tup, itemsz);
+
+		XLogRegisterBuffer(1, revmapbuf, 0);
+
+		recptr = XLogInsert(RM_BRIN_ID, info);
+
+		PageSetLSN(page, recptr);
+		PageSetLSN(BufferGetPage(revmapbuf), recptr);
+	}
+
+	END_CRIT_SECTION();
+
+	/* Tuple is firmly on buffer; we can release our locks */
+	LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
+	LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
+
+	BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
+			   blk, off, heapBlk));
+
+	if (extended)
+	{
+		RecordPageWithFreeSpace(idxrel, blk, freespace);
+		FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
+	}
+
+	return off;
+}
+
+/*
+ * Initialize a page with the given type.
+ *
+ * Caller is responsible for marking it dirty, as appropriate.
+ */
+void
+brin_page_init(Page page, uint16 type)
+{
+	PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace));
+
+	BrinPageType(page) = type;
+}
+
+/*
+ * Initialize a new BRIN index's metapage.
+ */
+void
+brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
+{
+	BrinMetaPageData *metadata;
+
+	brin_page_init(page, BRIN_PAGETYPE_META);
+
+	metadata = (BrinMetaPageData *) PageGetContents(page);
+
+	metadata->brinMagic = BRIN_META_MAGIC;
+	metadata->brinVersion = version;
+	metadata->pagesPerRange = pagesPerRange;
+
+	/*
+	 * Note we cheat here a little. 0 is not a valid revmap block number
+	 * (because it's the metapage buffer), but doing this enables the first
+	 * revmap page to be created when the index is.
+	 */
+	metadata->lastRevmapPage = 0;
+
+	/*
+	 * Set pd_lower just past the end of the metadata. This is essential,
+	 * because without doing so, metadata will be lost if xlog.c compresses
+	 * the page.
+	 */
+	((PageHeader) page)->pd_lower =
+		((char *) metadata + sizeof(BrinMetaPageData)) - (char *) page;
+}
+
+/*
+ * Initiate page evacuation protocol.
+ *
+ * The page must be locked in exclusive mode by the caller.
+ *
+ * If the page is not yet initialized or empty, return false without doing
+ * anything; it can be used for revmap without any further changes. If it
+ * contains tuples, mark it for evacuation and return true.
+ */
+bool
+brin_start_evacuating_page(Relation idxRel, Buffer buf)
+{
+	OffsetNumber off;
+	OffsetNumber maxoff;
+	Page		page;
+
+	page = BufferGetPage(buf);
+
+	if (PageIsNew(page))
+		return false;
+
+	maxoff = PageGetMaxOffsetNumber(page);
+	for (off = FirstOffsetNumber; off <= maxoff; off++)
+	{
+		ItemId		lp;
+
+		lp = PageGetItemId(page, off);
+		if (ItemIdIsUsed(lp))
+		{
+			/*
+			 * Prevent other backends from adding more stuff to this page:
+			 * BRIN_EVACUATE_PAGE informs br_page_get_freespace that this
+			 * page can no longer be used to add new tuples. Note that this
+			 * flag is not WAL-logged, except accidentally.
+			 */
+			BrinPageFlags(page) |= BRIN_EVACUATE_PAGE;
+			MarkBufferDirtyHint(buf, true);
+
+			return true;
+		}
+	}
+	return false;
+}
+
+/*
+ * Move all tuples out of a page.
+ *
+ * The caller must hold lock on the page. The lock and pin are released.
+ */
+void
+brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
+				   BrinRevmap *revmap, Buffer buf)
+{
+	OffsetNumber off;
+	OffsetNumber maxoff;
+	Page		page;
+	BrinTuple  *btup = NULL;
+	Size		btupsz = 0;
+
+	page = BufferGetPage(buf);
+
+	Assert(BrinPageFlags(page) & BRIN_EVACUATE_PAGE);
+
+	maxoff = PageGetMaxOffsetNumber(page);
+	for (off = FirstOffsetNumber; off <= maxoff; off++)
+	{
+		BrinTuple  *tup;
+		Size		sz;
+		ItemId		lp;
+
+		CHECK_FOR_INTERRUPTS();
+
+		lp = PageGetItemId(page, off);
+		if (ItemIdIsUsed(lp))
+		{
+			sz = ItemIdGetLength(lp);
+			tup = (BrinTuple *) PageGetItem(page, lp);
+			tup = brin_copy_tuple(tup, sz, btup, &btupsz);
+
+			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+			if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno,
+							   buf, off, tup, sz, tup, sz, false))
+				off--;			/* retry */
+
+			LockBuffer(buf, BUFFER_LOCK_SHARE);
+
+			/* It's possible that someone extended the revmap over this page */
+			if (!BRIN_IS_REGULAR_PAGE(page))
+				break;
+		}
+	}
+
+	UnlockReleaseBuffer(buf);
+}
+
+/*
+ * Given a BRIN index page, initialize it if necessary, and record its
+ * current free space in the FSM.
+ *
+ * The main use for this is when, during vacuuming, an uninitialized page is
+ * found, which could be the result of relation extension followed by a crash
+ * before the page can be used.
+ *
+ * Here, we don't bother to update upper FSM pages, instead expecting that our
+ * caller (brin_vacuum_scan) will fix them at the end of the scan.
+ * Elsewhere in this file, it's generally a good idea to propagate additions
+ * of free space into the upper FSM pages immediately.
+ */
+void
+brin_page_cleanup(Relation idxrel, Buffer buf)
+{
+	Page		page = BufferGetPage(buf);
+
+	/*
+	 * If a page was left uninitialized, initialize it now; also record it in
+	 * FSM.
+	 *
+	 * Somebody else might be extending the relation concurrently. To avoid
+	 * re-initializing the page before they can grab the buffer lock, we
+	 * acquire the extension lock momentarily. Since they hold the extension
+	 * lock from before getting the page and after it's been initialized,
+	 * we're sure to see their initialization.
+	 */
+	if (PageIsNew(page))
+	{
+		LockRelationForExtension(idxrel, ShareLock);
+		UnlockRelationForExtension(idxrel, ShareLock);
+
+		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+		if (PageIsNew(page))
+		{
+			brin_initialize_empty_new_buffer(idxrel, buf);
+			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+			return;
+		}
+		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+	}
+
+	/* Nothing to be done for non-regular index pages */
+	if (BRIN_IS_META_PAGE(BufferGetPage(buf)) ||
+		BRIN_IS_REVMAP_PAGE(BufferGetPage(buf)))
+		return;
+
+	/* Measure free space and record it */
+	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
+							br_page_get_freespace(page));
+}
+
+/*
+ * Return a pinned and exclusively locked buffer which can be used to insert
+ * an index item of size itemsz (caller must ensure not to request sizes
+ * impossible to fulfill). If oldbuf is a valid buffer, it is also locked (in
+ * an order determined to avoid deadlocks).
+ *
+ * If we find that the old page is no longer a regular index page (because
+ * of a revmap extension), the old buffer is unlocked and we return
+ * InvalidBuffer.
+ *
+ * If there's no existing page with enough free space to accommodate the new
+ * item, the relation is extended. If this happens, *extended is set to true,
+ * and it is the caller's responsibility to initialize the page (and WAL-log
+ * that fact) prior to use. The caller should also update the FSM with the
+ * page's remaining free space after the insertion.
+ *
+ * Note that the caller is not expected to update FSM unless *extended is set
+ * true. This policy means that we'll update FSM when a page is created, and
+ * when it's found to have too little space for a desired tuple insertion,
+ * but not every single time we add a tuple to the page.
+ *
+ * Note that in some corner cases it is possible for this routine to extend
+ * the relation and then not return the new page. It is this routine's
+ * responsibility to WAL-log the page initialization and to record the page
+ * in FSM if that happens, since the caller certainly can't do it.
+ */
+static Buffer
+brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
+					 bool *extended)
+{
+	BlockNumber oldblk;
+	BlockNumber newblk;
+	Page		page;
+	Size		freespace;
+
+	/* callers must have checked */
+	Assert(itemsz <= BrinMaxItemSize);
+
+	if (BufferIsValid(oldbuf))
+		oldblk = BufferGetBlockNumber(oldbuf);
+	else
+		oldblk = InvalidBlockNumber;
+
+	/* Choose initial target page, re-using existing target if known */
+	newblk = RelationGetTargetBlock(irel);
+	if (newblk == InvalidBlockNumber)
+		newblk = GetPageWithFreeSpace(irel, itemsz);
+
+	/*
+	 * Loop until we find a page with sufficient free space. By the time we
+	 * return to caller out of this loop, both buffers are valid and locked;
+	 * if we have to restart here, neither page is locked and newblk isn't
+	 * pinned (if it's even valid).
+	 */
+	for (;;)
+	{
+		Buffer		buf;
+		bool		extensionLockHeld = false;
+
+		CHECK_FOR_INTERRUPTS();
+
+		*extended = false;
+
+		if (newblk == InvalidBlockNumber)
+		{
+			/*
+			 * There's not enough free space in any existing index page,
+			 * according to the FSM: extend the relation to obtain a shiny
+			 * new page.
+			 */
+			if (!RELATION_IS_LOCAL(irel))
+			{
+				LockRelationForExtension(irel, ExclusiveLock);
+				extensionLockHeld = true;
+			}
+			buf = ReadBuffer(irel, P_NEW);
+			newblk = BufferGetBlockNumber(buf);
+			*extended = true;
+
+			BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u",
+					   BufferGetBlockNumber(buf)));
+		}
+		else if (newblk == oldblk)
+		{
+			/*
+			 * There's an odd corner-case here where the FSM is out-of-date,
+			 * and gave us the old page.
+			 */
+			buf = oldbuf;
+		}
+		else
+		{
+			buf = ReadBuffer(irel, newblk);
+		}
+
+		/*
+		 * We lock the old buffer first, if it's earlier than the new one;
+		 * but then we need to check that it hasn't been turned into a
+		 * revmap page concurrently. If we detect that that happened, give
+		 * up and tell caller to start over.
+		 */
+		if (BufferIsValid(oldbuf) && oldblk < newblk)
+		{
+			LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+			if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
+			{
+				LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+
+				/*
+				 * It is possible that the new page was obtained from
+				 * extending the relation. In that case, we must be sure to
+				 * record it in the FSM before leaving, because otherwise the
+				 * space would be lost forever. However, we cannot let an
+				 * uninitialized page get in the FSM, so we need to
+				 * initialize it first.
+				 */
+				if (*extended)
+					brin_initialize_empty_new_buffer(irel, buf);
+
+				if (extensionLockHeld)
+					UnlockRelationForExtension(irel, ExclusiveLock);
+
+				ReleaseBuffer(buf);
+
+				if (*extended)
+				{
+					FreeSpaceMapVacuumRange(irel, newblk, newblk + 1);
+					/* shouldn't matter, but don't confuse caller */
+					*extended = false;
+				}
+
+				return InvalidBuffer;
+			}
+		}
+
+		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+		if (extensionLockHeld)
+			UnlockRelationForExtension(irel, ExclusiveLock);
+
+		page = BufferGetPage(buf);
+
+		/*
+		 * We have a new buffer to insert into. Check that the new page has
+		 * enough free space, and return it if it does; otherwise start over.
+		 * (br_page_get_freespace also checks that the FSM didn't hand us a
+		 * page that has since been repurposed for the revmap.)
+		 */
+		freespace = *extended ?
+			BrinMaxItemSize : br_page_get_freespace(page);
+		if (freespace >= itemsz)
+		{
+			RelationSetTargetBlock(irel, newblk);
+
+			/*
+			 * Lock the old buffer if not locked already. Note that in this
+			 * case we know for sure it's a regular page: it's later than the
+			 * new page we just got, which is not a revmap page, and revmap
+			 * pages are always consecutive.
+			 */
+			if (BufferIsValid(oldbuf) && oldblk > newblk)
+			{
+				LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+				Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
+			}
+
+			return buf;
+		}
+
+		/* This page is no good. */
+
+		/*
+		 * If an entirely new page does not contain enough free space for
+		 * the new item, then surely that item is oversized. Complain
+		 * loudly; but first make sure we initialize the page and record it
+		 * as free, for next time.
+		 */
+		if (*extended)
+		{
+			brin_initialize_empty_new_buffer(irel, buf);
+			/* since this should not happen, skip FreeSpaceMapVacuum */
+
+			ereport(ERROR,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+							itemsz, freespace, RelationGetRelationName(irel))));
+			return InvalidBuffer;	/* keep compiler quiet */
+		}
+
+		if (newblk != oldblk)
+			UnlockReleaseBuffer(buf);
+		if (BufferIsValid(oldbuf) && oldblk <= newblk)
+			LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+
+		/*
+		 * Update the FSM with the new, presumably smaller, freespace value
+		 * for this page, then search for a new target page.
+		 */
+		newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
+	}
+}
+
+/*
+ * Initialize a page as an empty regular BRIN page, WAL-log this, and record
+ * the page in FSM.
+ *
+ * There are several corner situations in which we extend the relation to
+ * obtain a new page and later find that we cannot use it immediately. When
+ * that happens, we don't want to let the page go unrecorded in FSM, because
+ * there is no mechanism to get the space back and the index would bloat.
+ * Also, because we would not WAL-log the action that would initialize the
+ * page, the page would go uninitialized in a standby (or after recovery).
+ *
+ * While we record the page in FSM here, caller is responsible for doing FSM
+ * upper-page update if that seems appropriate.
+ */
+static void
+brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
+{
+	Page		page;
+
+	BRIN_elog((DEBUG2,
+			   "brin_initialize_empty_new_buffer: initializing blank page %u",
+			   BufferGetBlockNumber(buffer)));
+
+	START_CRIT_SECTION();
+	page = BufferGetPage(buffer);
+	brin_page_init(page, BRIN_PAGETYPE_REGULAR);
+	MarkBufferDirty(buffer);
+	log_newpage_buffer(buffer, true);
+	END_CRIT_SECTION();
+
+	/*
+	 * We update the FSM for this page, but this is not WAL-logged. This is
+	 * acceptable because VACUUM will scan the index and update the FSM with
+	 * pages whose FSM records were forgotten in a crash.
+	 */
+	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer),
+							br_page_get_freespace(page));
+}
+
+
+/*
+ * Return the amount of free space on a regular BRIN index page.
+ *
+ * If the page is not a regular page, or has been marked with the
+ * BRIN_EVACUATE_PAGE flag, returns 0.
+ */
+static Size
+br_page_get_freespace(Page page)
+{
+	if (!BRIN_IS_REGULAR_PAGE(page) ||
+		(BrinPageFlags(page) & BRIN_EVACUATE_PAGE) != 0)
+		return 0;
+	else
+		return PageGetFreeSpace(page);
+}
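
Notes on the code above, with small standalone sketches; none of these are part of the upstream file.

The BrinMaxItemSize macro bounds how large a single index entry can be: a whole block, minus the aligned page header plus one line pointer, minus the aligned BRIN special space. A minimal standalone sketch of that arithmetic, assuming typical values (BLCKSZ 8192, 24-byte page header, 4-byte ItemIdData, 8-byte maxaligned special space, 8-byte alignment); the actual numbers come from the server headers and can differ:

#include <stdio.h>

#define ALIGNOF 8
#define MAXALIGN(x)      (((x) + (ALIGNOF - 1)) & ~((size_t) (ALIGNOF - 1)))
#define MAXALIGN_DOWN(x) ((x) & ~((size_t) (ALIGNOF - 1)))

int
main(void)
{
	size_t		blcksz = 8192;	/* assumed BLCKSZ */
	size_t		pagehdr = 24;	/* assumed SizeOfPageHeaderData */
	size_t		itemid = 4;		/* assumed sizeof(ItemIdData) */
	size_t		special = 8;	/* assumed MAXALIGN(sizeof(BrinSpecialSpace)) */
	size_t		max_item;

	/* same shape as the BrinMaxItemSize macro */
	max_item = MAXALIGN_DOWN(blcksz - (MAXALIGN(pagehdr + itemid) + special));
	printf("max BRIN item size: %zu bytes\n", max_item);	/* 8152 here */
	return 0;
}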
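brin_can_do_samepage_update reduces to a one-line size test: the replacement fits in place if it is no larger than the old tuple, or if the page's exact free space covers the growth. A standalone model of that test with a few worked values (the function name and numbers here are illustrative, not the server API):

#include <stdbool.h>
#include <stdio.h>

/*
 * Mirrors the test in brin_can_do_samepage_update: an in-place replacement
 * works when the new tuple is no larger than the old one, or when the
 * page's exact free space covers the growth.
 */
static bool
can_update_samepage(size_t origsz, size_t newsz, size_t page_free)
{
	return newsz <= origsz || page_free >= newsz - origsz;
}

int
main(void)
{
	printf("%d\n", can_update_samepage(64, 48, 0));		/* 1: tuple shrank */
	printf("%d\n", can_update_samepage(64, 96, 40));	/* 1: 32-byte growth fits */
	printf("%d\n", can_update_samepage(64, 96, 16));	/* 0: must move the tuple */
	return 0;
}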
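The pd_lower adjustment in brin_metapage_init matters because full-page WAL images may omit the "hole" between pd_lower and pd_upper; anything left inside the hole is not restored. A toy model of that hole compression, under assumed sizes (PAGESZ, HDR, and the compress_hole helper are invented for illustration):

#include <stdio.h>
#include <string.h>

#define PAGESZ 128
#define HDR    24

/*
 * Toy hole compression: bytes in [pd_lower, pd_upper) are dropped from the
 * image, loosely as xlog.c does for full-page writes.
 */
static void
compress_hole(const char *page, char *image, size_t pd_lower, size_t pd_upper)
{
	memcpy(image, page, pd_lower);					/* keep up to pd_lower */
	memset(image + pd_lower, 0, pd_upper - pd_lower);	/* the hole is lost */
	memcpy(image + pd_upper, page + pd_upper, PAGESZ - pd_upper);
}

int
main(void)
{
	char		page[PAGESZ] = {0};
	char		image[PAGESZ];
	const char *meta = "BRIN-META";
	size_t		pd_upper = PAGESZ - 8;	/* pretend special space starts here */

	strcpy(page + HDR, meta);	/* metadata sits just past the page header */

	/* pd_lower left at the bare header: the metadata lands in the hole */
	compress_hole(page, image, HDR, pd_upper);
	printf("pd_lower=%d: metadata \"%s\"\n", HDR, image + HDR);

	/* pd_lower pushed past the metadata, as brin_metapage_init does */
	compress_hole(page, image, HDR + strlen(meta) + 1, pd_upper);
	printf("pd_lower=%zu: metadata \"%s\"\n",
		   HDR + strlen(meta) + 1, image + HDR);
	return 0;
}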
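The evacuation protocol hinges on br_page_get_freespace treating BRIN_EVACUATE_PAGE as "page full": once brin_start_evacuating_page sets the flag, concurrent inserters see zero free space and go elsewhere, so the evacuator can drain the page. A toy model of that gate (the struct and names are stand-ins, not the real page layout):

#include <stdint.h>
#include <stdio.h>

#define EVACUATE_PAGE 0x0001	/* stand-in for BRIN_EVACUATE_PAGE */

struct toy_page
{
	uint16_t	flags;
	size_t		free_bytes;
};

/*
 * Mirrors br_page_get_freespace: a page marked for evacuation reports zero
 * free space, so no new tuples land on it while it is being drained.
 */
static size_t
toy_page_get_freespace(const struct toy_page *p)
{
	return (p->flags & EVACUATE_PAGE) ? 0 : p->free_bytes;
}

int
main(void)
{
	struct toy_page p = {0, 4096};

	printf("%zu\n", toy_page_get_freespace(&p));	/* 4096: open for inserts */
	p.flags |= EVACUATE_PAGE;	/* what brin_start_evacuating_page does */
	printf("%zu\n", toy_page_get_freespace(&p));	/* 0: page is closed */
	return 0;
}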
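Finally, brin_getinsertbuffer avoids deadlocks by always locking the lower-numbered block first whenever it must hold both the old and the new buffer. A standalone pthread sketch of that ordering rule (the helpers are hypothetical, and the server uses buffer content locks rather than mutexes):

#include <pthread.h>
#include <stdio.h>

#define NPAGES 4

static pthread_mutex_t page_lock[NPAGES];

/*
 * Acquire two page locks in ascending block-number order, the same rule
 * brin_getinsertbuffer applies to oldbuf/newbuf: if every backend locks the
 * lower block first, two backends sharing a pair of pages cannot deadlock.
 */
static void
lock_page_pair(int a, int b)
{
	int			lo = a < b ? a : b;
	int			hi = a < b ? b : a;

	pthread_mutex_lock(&page_lock[lo]);
	if (hi != lo)
		pthread_mutex_lock(&page_lock[hi]);
}

static void
unlock_page_pair(int a, int b)
{
	pthread_mutex_unlock(&page_lock[a]);
	if (a != b)
		pthread_mutex_unlock(&page_lock[b]);
}

int
main(void)
{
	for (int i = 0; i < NPAGES; i++)
		pthread_mutex_init(&page_lock[i], NULL);

	lock_page_pair(2, 1);		/* acquires 1 then 2, whatever the call order */
	unlock_page_pair(2, 1);
	printf("ok\n");
	return 0;
}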