diff options
Diffstat (limited to 'src/backend/storage/page')
-rw-r--r-- | src/backend/storage/page/Makefile | 23 | ||||
-rw-r--r-- | src/backend/storage/page/README | 64 | ||||
-rw-r--r-- | src/backend/storage/page/bufpage.c | 1221 | ||||
-rw-r--r-- | src/backend/storage/page/checksum.c | 22 | ||||
-rw-r--r-- | src/backend/storage/page/itemptr.c | 73 |
5 files changed, 1403 insertions, 0 deletions
diff --git a/src/backend/storage/page/Makefile b/src/backend/storage/page/Makefile new file mode 100644 index 0000000..10021e2 --- /dev/null +++ b/src/backend/storage/page/Makefile @@ -0,0 +1,23 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for storage/page +# +# IDENTIFICATION +# src/backend/storage/page/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/storage/page +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + bufpage.o \ + checksum.o \ + itemptr.o + +include $(top_srcdir)/src/backend/common.mk + +# important optimizations flags for checksum.c +checksum.o: CFLAGS += ${CFLAGS_VECTOR} diff --git a/src/backend/storage/page/README b/src/backend/storage/page/README new file mode 100644 index 0000000..4e45bd9 --- /dev/null +++ b/src/backend/storage/page/README @@ -0,0 +1,64 @@ +src/backend/storage/page/README + +Checksums +--------- + +Checksums on data pages are designed to detect corruption by the I/O system. +We do not protect buffers against uncorrectable memory errors, since these +have a very low measured incidence according to research on large server farms, +http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed +2010/12/22 on -hackers list. + +Current implementation requires this be enabled system-wide at initdb time, or +by using the pg_checksums tool on an offline cluster. + +The checksum is not valid at all times on a data page!! +The checksum is valid when the page leaves the shared pool and is checked +when it later re-enters the shared pool as a result of I/O. +We set the checksum on a buffer in the shared pool immediately before we +flush the buffer. As a result we implicitly invalidate the page's checksum +when we modify the page for a data change or even a hint. This means that +many or even most pages in shared buffers have invalid page checksums, +so be careful how you interpret the pd_checksum field. + +That means that WAL-logged changes to a page do NOT update the page checksum, +so full page images may not have a valid checksum. But those page images have +the WAL CRC covering them and so are verified separately from this +mechanism. WAL replay should not test the checksum of a full-page image. + +The best way to understand this is that WAL CRCs protect records entering the +WAL stream, and data page verification protects blocks entering the shared +buffer pool. They are similar in purpose, yet completely separate. Together +they ensure we are able to detect errors in data re-entering +PostgreSQL-controlled memory. Note also that the WAL checksum is a 32-bit CRC, +whereas the page checksum is only 16-bits. + +Any write of a data block can cause a torn page if the write is unsuccessful. +Full page writes protect us from that, which are stored in WAL. Setting hint +bits when a page is already dirty is OK because a full page write must already +have been written for it since the last checkpoint. Setting hint bits on an +otherwise clean page can allow torn pages; this doesn't normally matter since +they are just hints, but when the page has checksums, then losing a few bits +would cause the checksum to be invalid. So if we have full_page_writes = on +and checksums enabled then we must write a WAL record specifically so that we +record a full page image in WAL. Hint bits updates should be protected using +MarkBufferDirtyHint(), which is responsible for writing the full-page image +when necessary. + +Note that when we write a page checksum we include the hopefully zeroed bytes +that form the hole in the centre of a standard page. Thus, when we read the +block back from storage we implicitly check that the hole is still all zeroes. +We do this to ensure that we spot errors that could have destroyed data even +if they haven't actually done so. Full page images stored in WAL do *not* +check that the hole is all zero; the data in the hole is simply skipped and +re-zeroed if the backup block is reapplied. We do this because a failure in +WAL is a fatal error and prevents further recovery, whereas a checksum failure +on a normal data block is a hard error but not a critical one for the server, +even if it is a very bad thing for the user. + +New WAL records cannot be written during recovery, so hint bits set during +recovery must not dirty the page if the buffer is not already dirty, when +checksums are enabled. Systems in Hot-Standby mode may benefit from hint bits +being set, but with checksums enabled, a page cannot be dirtied after setting a +hint bit (due to the torn page risk). So, it must wait for full-page images +containing the hint bit updates to arrive from the master. diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c new file mode 100644 index 0000000..8dc67c9 --- /dev/null +++ b/src/backend/storage/page/bufpage.c @@ -0,0 +1,1221 @@ +/*------------------------------------------------------------------------- + * + * bufpage.c + * POSTGRES standard buffer page code. + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/page/bufpage.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/itup.h" +#include "access/xlog.h" +#include "pgstat.h" +#include "storage/checksum.h" +#include "utils/memdebug.h" +#include "utils/memutils.h" + + +/* GUC variable */ +bool ignore_checksum_failure = false; + + +/* ---------------------------------------------------------------- + * Page support functions + * ---------------------------------------------------------------- + */ + +/* + * PageInit + * Initializes the contents of a page. + * Note that we don't calculate an initial checksum here; that's not done + * until it's time to write. + */ +void +PageInit(Page page, Size pageSize, Size specialSize) +{ + PageHeader p = (PageHeader) page; + + specialSize = MAXALIGN(specialSize); + + Assert(pageSize == BLCKSZ); + Assert(pageSize > specialSize + SizeOfPageHeaderData); + + /* Make sure all fields of page are zero, as well as unused space */ + MemSet(p, 0, pageSize); + + p->pd_flags = 0; + p->pd_lower = SizeOfPageHeaderData; + p->pd_upper = pageSize - specialSize; + p->pd_special = pageSize - specialSize; + PageSetPageSizeAndVersion(page, pageSize, PG_PAGE_LAYOUT_VERSION); + /* p->pd_prune_xid = InvalidTransactionId; done by above MemSet */ +} + + +/* + * PageIsVerified + * Utility wrapper for PageIsVerifiedExtended(). + */ +bool +PageIsVerified(Page page, BlockNumber blkno) +{ + return PageIsVerifiedExtended(page, blkno, + PIV_LOG_WARNING | PIV_REPORT_STAT); +} + + +/* + * PageIsVerifiedExtended + * Check that the page header and checksum (if any) appear valid. + * + * This is called when a page has just been read in from disk. The idea is + * to cheaply detect trashed pages before we go nuts following bogus line + * pointers, testing invalid transaction identifiers, etc. + * + * It turns out to be necessary to allow zeroed pages here too. Even though + * this routine is *not* called when deliberately adding a page to a relation, + * there are scenarios in which a zeroed page might be found in a table. + * (Example: a backend extends a relation, then crashes before it can write + * any WAL entry about the new page. The kernel will already have the + * zeroed page in the file, and it will stay that way after restart.) So we + * allow zeroed pages here, and are careful that the page access macros + * treat such a page as empty and without free space. Eventually, VACUUM + * will clean up such a page and make it usable. + * + * If flag PIV_LOG_WARNING is set, a WARNING is logged in the event of + * a checksum failure. + * + * If flag PIV_REPORT_STAT is set, a checksum failure is reported directly + * to pgstat. + */ +bool +PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags) +{ + PageHeader p = (PageHeader) page; + size_t *pagebytes; + int i; + bool checksum_failure = false; + bool header_sane = false; + bool all_zeroes = false; + uint16 checksum = 0; + + /* + * Don't verify page data unless the page passes basic non-zero test + */ + if (!PageIsNew(page)) + { + if (DataChecksumsEnabled()) + { + checksum = pg_checksum_page((char *) page, blkno); + + if (checksum != p->pd_checksum) + checksum_failure = true; + } + + /* + * The following checks don't prove the header is correct, only that + * it looks sane enough to allow into the buffer pool. Later usage of + * the block can still reveal problems, which is why we offer the + * checksum option. + */ + if ((p->pd_flags & ~PD_VALID_FLAG_BITS) == 0 && + p->pd_lower <= p->pd_upper && + p->pd_upper <= p->pd_special && + p->pd_special <= BLCKSZ && + p->pd_special == MAXALIGN(p->pd_special)) + header_sane = true; + + if (header_sane && !checksum_failure) + return true; + } + + /* Check all-zeroes case */ + all_zeroes = true; + pagebytes = (size_t *) page; + for (i = 0; i < (BLCKSZ / sizeof(size_t)); i++) + { + if (pagebytes[i] != 0) + { + all_zeroes = false; + break; + } + } + + if (all_zeroes) + return true; + + /* + * Throw a WARNING if the checksum fails, but only after we've checked for + * the all-zeroes case. + */ + if (checksum_failure) + { + if ((flags & PIV_LOG_WARNING) != 0) + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("page verification failed, calculated checksum %u but expected %u", + checksum, p->pd_checksum))); + + if ((flags & PIV_REPORT_STAT) != 0) + pgstat_report_checksum_failure(); + + if (header_sane && ignore_checksum_failure) + return true; + } + + return false; +} + + +/* + * PageAddItemExtended + * + * Add an item to a page. Return value is the offset at which it was + * inserted, or InvalidOffsetNumber if the item is not inserted for any + * reason. A WARNING is issued indicating the reason for the refusal. + * + * offsetNumber must be either InvalidOffsetNumber to specify finding a + * free line pointer, or a value between FirstOffsetNumber and one past + * the last existing item, to specify using that particular line pointer. + * + * If offsetNumber is valid and flag PAI_OVERWRITE is set, we just store + * the item at the specified offsetNumber, which must be either a + * currently-unused line pointer, or one past the last existing item. + * + * If offsetNumber is valid and flag PAI_OVERWRITE is not set, insert + * the item at the specified offsetNumber, moving existing items later + * in the array to make room. + * + * If offsetNumber is not valid, then assign a slot by finding the first + * one that is both unused and deallocated. + * + * If flag PAI_IS_HEAP is set, we enforce that there can't be more than + * MaxHeapTuplesPerPage line pointers on the page. + * + * !!! EREPORT(ERROR) IS DISALLOWED HERE !!! + */ +OffsetNumber +PageAddItemExtended(Page page, + Item item, + Size size, + OffsetNumber offsetNumber, + int flags) +{ + PageHeader phdr = (PageHeader) page; + Size alignedSize; + int lower; + int upper; + ItemId itemId; + OffsetNumber limit; + bool needshuffle = false; + + /* + * Be wary about corrupted page pointers + */ + if (phdr->pd_lower < SizeOfPageHeaderData || + phdr->pd_lower > phdr->pd_upper || + phdr->pd_upper > phdr->pd_special || + phdr->pd_special > BLCKSZ) + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", + phdr->pd_lower, phdr->pd_upper, phdr->pd_special))); + + /* + * Select offsetNumber to place the new item at + */ + limit = OffsetNumberNext(PageGetMaxOffsetNumber(page)); + + /* was offsetNumber passed in? */ + if (OffsetNumberIsValid(offsetNumber)) + { + /* yes, check it */ + if ((flags & PAI_OVERWRITE) != 0) + { + if (offsetNumber < limit) + { + itemId = PageGetItemId(phdr, offsetNumber); + if (ItemIdIsUsed(itemId) || ItemIdHasStorage(itemId)) + { + elog(WARNING, "will not overwrite a used ItemId"); + return InvalidOffsetNumber; + } + } + } + else + { + if (offsetNumber < limit) + needshuffle = true; /* need to move existing linp's */ + } + } + else + { + /* offsetNumber was not passed in, so find a free slot */ + /* if no free slot, we'll put it at limit (1st open slot) */ + if (PageHasFreeLinePointers(phdr)) + { + /* + * Look for "recyclable" (unused) ItemId. We check for no storage + * as well, just to be paranoid --- unused items should never have + * storage. + */ + for (offsetNumber = 1; offsetNumber < limit; offsetNumber++) + { + itemId = PageGetItemId(phdr, offsetNumber); + if (!ItemIdIsUsed(itemId) && !ItemIdHasStorage(itemId)) + break; + } + if (offsetNumber >= limit) + { + /* the hint is wrong, so reset it */ + PageClearHasFreeLinePointers(phdr); + } + } + else + { + /* don't bother searching if hint says there's no free slot */ + offsetNumber = limit; + } + } + + /* Reject placing items beyond the first unused line pointer */ + if (offsetNumber > limit) + { + elog(WARNING, "specified item offset is too large"); + return InvalidOffsetNumber; + } + + /* Reject placing items beyond heap boundary, if heap */ + if ((flags & PAI_IS_HEAP) != 0 && offsetNumber > MaxHeapTuplesPerPage) + { + elog(WARNING, "can't put more than MaxHeapTuplesPerPage items in a heap page"); + return InvalidOffsetNumber; + } + + /* + * Compute new lower and upper pointers for page, see if it'll fit. + * + * Note: do arithmetic as signed ints, to avoid mistakes if, say, + * alignedSize > pd_upper. + */ + if (offsetNumber == limit || needshuffle) + lower = phdr->pd_lower + sizeof(ItemIdData); + else + lower = phdr->pd_lower; + + alignedSize = MAXALIGN(size); + + upper = (int) phdr->pd_upper - (int) alignedSize; + + if (lower > upper) + return InvalidOffsetNumber; + + /* + * OK to insert the item. First, shuffle the existing pointers if needed. + */ + itemId = PageGetItemId(phdr, offsetNumber); + + if (needshuffle) + memmove(itemId + 1, itemId, + (limit - offsetNumber) * sizeof(ItemIdData)); + + /* set the line pointer */ + ItemIdSetNormal(itemId, upper, size); + + /* + * Items normally contain no uninitialized bytes. Core bufpage consumers + * conform, but this is not a necessary coding rule; a new index AM could + * opt to depart from it. However, data type input functions and other + * C-language functions that synthesize datums should initialize all + * bytes; datumIsEqual() relies on this. Testing here, along with the + * similar check in printtup(), helps to catch such mistakes. + * + * Values of the "name" type retrieved via index-only scans may contain + * uninitialized bytes; see comment in btrescan(). Valgrind will report + * this as an error, but it is safe to ignore. + */ + VALGRIND_CHECK_MEM_IS_DEFINED(item, size); + + /* copy the item's data onto the page */ + memcpy((char *) page + upper, item, size); + + /* adjust page header */ + phdr->pd_lower = (LocationIndex) lower; + phdr->pd_upper = (LocationIndex) upper; + + return offsetNumber; +} + + +/* + * PageGetTempPage + * Get a temporary page in local memory for special processing. + * The returned page is not initialized at all; caller must do that. + */ +Page +PageGetTempPage(Page page) +{ + Size pageSize; + Page temp; + + pageSize = PageGetPageSize(page); + temp = (Page) palloc(pageSize); + + return temp; +} + +/* + * PageGetTempPageCopy + * Get a temporary page in local memory for special processing. + * The page is initialized by copying the contents of the given page. + */ +Page +PageGetTempPageCopy(Page page) +{ + Size pageSize; + Page temp; + + pageSize = PageGetPageSize(page); + temp = (Page) palloc(pageSize); + + memcpy(temp, page, pageSize); + + return temp; +} + +/* + * PageGetTempPageCopySpecial + * Get a temporary page in local memory for special processing. + * The page is PageInit'd with the same special-space size as the + * given page, and the special space is copied from the given page. + */ +Page +PageGetTempPageCopySpecial(Page page) +{ + Size pageSize; + Page temp; + + pageSize = PageGetPageSize(page); + temp = (Page) palloc(pageSize); + + PageInit(temp, pageSize, PageGetSpecialSize(page)); + memcpy(PageGetSpecialPointer(temp), + PageGetSpecialPointer(page), + PageGetSpecialSize(page)); + + return temp; +} + +/* + * PageRestoreTempPage + * Copy temporary page back to permanent page after special processing + * and release the temporary page. + */ +void +PageRestoreTempPage(Page tempPage, Page oldPage) +{ + Size pageSize; + + pageSize = PageGetPageSize(tempPage); + memcpy((char *) oldPage, (char *) tempPage, pageSize); + + pfree(tempPage); +} + +/* + * sorting support for PageRepairFragmentation and PageIndexMultiDelete + */ +typedef struct itemIdSortData +{ + uint16 offsetindex; /* linp array index */ + int16 itemoff; /* page offset of item data */ + uint16 alignedlen; /* MAXALIGN(item data len) */ +} itemIdSortData; +typedef itemIdSortData *itemIdSort; + +static int +itemoffcompare(const void *itemidp1, const void *itemidp2) +{ + /* Sort in decreasing itemoff order */ + return ((itemIdSort) itemidp2)->itemoff - + ((itemIdSort) itemidp1)->itemoff; +} + +/* + * After removing or marking some line pointers unused, move the tuples to + * remove the gaps caused by the removed items. + */ +static void +compactify_tuples(itemIdSort itemidbase, int nitems, Page page) +{ + PageHeader phdr = (PageHeader) page; + Offset upper; + int i; + + /* sort itemIdSortData array into decreasing itemoff order */ + qsort((char *) itemidbase, nitems, sizeof(itemIdSortData), + itemoffcompare); + + upper = phdr->pd_special; + for (i = 0; i < nitems; i++) + { + itemIdSort itemidptr = &itemidbase[i]; + ItemId lp; + + lp = PageGetItemId(page, itemidptr->offsetindex + 1); + upper -= itemidptr->alignedlen; + memmove((char *) page + upper, + (char *) page + itemidptr->itemoff, + itemidptr->alignedlen); + lp->lp_off = upper; + } + + phdr->pd_upper = upper; +} + +/* + * PageRepairFragmentation + * + * Frees fragmented space on a page. + * It doesn't remove unused line pointers! Please don't change this. + * + * This routine is usable for heap pages only, but see PageIndexMultiDelete. + * + * As a side effect, the page's PD_HAS_FREE_LINES hint bit is updated. + */ +void +PageRepairFragmentation(Page page) +{ + Offset pd_lower = ((PageHeader) page)->pd_lower; + Offset pd_upper = ((PageHeader) page)->pd_upper; + Offset pd_special = ((PageHeader) page)->pd_special; + itemIdSortData itemidbase[MaxHeapTuplesPerPage]; + itemIdSort itemidptr; + ItemId lp; + int nline, + nstorage, + nunused; + int i; + Size totallen; + + /* + * It's worth the trouble to be more paranoid here than in most places, + * because we are about to reshuffle data in (what is usually) a shared + * disk buffer. If we aren't careful then corrupted pointers, lengths, + * etc could cause us to clobber adjacent disk buffers, spreading the data + * loss further. So, check everything. + */ + if (pd_lower < SizeOfPageHeaderData || + pd_lower > pd_upper || + pd_upper > pd_special || + pd_special > BLCKSZ || + pd_special != MAXALIGN(pd_special)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", + pd_lower, pd_upper, pd_special))); + + /* + * Run through the line pointer array and collect data about live items. + */ + nline = PageGetMaxOffsetNumber(page); + itemidptr = itemidbase; + nunused = totallen = 0; + for (i = FirstOffsetNumber; i <= nline; i++) + { + lp = PageGetItemId(page, i); + if (ItemIdIsUsed(lp)) + { + if (ItemIdHasStorage(lp)) + { + itemidptr->offsetindex = i - 1; + itemidptr->itemoff = ItemIdGetOffset(lp); + if (unlikely(itemidptr->itemoff < (int) pd_upper || + itemidptr->itemoff >= (int) pd_special)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted line pointer: %u", + itemidptr->itemoff))); + itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp)); + totallen += itemidptr->alignedlen; + itemidptr++; + } + } + else + { + /* Unused entries should have lp_len = 0, but make sure */ + ItemIdSetUnused(lp); + nunused++; + } + } + + nstorage = itemidptr - itemidbase; + if (nstorage == 0) + { + /* Page is completely empty, so just reset it quickly */ + ((PageHeader) page)->pd_upper = pd_special; + } + else + { + /* Need to compact the page the hard way */ + if (totallen > (Size) (pd_special - pd_lower)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted item lengths: total %u, available space %u", + (unsigned int) totallen, pd_special - pd_lower))); + + compactify_tuples(itemidbase, nstorage, page); + } + + /* Set hint bit for PageAddItem */ + if (nunused > 0) + PageSetHasFreeLinePointers(page); + else + PageClearHasFreeLinePointers(page); +} + +/* + * PageGetFreeSpace + * Returns the size of the free (allocatable) space on a page, + * reduced by the space needed for a new line pointer. + * + * Note: this should usually only be used on index pages. Use + * PageGetHeapFreeSpace on heap pages. + */ +Size +PageGetFreeSpace(Page page) +{ + int space; + + /* + * Use signed arithmetic here so that we behave sensibly if pd_lower > + * pd_upper. + */ + space = (int) ((PageHeader) page)->pd_upper - + (int) ((PageHeader) page)->pd_lower; + + if (space < (int) sizeof(ItemIdData)) + return 0; + space -= sizeof(ItemIdData); + + return (Size) space; +} + +/* + * PageGetFreeSpaceForMultipleTuples + * Returns the size of the free (allocatable) space on a page, + * reduced by the space needed for multiple new line pointers. + * + * Note: this should usually only be used on index pages. Use + * PageGetHeapFreeSpace on heap pages. + */ +Size +PageGetFreeSpaceForMultipleTuples(Page page, int ntups) +{ + int space; + + /* + * Use signed arithmetic here so that we behave sensibly if pd_lower > + * pd_upper. + */ + space = (int) ((PageHeader) page)->pd_upper - + (int) ((PageHeader) page)->pd_lower; + + if (space < (int) (ntups * sizeof(ItemIdData))) + return 0; + space -= ntups * sizeof(ItemIdData); + + return (Size) space; +} + +/* + * PageGetExactFreeSpace + * Returns the size of the free (allocatable) space on a page, + * without any consideration for adding/removing line pointers. + */ +Size +PageGetExactFreeSpace(Page page) +{ + int space; + + /* + * Use signed arithmetic here so that we behave sensibly if pd_lower > + * pd_upper. + */ + space = (int) ((PageHeader) page)->pd_upper - + (int) ((PageHeader) page)->pd_lower; + + if (space < 0) + return 0; + + return (Size) space; +} + + +/* + * PageGetHeapFreeSpace + * Returns the size of the free (allocatable) space on a page, + * reduced by the space needed for a new line pointer. + * + * The difference between this and PageGetFreeSpace is that this will return + * zero if there are already MaxHeapTuplesPerPage line pointers in the page + * and none are free. We use this to enforce that no more than + * MaxHeapTuplesPerPage line pointers are created on a heap page. (Although + * no more tuples than that could fit anyway, in the presence of redirected + * or dead line pointers it'd be possible to have too many line pointers. + * To avoid breaking code that assumes MaxHeapTuplesPerPage is a hard limit + * on the number of line pointers, we make this extra check.) + */ +Size +PageGetHeapFreeSpace(Page page) +{ + Size space; + + space = PageGetFreeSpace(page); + if (space > 0) + { + OffsetNumber offnum, + nline; + + /* + * Are there already MaxHeapTuplesPerPage line pointers in the page? + */ + nline = PageGetMaxOffsetNumber(page); + if (nline >= MaxHeapTuplesPerPage) + { + if (PageHasFreeLinePointers((PageHeader) page)) + { + /* + * Since this is just a hint, we must confirm that there is + * indeed a free line pointer + */ + for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum)) + { + ItemId lp = PageGetItemId(page, offnum); + + if (!ItemIdIsUsed(lp)) + break; + } + + if (offnum > nline) + { + /* + * The hint is wrong, but we can't clear it here since we + * don't have the ability to mark the page dirty. + */ + space = 0; + } + } + else + { + /* + * Although the hint might be wrong, PageAddItem will believe + * it anyway, so we must believe it too. + */ + space = 0; + } + } + } + return space; +} + + +/* + * PageIndexTupleDelete + * + * This routine does the work of removing a tuple from an index page. + * + * Unlike heap pages, we compact out the line pointer for the removed tuple. + */ +void +PageIndexTupleDelete(Page page, OffsetNumber offnum) +{ + PageHeader phdr = (PageHeader) page; + char *addr; + ItemId tup; + Size size; + unsigned offset; + int nbytes; + int offidx; + int nline; + + /* + * As with PageRepairFragmentation, paranoia seems justified. + */ + if (phdr->pd_lower < SizeOfPageHeaderData || + phdr->pd_lower > phdr->pd_upper || + phdr->pd_upper > phdr->pd_special || + phdr->pd_special > BLCKSZ || + phdr->pd_special != MAXALIGN(phdr->pd_special)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", + phdr->pd_lower, phdr->pd_upper, phdr->pd_special))); + + nline = PageGetMaxOffsetNumber(page); + if ((int) offnum <= 0 || (int) offnum > nline) + elog(ERROR, "invalid index offnum: %u", offnum); + + /* change offset number to offset index */ + offidx = offnum - 1; + + tup = PageGetItemId(page, offnum); + Assert(ItemIdHasStorage(tup)); + size = ItemIdGetLength(tup); + offset = ItemIdGetOffset(tup); + + if (offset < phdr->pd_upper || (offset + size) > phdr->pd_special || + offset != MAXALIGN(offset)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted line pointer: offset = %u, size = %u", + offset, (unsigned int) size))); + + /* Amount of space to actually be deleted */ + size = MAXALIGN(size); + + /* + * First, we want to get rid of the pd_linp entry for the index tuple. We + * copy all subsequent linp's back one slot in the array. We don't use + * PageGetItemId, because we are manipulating the _array_, not individual + * linp's. + */ + nbytes = phdr->pd_lower - + ((char *) &phdr->pd_linp[offidx + 1] - (char *) phdr); + + if (nbytes > 0) + memmove((char *) &(phdr->pd_linp[offidx]), + (char *) &(phdr->pd_linp[offidx + 1]), + nbytes); + + /* + * Now move everything between the old upper bound (beginning of tuple + * space) and the beginning of the deleted tuple forward, so that space in + * the middle of the page is left free. If we've just deleted the tuple + * at the beginning of tuple space, then there's no need to do the copy. + */ + + /* beginning of tuple space */ + addr = (char *) page + phdr->pd_upper; + + if (offset > phdr->pd_upper) + memmove(addr + size, addr, offset - phdr->pd_upper); + + /* adjust free space boundary pointers */ + phdr->pd_upper += size; + phdr->pd_lower -= sizeof(ItemIdData); + + /* + * Finally, we need to adjust the linp entries that remain. + * + * Anything that used to be before the deleted tuple's data was moved + * forward by the size of the deleted tuple. + */ + if (!PageIsEmpty(page)) + { + int i; + + nline--; /* there's one less than when we started */ + for (i = 1; i <= nline; i++) + { + ItemId ii = PageGetItemId(phdr, i); + + Assert(ItemIdHasStorage(ii)); + if (ItemIdGetOffset(ii) <= offset) + ii->lp_off += size; + } + } +} + + +/* + * PageIndexMultiDelete + * + * This routine handles the case of deleting multiple tuples from an + * index page at once. It is considerably faster than a loop around + * PageIndexTupleDelete ... however, the caller *must* supply the array + * of item numbers to be deleted in item number order! + */ +void +PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) +{ + PageHeader phdr = (PageHeader) page; + Offset pd_lower = phdr->pd_lower; + Offset pd_upper = phdr->pd_upper; + Offset pd_special = phdr->pd_special; + itemIdSortData itemidbase[MaxIndexTuplesPerPage]; + ItemIdData newitemids[MaxIndexTuplesPerPage]; + itemIdSort itemidptr; + ItemId lp; + int nline, + nused; + Size totallen; + Size size; + unsigned offset; + int nextitm; + OffsetNumber offnum; + + Assert(nitems <= MaxIndexTuplesPerPage); + + /* + * If there aren't very many items to delete, then retail + * PageIndexTupleDelete is the best way. Delete the items in reverse + * order so we don't have to think about adjusting item numbers for + * previous deletions. + * + * TODO: tune the magic number here + */ + if (nitems <= 2) + { + while (--nitems >= 0) + PageIndexTupleDelete(page, itemnos[nitems]); + return; + } + + /* + * As with PageRepairFragmentation, paranoia seems justified. + */ + if (pd_lower < SizeOfPageHeaderData || + pd_lower > pd_upper || + pd_upper > pd_special || + pd_special > BLCKSZ || + pd_special != MAXALIGN(pd_special)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", + pd_lower, pd_upper, pd_special))); + + /* + * Scan the line pointer array and build a list of just the ones we are + * going to keep. Notice we do not modify the page yet, since we are + * still validity-checking. + */ + nline = PageGetMaxOffsetNumber(page); + itemidptr = itemidbase; + totallen = 0; + nused = 0; + nextitm = 0; + for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum)) + { + lp = PageGetItemId(page, offnum); + Assert(ItemIdHasStorage(lp)); + size = ItemIdGetLength(lp); + offset = ItemIdGetOffset(lp); + if (offset < pd_upper || + (offset + size) > pd_special || + offset != MAXALIGN(offset)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted line pointer: offset = %u, size = %u", + offset, (unsigned int) size))); + + if (nextitm < nitems && offnum == itemnos[nextitm]) + { + /* skip item to be deleted */ + nextitm++; + } + else + { + itemidptr->offsetindex = nused; /* where it will go */ + itemidptr->itemoff = offset; + itemidptr->alignedlen = MAXALIGN(size); + totallen += itemidptr->alignedlen; + newitemids[nused] = *lp; + itemidptr++; + nused++; + } + } + + /* this will catch invalid or out-of-order itemnos[] */ + if (nextitm != nitems) + elog(ERROR, "incorrect index offsets supplied"); + + if (totallen > (Size) (pd_special - pd_lower)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted item lengths: total %u, available space %u", + (unsigned int) totallen, pd_special - pd_lower))); + + /* + * Looks good. Overwrite the line pointers with the copy, from which we've + * removed all the unused items. + */ + memcpy(phdr->pd_linp, newitemids, nused * sizeof(ItemIdData)); + phdr->pd_lower = SizeOfPageHeaderData + nused * sizeof(ItemIdData); + + /* and compactify the tuple data */ + compactify_tuples(itemidbase, nused, page); +} + + +/* + * PageIndexTupleDeleteNoCompact + * + * Remove the specified tuple from an index page, but set its line pointer + * to "unused" instead of compacting it out, except that it can be removed + * if it's the last line pointer on the page. + * + * This is used for index AMs that require that existing TIDs of live tuples + * remain unchanged, and are willing to allow unused line pointers instead. + */ +void +PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum) +{ + PageHeader phdr = (PageHeader) page; + char *addr; + ItemId tup; + Size size; + unsigned offset; + int nline; + + /* + * As with PageRepairFragmentation, paranoia seems justified. + */ + if (phdr->pd_lower < SizeOfPageHeaderData || + phdr->pd_lower > phdr->pd_upper || + phdr->pd_upper > phdr->pd_special || + phdr->pd_special > BLCKSZ || + phdr->pd_special != MAXALIGN(phdr->pd_special)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", + phdr->pd_lower, phdr->pd_upper, phdr->pd_special))); + + nline = PageGetMaxOffsetNumber(page); + if ((int) offnum <= 0 || (int) offnum > nline) + elog(ERROR, "invalid index offnum: %u", offnum); + + tup = PageGetItemId(page, offnum); + Assert(ItemIdHasStorage(tup)); + size = ItemIdGetLength(tup); + offset = ItemIdGetOffset(tup); + + if (offset < phdr->pd_upper || (offset + size) > phdr->pd_special || + offset != MAXALIGN(offset)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted line pointer: offset = %u, size = %u", + offset, (unsigned int) size))); + + /* Amount of space to actually be deleted */ + size = MAXALIGN(size); + + /* + * Either set the line pointer to "unused", or zap it if it's the last + * one. (Note: it's possible that the next-to-last one(s) are already + * unused, but we do not trouble to try to compact them out if so.) + */ + if ((int) offnum < nline) + ItemIdSetUnused(tup); + else + { + phdr->pd_lower -= sizeof(ItemIdData); + nline--; /* there's one less than when we started */ + } + + /* + * Now move everything between the old upper bound (beginning of tuple + * space) and the beginning of the deleted tuple forward, so that space in + * the middle of the page is left free. If we've just deleted the tuple + * at the beginning of tuple space, then there's no need to do the copy. + */ + + /* beginning of tuple space */ + addr = (char *) page + phdr->pd_upper; + + if (offset > phdr->pd_upper) + memmove(addr + size, addr, offset - phdr->pd_upper); + + /* adjust free space boundary pointer */ + phdr->pd_upper += size; + + /* + * Finally, we need to adjust the linp entries that remain. + * + * Anything that used to be before the deleted tuple's data was moved + * forward by the size of the deleted tuple. + */ + if (!PageIsEmpty(page)) + { + int i; + + for (i = 1; i <= nline; i++) + { + ItemId ii = PageGetItemId(phdr, i); + + if (ItemIdHasStorage(ii) && ItemIdGetOffset(ii) <= offset) + ii->lp_off += size; + } + } +} + + +/* + * PageIndexTupleOverwrite + * + * Replace a specified tuple on an index page. + * + * The new tuple is placed exactly where the old one had been, shifting + * other tuples' data up or down as needed to keep the page compacted. + * This is better than deleting and reinserting the tuple, because it + * avoids any data shifting when the tuple size doesn't change; and + * even when it does, we avoid moving the line pointers around. + * This could be used by an index AM that doesn't want to unset the + * LP_DEAD bit when it happens to be set. It could conceivably also be + * used by an index AM that cares about the physical order of tuples as + * well as their logical/ItemId order. + * + * If there's insufficient space for the new tuple, return false. Other + * errors represent data-corruption problems, so we just elog. + */ +bool +PageIndexTupleOverwrite(Page page, OffsetNumber offnum, + Item newtup, Size newsize) +{ + PageHeader phdr = (PageHeader) page; + ItemId tupid; + int oldsize; + unsigned offset; + Size alignednewsize; + int size_diff; + int itemcount; + + /* + * As with PageRepairFragmentation, paranoia seems justified. + */ + if (phdr->pd_lower < SizeOfPageHeaderData || + phdr->pd_lower > phdr->pd_upper || + phdr->pd_upper > phdr->pd_special || + phdr->pd_special > BLCKSZ || + phdr->pd_special != MAXALIGN(phdr->pd_special)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", + phdr->pd_lower, phdr->pd_upper, phdr->pd_special))); + + itemcount = PageGetMaxOffsetNumber(page); + if ((int) offnum <= 0 || (int) offnum > itemcount) + elog(ERROR, "invalid index offnum: %u", offnum); + + tupid = PageGetItemId(page, offnum); + Assert(ItemIdHasStorage(tupid)); + oldsize = ItemIdGetLength(tupid); + offset = ItemIdGetOffset(tupid); + + if (offset < phdr->pd_upper || (offset + oldsize) > phdr->pd_special || + offset != MAXALIGN(offset)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted line pointer: offset = %u, size = %u", + offset, (unsigned int) oldsize))); + + /* + * Determine actual change in space requirement, check for page overflow. + */ + oldsize = MAXALIGN(oldsize); + alignednewsize = MAXALIGN(newsize); + if (alignednewsize > oldsize + (phdr->pd_upper - phdr->pd_lower)) + return false; + + /* + * Relocate existing data and update line pointers, unless the new tuple + * is the same size as the old (after alignment), in which case there's + * nothing to do. Notice that what we have to relocate is data before the + * target tuple, not data after, so it's convenient to express size_diff + * as the amount by which the tuple's size is decreasing, making it the + * delta to add to pd_upper and affected line pointers. + */ + size_diff = oldsize - (int) alignednewsize; + if (size_diff != 0) + { + char *addr = (char *) page + phdr->pd_upper; + int i; + + /* relocate all tuple data before the target tuple */ + memmove(addr + size_diff, addr, offset - phdr->pd_upper); + + /* adjust free space boundary pointer */ + phdr->pd_upper += size_diff; + + /* adjust affected line pointers too */ + for (i = FirstOffsetNumber; i <= itemcount; i++) + { + ItemId ii = PageGetItemId(phdr, i); + + /* Allow items without storage; currently only BRIN needs that */ + if (ItemIdHasStorage(ii) && ItemIdGetOffset(ii) <= offset) + ii->lp_off += size_diff; + } + } + + /* Update the item's tuple length without changing its lp_flags field */ + tupid->lp_off = offset + size_diff; + tupid->lp_len = newsize; + + /* Copy new tuple data onto page */ + memcpy(PageGetItem(page, tupid), newtup, newsize); + + return true; +} + + +/* + * Set checksum for a page in shared buffers. + * + * If checksums are disabled, or if the page is not initialized, just return + * the input. Otherwise, we must make a copy of the page before calculating + * the checksum, to prevent concurrent modifications (e.g. setting hint bits) + * from making the final checksum invalid. It doesn't matter if we include or + * exclude hints during the copy, as long as we write a valid page and + * associated checksum. + * + * Returns a pointer to the block-sized data that needs to be written. Uses + * statically-allocated memory, so the caller must immediately write the + * returned page and not refer to it again. + */ +char * +PageSetChecksumCopy(Page page, BlockNumber blkno) +{ + static char *pageCopy = NULL; + + /* If we don't need a checksum, just return the passed-in data */ + if (PageIsNew(page) || !DataChecksumsEnabled()) + return (char *) page; + + /* + * We allocate the copy space once and use it over on each subsequent + * call. The point of palloc'ing here, rather than having a static char + * array, is first to ensure adequate alignment for the checksumming code + * and second to avoid wasting space in processes that never call this. + */ + if (pageCopy == NULL) + pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ); + + memcpy(pageCopy, (char *) page, BLCKSZ); + ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno); + return pageCopy; +} + +/* + * Set checksum for a page in private memory. + * + * This must only be used when we know that no other process can be modifying + * the page buffer. + */ +void +PageSetChecksumInplace(Page page, BlockNumber blkno) +{ + /* If we don't need a checksum, just return */ + if (PageIsNew(page) || !DataChecksumsEnabled()) + return; + + ((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, blkno); +} diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c new file mode 100644 index 0000000..e010691 --- /dev/null +++ b/src/backend/storage/page/checksum.c @@ -0,0 +1,22 @@ +/*------------------------------------------------------------------------- + * + * checksum.c + * Checksum implementation for data pages. + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/page/checksum.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/checksum.h" +/* + * The actual code is in storage/checksum_impl.h. This is done so that + * external programs can incorporate the checksum code by #include'ing + * that file from the exported Postgres headers. (Compare our CRC code.) + */ +#include "storage/checksum_impl.h" diff --git a/src/backend/storage/page/itemptr.c b/src/backend/storage/page/itemptr.c new file mode 100644 index 0000000..e7806cc --- /dev/null +++ b/src/backend/storage/page/itemptr.c @@ -0,0 +1,73 @@ +/*------------------------------------------------------------------------- + * + * itemptr.c + * POSTGRES disk item pointer code. + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/page/itemptr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/itemptr.h" + + +/* + * ItemPointerEquals + * Returns true if both item pointers point to the same item, + * otherwise returns false. + * + * Note: + * Asserts that the disk item pointers are both valid! + */ +bool +ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2) +{ + /* + * We really want ItemPointerData to be exactly 6 bytes. This is rather a + * random place to check, but there is no better place. + */ + StaticAssertStmt(sizeof(ItemPointerData) == 3 * sizeof(uint16), + "ItemPointerData struct is improperly padded"); + + if (ItemPointerGetBlockNumber(pointer1) == + ItemPointerGetBlockNumber(pointer2) && + ItemPointerGetOffsetNumber(pointer1) == + ItemPointerGetOffsetNumber(pointer2)) + return true; + else + return false; +} + +/* + * ItemPointerCompare + * Generic btree-style comparison for item pointers. + */ +int32 +ItemPointerCompare(ItemPointer arg1, ItemPointer arg2) +{ + /* + * Use ItemPointerGet{Offset,Block}NumberNoCheck to avoid asserting + * ip_posid != 0, which may not be true for a user-supplied TID. + */ + BlockNumber b1 = ItemPointerGetBlockNumberNoCheck(arg1); + BlockNumber b2 = ItemPointerGetBlockNumberNoCheck(arg2); + + if (b1 < b2) + return -1; + else if (b1 > b2) + return 1; + else if (ItemPointerGetOffsetNumberNoCheck(arg1) < + ItemPointerGetOffsetNumberNoCheck(arg2)) + return -1; + else if (ItemPointerGetOffsetNumberNoCheck(arg1) > + ItemPointerGetOffsetNumberNoCheck(arg2)) + return 1; + else + return 0; +} |