author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-13 13:44:03 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-13 13:44:03 +0000
commit     293913568e6a7a86fd1479e1cff8e2ecb58d6568 (patch)
tree       fc3b469a3ec5ab71b36ea97cc7aaddb838423a0c /src/backend/access/brin/brin.c
parent     Initial commit. (diff)
Adding upstream version 16.2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/access/brin/brin.c')

-rw-r--r--  src/backend/access/brin/brin.c  1933
 1 file changed, 1933 insertions(+), 0 deletions(-)
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
new file mode 100644
index 0000000..a257903
--- /dev/null
+++ b/src/backend/access/brin/brin.c
@@ -0,0 +1,1933 @@
+/*
+ * brin.c
+ *      Implementation of BRIN indexes for Postgres
+ *
+ * See src/backend/access/brin/README for details.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *      src/backend/access/brin/brin.c
+ *
+ * TODO
+ *      * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
+ */
+#include "postgres.h"
+
+#include "access/brin.h"
+#include "access/brin_page.h"
+#include "access/brin_pageops.h"
+#include "access/brin_xlog.h"
+#include "access/relation.h"
+#include "access/reloptions.h"
+#include "access/relscan.h"
+#include "access/table.h"
+#include "access/tableam.h"
+#include "access/xloginsert.h"
+#include "catalog/index.h"
+#include "catalog/pg_am.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/autovacuum.h"
+#include "storage/bufmgr.h"
+#include "storage/freespace.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+#include "utils/datum.h"
+#include "utils/guc.h"
+#include "utils/index_selfuncs.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+
+/*
+ * We use a BrinBuildState during initial construction of a BRIN index.
+ * The running state is kept in a BrinMemTuple.
+ */
+typedef struct BrinBuildState
+{
+    Relation    bs_irel;
+    int         bs_numtuples;
+    Buffer      bs_currentInsertBuf;
+    BlockNumber bs_pagesPerRange;
+    BlockNumber bs_currRangeStart;
+    BrinRevmap *bs_rmAccess;
+    BrinDesc   *bs_bdesc;
+    BrinMemTuple *bs_dtuple;
+} BrinBuildState;
+
+/*
+ * Struct used as "opaque" during index scans
+ */
+typedef struct BrinOpaque
+{
+    BlockNumber bo_pagesPerRange;
+    BrinRevmap *bo_rmAccess;
+    BrinDesc   *bo_bdesc;
+} BrinOpaque;
+
+#define BRIN_ALL_BLOCKRANGES    InvalidBlockNumber
+
+static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
+                                                  BrinRevmap *revmap, BlockNumber pagesPerRange);
+static void terminate_brin_buildstate(BrinBuildState *state);
+static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
+                          bool include_partial, double *numSummarized, double *numExisting);
+static void form_and_insert_tuple(BrinBuildState *state);
+static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
+                         BrinTuple *b);
+static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy);
+static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc,
+                                BrinMemTuple *dtup, Datum *values, bool *nulls);
+static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys);
+
+/*
+ * BRIN handler function: return IndexAmRoutine with access method parameters
+ * and callbacks.
+ */
+Datum
+brinhandler(PG_FUNCTION_ARGS)
+{
+    IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
+
+    amroutine->amstrategies = 0;
+    amroutine->amsupport = BRIN_LAST_OPTIONAL_PROCNUM;
+    amroutine->amoptsprocnum = BRIN_PROCNUM_OPTIONS;
+    amroutine->amcanorder = false;
+    amroutine->amcanorderbyop = false;
+    amroutine->amcanbackward = false;
+    amroutine->amcanunique = false;
+    amroutine->amcanmulticol = true;
+    amroutine->amoptionalkey = true;
+    amroutine->amsearcharray = false;
+    amroutine->amsearchnulls = true;
+    amroutine->amstorage = true;
+    amroutine->amclusterable = false;
+    amroutine->ampredlocks = false;
+    amroutine->amcanparallel = false;
+    amroutine->amcaninclude = false;
+    amroutine->amusemaintenanceworkmem = false;
+    amroutine->amsummarizing = true;
+    amroutine->amparallelvacuumoptions =
+        VACUUM_OPTION_PARALLEL_CLEANUP;
+    amroutine->amkeytype = InvalidOid;
+
+    amroutine->ambuild = brinbuild;
+    amroutine->ambuildempty = brinbuildempty;
+    amroutine->aminsert = brininsert;
+    amroutine->ambulkdelete = brinbulkdelete;
+    amroutine->amvacuumcleanup = brinvacuumcleanup;
+    amroutine->amcanreturn = NULL;
+    amroutine->amcostestimate = brincostestimate;
+    amroutine->amoptions = brinoptions;
+    amroutine->amproperty = NULL;
+    amroutine->ambuildphasename = NULL;
+    amroutine->amvalidate = brinvalidate;
+    amroutine->amadjustmembers = NULL;
+    amroutine->ambeginscan = brinbeginscan;
+    amroutine->amrescan = brinrescan;
+    amroutine->amgettuple = NULL;
+    amroutine->amgetbitmap = bringetbitmap;
+    amroutine->amendscan = brinendscan;
+    amroutine->ammarkpos = NULL;
+    amroutine->amrestrpos = NULL;
+    amroutine->amestimateparallelscan = NULL;
+    amroutine->aminitparallelscan = NULL;
+    amroutine->amparallelrescan = NULL;
+
+    PG_RETURN_POINTER(amroutine);
+}
+
+/*
+ * A tuple in the heap is being inserted. To keep a brin index up to date,
+ * we need to obtain the relevant index tuple and compare its stored values
+ * with those of the new tuple. If the tuple values are not consistent with
+ * the summary tuple, we need to update the index tuple.
+ *
+ * If autosummarization is enabled, check if we need to summarize the previous
+ * page range.
+ *
+ * If the range is not currently summarized (i.e. the revmap returns NULL for
+ * it), there's nothing to do for this tuple.
+ */
+bool
+brininsert(Relation idxRel, Datum *values, bool *nulls,
+           ItemPointer heaptid, Relation heapRel,
+           IndexUniqueCheck checkUnique,
+           bool indexUnchanged,
+           IndexInfo *indexInfo)
+{
+    BlockNumber pagesPerRange;
+    BlockNumber origHeapBlk;
+    BlockNumber heapBlk;
+    BrinDesc   *bdesc = (BrinDesc *) indexInfo->ii_AmCache;
+    BrinRevmap *revmap;
+    Buffer      buf = InvalidBuffer;
+    MemoryContext tupcxt = NULL;
+    MemoryContext oldcxt = CurrentMemoryContext;
+    bool        autosummarize = BrinGetAutoSummarize(idxRel);
+
+    revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL);
+
+    /*
+     * origHeapBlk is the block number where the insertion occurred. heapBlk
+     * is the first block in the corresponding page range.
+     */
+    origHeapBlk = ItemPointerGetBlockNumber(heaptid);
+    heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange;
+
+    for (;;)
+    {
+        bool        need_insert = false;
+        OffsetNumber off;
+        BrinTuple  *brtup;
+        BrinMemTuple *dtup;
+
+        CHECK_FOR_INTERRUPTS();
+
+        /*
+         * If auto-summarization is enabled and we just inserted the first
+         * tuple into the first block of a new non-first page range, request a
+         * summarization run of the previous range.
+         */
+        if (autosummarize &&
+            heapBlk > 0 &&
+            heapBlk == origHeapBlk &&
+            ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber)
+        {
+            BlockNumber lastPageRange = heapBlk - 1;
+            BrinTuple  *lastPageTuple;
+
+            lastPageTuple =
+                brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off,
+                                         NULL, BUFFER_LOCK_SHARE, NULL);
+            if (!lastPageTuple)
+            {
+                bool        recorded;
+
+                recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange,
+                                                 RelationGetRelid(idxRel),
+                                                 lastPageRange);
+                if (!recorded)
+                    ereport(LOG,
+                            (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                             errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded",
+                                    RelationGetRelationName(idxRel),
+                                    lastPageRange)));
+            }
+            else
+                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+        }
+
+        brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off,
+                                         NULL, BUFFER_LOCK_SHARE, NULL);
+
+        /* if range is unsummarized, there's nothing to do */
+        if (!brtup)
+            break;
+
+        /* First time through in this statement? */
+        if (bdesc == NULL)
+        {
+            MemoryContextSwitchTo(indexInfo->ii_Context);
+            bdesc = brin_build_desc(idxRel);
+            indexInfo->ii_AmCache = (void *) bdesc;
+            MemoryContextSwitchTo(oldcxt);
+        }
+        /* First time through in this brininsert call? */
+        if (tupcxt == NULL)
+        {
+            tupcxt = AllocSetContextCreate(CurrentMemoryContext,
+                                           "brininsert cxt",
+                                           ALLOCSET_DEFAULT_SIZES);
+            MemoryContextSwitchTo(tupcxt);
+        }
+
+        dtup = brin_deform_tuple(bdesc, brtup, NULL);
+
+        need_insert = add_values_to_range(idxRel, bdesc, dtup, values, nulls);
+
+        if (!need_insert)
+        {
+            /*
+             * The tuple is consistent with the new values, so there's nothing
+             * to do.
+             */
+            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+        }
+        else
+        {
+            Page        page = BufferGetPage(buf);
+            ItemId      lp = PageGetItemId(page, off);
+            Size        origsz;
+            BrinTuple  *origtup;
+            Size        newsz;
+            BrinTuple  *newtup;
+            bool        samepage;
+
+            /*
+             * Make a copy of the old tuple, so that we can compare it after
+             * re-acquiring the lock.
+             */
+            origsz = ItemIdGetLength(lp);
+            origtup = brin_copy_tuple(brtup, origsz, NULL, NULL);
+
+            /*
+             * Before releasing the lock, check if we can attempt a same-page
+             * update. Another process could insert a tuple concurrently in
+             * the same page though, so downstream we must be prepared to cope
+             * if this turns out to not be possible after all.
+             */
+            newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);
+            samepage = brin_can_do_samepage_update(buf, origsz, newsz);
+            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+            /*
+             * Try to update the tuple. If this doesn't work for whatever
+             * reason, we need to restart from the top; the revmap might be
+             * pointing at a different tuple for this block now, so we need to
+             * recompute to ensure both our new heap tuple and the other
+             * inserter's are covered by the combined tuple. It might be that
+             * we don't need to update at all.
+             */
+            if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,
+                               buf, off, origtup, origsz, newtup, newsz,
+                               samepage))
+            {
+                /* no luck; start over */
+                MemoryContextResetAndDeleteChildren(tupcxt);
+                continue;
+            }
+        }
+
+        /* success! */
+        break;
+    }
+
+    brinRevmapTerminate(revmap);
+    if (BufferIsValid(buf))
+        ReleaseBuffer(buf);
+    MemoryContextSwitchTo(oldcxt);
+    if (tupcxt != NULL)
+        MemoryContextDelete(tupcxt);
+
+    return false;
+}
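[Editor's note] The heapBlk computation in brininsert relies on integer division to round a heap block number down to the first block of its page range. A minimal standalone sketch of the same arithmetic, with a hypothetical helper name and the default pages_per_range of 128 (illustrative, not part of the patch):

#include <assert.h>

/* Hypothetical helper: first block of the range containing "blk". */
static unsigned
range_start(unsigned blk, unsigned pages_per_range)
{
    return (blk / pages_per_range) * pages_per_range;
}

int
main(void)
{
    /* Ranges cover blocks 0-127, 128-255, 256-383, ... so block 300
     * belongs to the range starting at block 256. */
    assert(range_start(300, 128) == 256);
    assert(range_start(127, 128) == 0);
    assert(range_start(0, 128) == 0);
    return 0;
}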
+/*
+ * Initialize state for a BRIN index scan.
+ *
+ * We read the metapage here to determine the pages-per-range number that this
+ * index was built with. Note that since this cannot be changed while we're
+ * holding a lock on the index, it's not necessary to recompute it during
+ * brinrescan.
+ */
+IndexScanDesc
+brinbeginscan(Relation r, int nkeys, int norderbys)
+{
+    IndexScanDesc scan;
+    BrinOpaque *opaque;
+
+    scan = RelationGetIndexScan(r, nkeys, norderbys);
+
+    opaque = palloc_object(BrinOpaque);
+    opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange,
+                                               scan->xs_snapshot);
+    opaque->bo_bdesc = brin_build_desc(r);
+    scan->opaque = opaque;
+
+    return scan;
+}
+
+/*
+ * Execute the index scan.
+ *
+ * This works by reading index TIDs from the revmap, and obtaining the index
+ * tuples pointed to by them; the summary values in the index tuples are
+ * compared to the scan keys. We return into the TID bitmap all the pages in
+ * ranges corresponding to index tuples that match the scan keys.
+ *
+ * If a TID from the revmap is read as InvalidTID, we know that range is
+ * unsummarized. Pages in those ranges need to be returned regardless of scan
+ * keys.
+ */
+int64
+bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
+{
+    Relation    idxRel = scan->indexRelation;
+    Buffer      buf = InvalidBuffer;
+    BrinDesc   *bdesc;
+    Oid         heapOid;
+    Relation    heapRel;
+    BrinOpaque *opaque;
+    BlockNumber nblocks;
+    BlockNumber heapBlk;
+    int         totalpages = 0;
+    FmgrInfo   *consistentFn;
+    MemoryContext oldcxt;
+    MemoryContext perRangeCxt;
+    BrinMemTuple *dtup;
+    BrinTuple  *btup = NULL;
+    Size        btupsz = 0;
+    ScanKey   **keys,
+              **nullkeys;
+    int        *nkeys,
+               *nnullkeys;
+    char       *ptr;
+    Size        len;
+    char       *tmp PG_USED_FOR_ASSERTS_ONLY;
+
+    opaque = (BrinOpaque *) scan->opaque;
+    bdesc = opaque->bo_bdesc;
+    pgstat_count_index_scan(idxRel);
+
+    /*
+     * We need to know the size of the table so that we know how long to
+     * iterate on the revmap.
+     */
+    heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);
+    heapRel = table_open(heapOid, AccessShareLock);
+    nblocks = RelationGetNumberOfBlocks(heapRel);
+    table_close(heapRel, AccessShareLock);
+
+    /*
+     * Make room for the consistent support procedures of indexed columns. We
+     * don't look them up here; we do that lazily the first time we see a scan
+     * key referencing each of them. We rely on zeroing fn_oid to InvalidOid.
+     */
+    consistentFn = palloc0_array(FmgrInfo, bdesc->bd_tupdesc->natts);
+
+    /*
+     * Make room for per-attribute lists of scan keys that we'll pass to the
+     * consistent support procedure. We don't know which attributes have scan
+     * keys, so we allocate space for all attributes. That may use more memory
+     * but it's probably cheaper than determining which attributes are used.
+     *
+     * We keep null and regular keys separate, so that we can pass just the
+     * regular keys to the consistent function easily.
+     *
+     * To reduce the allocation overhead, we allocate one big chunk and then
+     * carve it into smaller arrays ourselves. All the pieces have exactly the
+     * same lifetime, so that's OK.
+     *
+     * XXX The widest index can have 32 attributes, so the amount of wasted
+     * memory is negligible. We could invent a more compact approach (with
+     * just space for used attributes) but that would make the matching more
+     * complex so it's not a good trade-off.
+     */
+    len =
+        MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) +   /* regular keys */
+        MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
+        MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts) +
+        MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) +   /* NULL keys */
+        MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
+        MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
+
+    ptr = palloc(len);
+    tmp = ptr;
+
+    keys = (ScanKey **) ptr;
+    ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
+
+    nullkeys = (ScanKey **) ptr;
+    ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
+
+    nkeys = (int *) ptr;
+    ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
+
+    nnullkeys = (int *) ptr;
+    ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
+
+    for (int i = 0; i < bdesc->bd_tupdesc->natts; i++)
+    {
+        keys[i] = (ScanKey *) ptr;
+        ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
+
+        nullkeys[i] = (ScanKey *) ptr;
+        ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
+    }
+
+    Assert(tmp + len == ptr);
+
+    /* zero the number of keys */
+    memset(nkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
+    memset(nnullkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
+
+    /* Preprocess the scan keys - split them into per-attribute arrays. */
+    for (int keyno = 0; keyno < scan->numberOfKeys; keyno++)
+    {
+        ScanKey     key = &scan->keyData[keyno];
+        AttrNumber  keyattno = key->sk_attno;
+
+        /*
+         * The collation of the scan key must match the collation used in the
+         * index column (but only if the search is not IS NULL / IS NOT NULL).
+         * Otherwise we shouldn't be using this index ...
+         */
+        Assert((key->sk_flags & SK_ISNULL) ||
+               (key->sk_collation ==
+                TupleDescAttr(bdesc->bd_tupdesc,
+                              keyattno - 1)->attcollation));
+
+        /*
+         * First time we see this index attribute, so init as needed.
+         *
+         * This is a bit of overkill - we don't know how many scan keys there
+         * are for this attribute, so we simply allocate the largest number
+         * possible (as if all keys were for this attribute). This may waste a
+         * bit of memory, but we only expect a small number of scan keys in
+         * general, so this should be negligible, and repeated repalloc calls
+         * are not free either.
+         */
+        if (consistentFn[keyattno - 1].fn_oid == InvalidOid)
+        {
+            FmgrInfo   *tmp;
+
+            /* First time we see this attribute, so no key/null keys. */
+            Assert(nkeys[keyattno - 1] == 0);
+            Assert(nnullkeys[keyattno - 1] == 0);
+
+            tmp = index_getprocinfo(idxRel, keyattno,
+                                    BRIN_PROCNUM_CONSISTENT);
+            fmgr_info_copy(&consistentFn[keyattno - 1], tmp,
+                           CurrentMemoryContext);
+        }
+
+        /* Add key to the proper per-attribute array. */
+        if (key->sk_flags & SK_ISNULL)
+        {
+            nullkeys[keyattno - 1][nnullkeys[keyattno - 1]] = key;
+            nnullkeys[keyattno - 1]++;
+        }
+        else
+        {
+            keys[keyattno - 1][nkeys[keyattno - 1]] = key;
+            nkeys[keyattno - 1]++;
+        }
+    }
+
+    /* allocate an initial in-memory tuple, out of the per-range memcxt */
+    dtup = brin_new_memtuple(bdesc);
+
+    /*
+     * Set up and use a per-range memory context, which is reset every time we
+     * loop below. This avoids having to free the tuples within the loop.
+     */
+    perRangeCxt = AllocSetContextCreate(CurrentMemoryContext,
+                                        "bringetbitmap cxt",
+                                        ALLOCSET_DEFAULT_SIZES);
+    oldcxt = MemoryContextSwitchTo(perRangeCxt);
+
+    /*
+     * Now scan the revmap. We start by querying for heap page 0,
+     * incrementing by the number of pages per range; this gives us a full
+     * view of the table.
+     */
+    for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)
+    {
+        bool        addrange;
+        bool        gottuple = false;
+        BrinTuple  *tup;
+        OffsetNumber off;
+        Size        size;
+
+        CHECK_FOR_INTERRUPTS();
+
+        MemoryContextResetAndDeleteChildren(perRangeCxt);
+
+        tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf,
+                                       &off, &size, BUFFER_LOCK_SHARE,
+                                       scan->xs_snapshot);
+        if (tup)
+        {
+            gottuple = true;
+            btup = brin_copy_tuple(tup, size, btup, &btupsz);
+            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+        }
+
+        /*
+         * For page ranges with no indexed tuple, we must return the whole
+         * range; otherwise, compare it to the scan keys.
+         */
+        if (!gottuple)
+        {
+            addrange = true;
+        }
+        else
+        {
+            dtup = brin_deform_tuple(bdesc, btup, dtup);
+            if (dtup->bt_placeholder)
+            {
+                /*
+                 * Placeholder tuples are always returned, regardless of the
+                 * values stored in them.
+                 */
+                addrange = true;
+            }
+            else
+            {
+                int         attno;
+
+                /*
+                 * Compare scan keys with summary values stored for the range.
+                 * If scan keys are matched, the page range must be added to
+                 * the bitmap. We initially assume the range needs to be
+                 * added; in particular this serves the case where there are
+                 * no keys.
+                 */
+                addrange = true;
+                for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++)
+                {
+                    BrinValues *bval;
+                    Datum       add;
+                    Oid         collation;
+
+                    /*
+                     * skip attributes without any scan keys (both regular and
+                     * IS [NOT] NULL)
+                     */
+                    if (nkeys[attno - 1] == 0 && nnullkeys[attno - 1] == 0)
+                        continue;
+
+                    bval = &dtup->bt_columns[attno - 1];
+
+                    /*
+                     * If the BRIN tuple indicates that this range is empty,
+                     * we can skip it: there's nothing to match. We don't
+                     * need to examine the next columns.
+                     */
+                    if (dtup->bt_empty_range)
+                    {
+                        addrange = false;
+                        break;
+                    }
+
+                    /*
+                     * First check if there are any IS [NOT] NULL scan keys,
+                     * and if we're violating them. In that case we can
+                     * terminate early, without invoking the support function.
+                     *
+                     * As there may be more keys, we can only determine
+                     * mismatch within this loop.
+                     */
+                    if (bdesc->bd_info[attno - 1]->oi_regular_nulls &&
+                        !check_null_keys(bval, nullkeys[attno - 1],
+                                         nnullkeys[attno - 1]))
+                    {
+                        /*
+                         * If any of the IS [NOT] NULL keys failed, the page
+                         * range as a whole can't pass. So terminate the loop.
+                         */
+                        addrange = false;
+                        break;
+                    }
+
+                    /*
+                     * So either there are no IS [NOT] NULL keys, or all
+                     * passed. If there are no regular scan keys, we're done -
+                     * the page range matches. If there are regular keys, but
+                     * the page range is marked as 'all nulls', it can't
+                     * possibly pass (we're assuming the operators are
+                     * strict).
+                     */
+
+                    /* No regular scan keys - page range as a whole passes. */
+                    if (!nkeys[attno - 1])
+                        continue;
+
+                    Assert((nkeys[attno - 1] > 0) &&
+                           (nkeys[attno - 1] <= scan->numberOfKeys));
+
+                    /* If it is all nulls, it cannot possibly be consistent. */
+                    if (bval->bv_allnulls)
+                    {
+                        addrange = false;
+                        break;
+                    }
+
+                    /*
+                     * Collation from the first key (has to be the same for
+                     * all keys for the same attribute).
+                     */
+                    collation = keys[attno - 1][0]->sk_collation;
+
+                    /*
+                     * Check whether the scan key is consistent with the page
+                     * range values; if so, have the pages in the range added
+                     * to the output bitmap.
+                     *
+                     * The opclass may or may not support processing of
+                     * multiple scan keys. We can determine that based on the
+                     * number of arguments - functions with an extra parameter
+                     * (number of scan keys) do support this, otherwise we
+                     * have to simply pass the scan keys one by one.
+                     */
+                    if (consistentFn[attno - 1].fn_nargs >= 4)
+                    {
+                        /* Check all keys at once */
+                        add = FunctionCall4Coll(&consistentFn[attno - 1],
+                                                collation,
+                                                PointerGetDatum(bdesc),
+                                                PointerGetDatum(bval),
+                                                PointerGetDatum(keys[attno - 1]),
+                                                Int32GetDatum(nkeys[attno - 1]));
+                        addrange = DatumGetBool(add);
+                    }
+                    else
+                    {
+                        /*
+                         * Check keys one by one
+                         *
+                         * When there are multiple scan keys, failure to meet
+                         * the criteria for a single one of them is enough to
+                         * discard the range as a whole, so break out of the
+                         * loop as soon as a false return value is obtained.
+                         */
+                        int         keyno;
+
+                        for (keyno = 0; keyno < nkeys[attno - 1]; keyno++)
+                        {
+                            add = FunctionCall3Coll(&consistentFn[attno - 1],
+                                                    keys[attno - 1][keyno]->sk_collation,
+                                                    PointerGetDatum(bdesc),
+                                                    PointerGetDatum(bval),
+                                                    PointerGetDatum(keys[attno - 1][keyno]));
+                            addrange = DatumGetBool(add);
+                            if (!addrange)
+                                break;
+                        }
+                    }
+
+                    /*
+                     * If we found a scan key eliminating the range, no need
+                     * to check additional ones.
+                     */
+                    if (!addrange)
+                        break;
+                }
+            }
+        }
+
+        /* add the pages in the range to the output bitmap, if needed */
+        if (addrange)
+        {
+            BlockNumber pageno;
+
+            for (pageno = heapBlk;
+                 pageno <= Min(nblocks, heapBlk + opaque->bo_pagesPerRange) - 1;
+                 pageno++)
+            {
+                MemoryContextSwitchTo(oldcxt);
+                tbm_add_page(tbm, pageno);
+                totalpages++;
+                MemoryContextSwitchTo(perRangeCxt);
+            }
+        }
+    }
+
+    MemoryContextSwitchTo(oldcxt);
+    MemoryContextDelete(perRangeCxt);
+
+    if (buf != InvalidBuffer)
+        ReleaseBuffer(buf);
+
+    /*
+     * XXX We have an approximation of the number of *pages* that our scan
+     * returns, but we don't have a precise idea of the number of heap tuples
+     * involved.
+     */
+    return totalpages * 10;
+}
+
+/*
+ * Re-initialize state for a BRIN index scan
+ */
+void
+brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
+           ScanKey orderbys, int norderbys)
+{
+    /*
+     * Other index AMs preprocess the scan keys at this point, or sometime
+     * early during the scan; this lets them optimize by removing redundant
+     * keys, or doing early returns when they are impossible to satisfy; see
+     * _bt_preprocess_keys for an example. Something like that could be added
+     * here someday, too.
+     */
+
+    if (scankey && scan->numberOfKeys > 0)
+        memmove(scan->keyData, scankey,
+                scan->numberOfKeys * sizeof(ScanKeyData));
+}
+
+/*
+ * Close down a BRIN index scan
+ */
+void
+brinendscan(IndexScanDesc scan)
+{
+    BrinOpaque *opaque = (BrinOpaque *) scan->opaque;
+
+    brinRevmapTerminate(opaque->bo_rmAccess);
+    brin_free_desc(opaque->bo_bdesc);
+    pfree(opaque);
+}
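[Editor's note] When bringetbitmap adds a matching range to the bitmap, it clamps the last, possibly partial, range with Min(nblocks, heapBlk + pagesPerRange) - 1 so that it never emits page numbers past the end of the table. A small self-contained sketch of that clamping, with hypothetical names and illustrative numbers:

#include <assert.h>

/* Hypothetical helper: how many pages of the range starting at
 * "range_start" actually exist in a table of "nblocks" pages. */
static unsigned
pages_in_range(unsigned range_start, unsigned pages_per_range, unsigned nblocks)
{
    unsigned end = range_start + pages_per_range;   /* exclusive */

    if (end > nblocks)
        end = nblocks;          /* clamp the partial range at table end */
    return end - range_start;
}

int
main(void)
{
    /* A 1000-block table with 128-page ranges: the last range starts at
     * block 896 and covers only 104 pages (896..999). */
    assert(pages_in_range(896, 128, 1000) == 104);
    assert(pages_in_range(0, 128, 1000) == 128);
    return 0;
}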
+/*
+ * Per-heap-tuple callback for table_index_build_scan.
+ *
+ * Note we don't worry about the page range at the end of the table here; it is
+ * present in the build state struct after we're called the last time, but not
+ * inserted into the index. Caller must ensure to do so, if appropriate.
+ */
+static void
+brinbuildCallback(Relation index,
+                  ItemPointer tid,
+                  Datum *values,
+                  bool *isnull,
+                  bool tupleIsAlive,
+                  void *brstate)
+{
+    BrinBuildState *state = (BrinBuildState *) brstate;
+    BlockNumber thisblock;
+
+    thisblock = ItemPointerGetBlockNumber(tid);
+
+    /*
+     * If we're in a block that belongs to a future range, summarize what
+     * we've got and start afresh. Note the scan might have skipped many
+     * pages, if they were devoid of live tuples; make sure to insert index
+     * tuples for those too.
+     */
+    while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
+    {
+        BRIN_elog((DEBUG2,
+                   "brinbuildCallback: completed a range: %u--%u",
+                   state->bs_currRangeStart,
+                   state->bs_currRangeStart + state->bs_pagesPerRange));
+
+        /* create the index tuple and insert it */
+        form_and_insert_tuple(state);
+
+        /* set state to correspond to the next range */
+        state->bs_currRangeStart += state->bs_pagesPerRange;
+
+        /* re-initialize state for it */
+        brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
+    }
+
+    /* Accumulate the current tuple into the running state */
+    (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
+                               values, isnull);
+}
+
+/*
+ * brinbuild() -- build a new BRIN index.
+ */
+IndexBuildResult *
+brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
+{
+    IndexBuildResult *result;
+    double      reltuples;
+    double      idxtuples;
+    BrinRevmap *revmap;
+    BrinBuildState *state;
+    Buffer      meta;
+    BlockNumber pagesPerRange;
+
+    /*
+     * We expect to be called exactly once for any index relation.
+     */
+    if (RelationGetNumberOfBlocks(index) != 0)
+        elog(ERROR, "index \"%s\" already contains data",
+             RelationGetRelationName(index));
+
+    /*
+     * Critical section not required, because on error the creation of the
+     * whole relation will be rolled back.
+     */
+
+    meta = ExtendBufferedRel(BMR_REL(index), MAIN_FORKNUM, NULL,
+                             EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
+    Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);
+
+    brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
+                       BRIN_CURRENT_VERSION);
+    MarkBufferDirty(meta);
+
+    if (RelationNeedsWAL(index))
+    {
+        xl_brin_createidx xlrec;
+        XLogRecPtr  recptr;
+        Page        page;
+
+        xlrec.version = BRIN_CURRENT_VERSION;
+        xlrec.pagesPerRange = BrinGetPagesPerRange(index);
+
+        XLogBeginInsert();
+        XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx);
+        XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT | REGBUF_STANDARD);
+
+        recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX);
+
+        page = BufferGetPage(meta);
+        PageSetLSN(page, recptr);
+    }
+
+    UnlockReleaseBuffer(meta);
+
+    /*
+     * Initialize our state, including the deformed tuple state.
+     */
+    revmap = brinRevmapInitialize(index, &pagesPerRange, NULL);
+    state = initialize_brin_buildstate(index, revmap, pagesPerRange);
+
+    /*
+     * Now scan the relation. No syncscan allowed here because we want the
+     * heap blocks in physical order.
+     */
+    reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
+                                       brinbuildCallback, (void *) state, NULL);
+
+    /* process the final batch */
+    form_and_insert_tuple(state);
+
+    /* release resources */
+    idxtuples = state->bs_numtuples;
+    brinRevmapTerminate(state->bs_rmAccess);
+    terminate_brin_buildstate(state);
+
+    /*
+     * Return statistics
+     */
+    result = palloc_object(IndexBuildResult);
+
+    result->heap_tuples = reltuples;
+    result->index_tuples = idxtuples;
+
+    return result;
+}
+
+void
+brinbuildempty(Relation index)
+{
+    Buffer      metabuf;
+
+    /* An empty BRIN index has a metapage only. */
+    metabuf = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
+                                EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
+
+    /* Initialize and xlog metabuffer. */
+    START_CRIT_SECTION();
+    brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index),
+                       BRIN_CURRENT_VERSION);
+    MarkBufferDirty(metabuf);
+    log_newpage_buffer(metabuf, true);
+    END_CRIT_SECTION();
+
+    UnlockReleaseBuffer(metabuf);
+}
+
+/*
+ * brinbulkdelete
+ *      Since there are no per-heap-tuple index tuples in BRIN indexes,
+ *      there's not a lot we can do here.
+ *
+ * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap
+ * tuple is deleted), meaning the need to re-run summarization on the affected
+ * range. Would need to add an extra flag in brintuples for that.
+ */
+IndexBulkDeleteResult *
+brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+               IndexBulkDeleteCallback callback, void *callback_state)
+{
+    /* allocate stats if first time through, else re-use existing struct */
+    if (stats == NULL)
+        stats = palloc0_object(IndexBulkDeleteResult);
+
+    return stats;
+}
+
+/*
+ * This routine is in charge of "vacuuming" a BRIN index: we just summarize
+ * ranges that are currently unsummarized.
+ */
+IndexBulkDeleteResult *
+brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
+{
+    Relation    heapRel;
+
+    /* No-op in ANALYZE ONLY mode */
+    if (info->analyze_only)
+        return stats;
+
+    if (!stats)
+        stats = palloc0_object(IndexBulkDeleteResult);
+    stats->num_pages = RelationGetNumberOfBlocks(info->index);
+    /* rest of stats is initialized by zeroing */
+
+    heapRel = table_open(IndexGetRelation(RelationGetRelid(info->index), false),
+                         AccessShareLock);
+
+    brin_vacuum_scan(info->index, info->strategy);
+
+    brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, false,
+                  &stats->num_index_tuples, &stats->num_index_tuples);
+
+    table_close(heapRel, AccessShareLock);
+
+    return stats;
+}
+
+/*
+ * reloptions processor for BRIN indexes
+ */
+bytea *
+brinoptions(Datum reloptions, bool validate)
+{
+    static const relopt_parse_elt tab[] = {
+        {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)},
+        {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)}
+    };
+
+    return (bytea *) build_reloptions(reloptions, validate,
+                                      RELOPT_KIND_BRIN,
+                                      sizeof(BrinOptions),
+                                      tab, lengthof(tab));
+}
+
+/*
+ * SQL-callable function to scan through an index and summarize all ranges
+ * that are not currently summarized.
+ */
+Datum
+brin_summarize_new_values(PG_FUNCTION_ARGS)
+{
+    Datum       relation = PG_GETARG_DATUM(0);
+
+    return DirectFunctionCall2(brin_summarize_range,
+                               relation,
+                               Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES));
+}
+
+/*
+ * SQL-callable function to summarize the indicated page range, if not already
+ * summarized. If the second argument is BRIN_ALL_BLOCKRANGES, all
+ * unsummarized ranges are summarized.
+ */
+Datum
+brin_summarize_range(PG_FUNCTION_ARGS)
+{
+    Oid         indexoid = PG_GETARG_OID(0);
+    int64       heapBlk64 = PG_GETARG_INT64(1);
+    BlockNumber heapBlk;
+    Oid         heapoid;
+    Relation    indexRel;
+    Relation    heapRel;
+    Oid         save_userid;
+    int         save_sec_context;
+    int         save_nestlevel;
+    double      numSummarized = 0;
+
+    if (RecoveryInProgress())
+        ereport(ERROR,
+                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                 errmsg("recovery is in progress"),
+                 errhint("BRIN control functions cannot be executed during recovery.")));
+
+    if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0)
+        ereport(ERROR,
+                (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+                 errmsg("block number out of range: %lld",
+                        (long long) heapBlk64)));
+    heapBlk = (BlockNumber) heapBlk64;
+
+    /*
+     * We must lock table before index to avoid deadlocks. However, if the
+     * passed indexoid isn't an index then IndexGetRelation() will fail.
+     * Rather than emitting a not-very-helpful error message, postpone
+     * complaining, expecting that the is-it-an-index test below will fail.
+     */
+    heapoid = IndexGetRelation(indexoid, true);
+    if (OidIsValid(heapoid))
+    {
+        heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
+
+        /*
+         * Autovacuum calls us. For its benefit, switch to the table owner's
+         * userid, so that any index functions are run as that user. Also
+         * lock down security-restricted operations and arrange to make GUC
+         * variable changes local to this command. This is harmless, albeit
+         * unnecessary, when called from SQL, because we fail shortly if the
+         * user does not own the index.
+         */
+        GetUserIdAndSecContext(&save_userid, &save_sec_context);
+        SetUserIdAndSecContext(heapRel->rd_rel->relowner,
+                               save_sec_context | SECURITY_RESTRICTED_OPERATION);
+        save_nestlevel = NewGUCNestLevel();
+    }
+    else
+    {
+        heapRel = NULL;
+        /* Set these just to suppress "uninitialized variable" warnings */
+        save_userid = InvalidOid;
+        save_sec_context = -1;
+        save_nestlevel = -1;
+    }
+
+    indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
+
+    /* Must be a BRIN index */
+    if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
+        indexRel->rd_rel->relam != BRIN_AM_OID)
+        ereport(ERROR,
+                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+                 errmsg("\"%s\" is not a BRIN index",
+                        RelationGetRelationName(indexRel))));
+
+    /* User must own the index (comparable to privileges needed for VACUUM) */
+    if (heapRel != NULL && !object_ownercheck(RelationRelationId, indexoid, save_userid))
+        aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
+                       RelationGetRelationName(indexRel));
+
+    /*
+     * Since we did the IndexGetRelation call above without any lock, it's
+     * barely possible that a race against an index drop/recreation could have
+     * netted us the wrong table. Recheck.
+     */
+    if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
+        ereport(ERROR,
+                (errcode(ERRCODE_UNDEFINED_TABLE),
+                 errmsg("could not open parent table of index \"%s\"",
+                        RelationGetRelationName(indexRel))));
+
+    /* see gin_clean_pending_list() */
+    if (indexRel->rd_index->indisvalid)
+        brinsummarize(indexRel, heapRel, heapBlk, true, &numSummarized, NULL);
+    else
+        ereport(DEBUG1,
+                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                 errmsg("index \"%s\" is not valid",
+                        RelationGetRelationName(indexRel))));
+
+    /* Roll back any GUC changes executed by index functions */
+    AtEOXact_GUC(false, save_nestlevel);
+
+    /* Restore userid and security context */
+    SetUserIdAndSecContext(save_userid, save_sec_context);
+
+    relation_close(indexRel, ShareUpdateExclusiveLock);
+    relation_close(heapRel, ShareUpdateExclusiveLock);
+
+    PG_RETURN_INT32((int32) numSummarized);
+}
+
+/*
+ * SQL-callable interface to mark a range as no longer summarized
+ */
+Datum
+brin_desummarize_range(PG_FUNCTION_ARGS)
+{
+    Oid         indexoid = PG_GETARG_OID(0);
+    int64       heapBlk64 = PG_GETARG_INT64(1);
+    BlockNumber heapBlk;
+    Oid         heapoid;
+    Relation    heapRel;
+    Relation    indexRel;
+    bool        done;
+
+    if (RecoveryInProgress())
+        ereport(ERROR,
+                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                 errmsg("recovery is in progress"),
+                 errhint("BRIN control functions cannot be executed during recovery.")));
+
+    if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0)
+        ereport(ERROR,
+                (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+                 errmsg("block number out of range: %lld",
+                        (long long) heapBlk64)));
+    heapBlk = (BlockNumber) heapBlk64;
+
+    /*
+     * We must lock table before index to avoid deadlocks. However, if the
+     * passed indexoid isn't an index then IndexGetRelation() will fail.
+     * Rather than emitting a not-very-helpful error message, postpone
+     * complaining, expecting that the is-it-an-index test below will fail.
+     *
+     * Unlike brin_summarize_range(), autovacuum never calls this. Hence, we
+     * don't switch userid.
+     */
+    heapoid = IndexGetRelation(indexoid, true);
+    if (OidIsValid(heapoid))
+        heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
+    else
+        heapRel = NULL;
+
+    indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
+
+    /* Must be a BRIN index */
+    if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
+        indexRel->rd_rel->relam != BRIN_AM_OID)
+        ereport(ERROR,
+                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+                 errmsg("\"%s\" is not a BRIN index",
+                        RelationGetRelationName(indexRel))));
+
+    /* User must own the index (comparable to privileges needed for VACUUM) */
+    if (!object_ownercheck(RelationRelationId, indexoid, GetUserId()))
+        aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
+                       RelationGetRelationName(indexRel));
+
+    /*
+     * Since we did the IndexGetRelation call above without any lock, it's
+     * barely possible that a race against an index drop/recreation could have
+     * netted us the wrong table. Recheck.
+     */
+    if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
+        ereport(ERROR,
+                (errcode(ERRCODE_UNDEFINED_TABLE),
+                 errmsg("could not open parent table of index \"%s\"",
+                        RelationGetRelationName(indexRel))));
+
+    /* see gin_clean_pending_list() */
+    if (indexRel->rd_index->indisvalid)
+    {
+        /* the revmap does the hard work */
+        do
+        {
+            done = brinRevmapDesummarizeRange(indexRel, heapBlk);
+        }
+        while (!done);
+    }
+    else
+        ereport(DEBUG1,
+                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                 errmsg("index \"%s\" is not valid",
+                        RelationGetRelationName(indexRel))));
+
+    relation_close(indexRel, ShareUpdateExclusiveLock);
+    relation_close(heapRel, ShareUpdateExclusiveLock);
+
+    PG_RETURN_VOID();
+}
+
+/*
+ * Build a BrinDesc used to create or scan a BRIN index
+ */
+BrinDesc *
+brin_build_desc(Relation rel)
+{
+    BrinOpcInfo **opcinfo;
+    BrinDesc   *bdesc;
+    TupleDesc   tupdesc;
+    int         totalstored = 0;
+    int         keyno;
+    long        totalsize;
+    MemoryContext cxt;
+    MemoryContext oldcxt;
+
+    cxt = AllocSetContextCreate(CurrentMemoryContext,
+                                "brin desc cxt",
+                                ALLOCSET_SMALL_SIZES);
+    oldcxt = MemoryContextSwitchTo(cxt);
+    tupdesc = RelationGetDescr(rel);
+
+    /*
+     * Obtain BrinOpcInfo for each indexed column. While at it, accumulate
+     * the number of columns stored, since the number is opclass-defined.
+     */
+    opcinfo = palloc_array(BrinOpcInfo *, tupdesc->natts);
+    for (keyno = 0; keyno < tupdesc->natts; keyno++)
+    {
+        FmgrInfo   *opcInfoFn;
+        Form_pg_attribute attr = TupleDescAttr(tupdesc, keyno);
+
+        opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO);
+
+        opcinfo[keyno] = (BrinOpcInfo *)
+            DatumGetPointer(FunctionCall1(opcInfoFn, attr->atttypid));
+        totalstored += opcinfo[keyno]->oi_nstored;
+    }
+
+    /* Allocate our result struct and fill it in */
+    totalsize = offsetof(BrinDesc, bd_info) +
+        sizeof(BrinOpcInfo *) * tupdesc->natts;
+
+    bdesc = palloc(totalsize);
+    bdesc->bd_context = cxt;
+    bdesc->bd_index = rel;
+    bdesc->bd_tupdesc = tupdesc;
+    bdesc->bd_disktdesc = NULL; /* generated lazily */
+    bdesc->bd_totalstored = totalstored;
+
+    for (keyno = 0; keyno < tupdesc->natts; keyno++)
+        bdesc->bd_info[keyno] = opcinfo[keyno];
+    pfree(opcinfo);
+
+    MemoryContextSwitchTo(oldcxt);
+
+    return bdesc;
+}
+
+void
+brin_free_desc(BrinDesc *bdesc)
+{
+    /* make sure the tupdesc is still valid */
+    Assert(bdesc->bd_tupdesc->tdrefcount >= 1);
+    /* no need for retail pfree */
+    MemoryContextDelete(bdesc->bd_context);
+}
+
+/*
+ * Fetch index's statistical data into *stats
+ */
+void
+brinGetStats(Relation index, BrinStatsData *stats)
+{
+    Buffer      metabuffer;
+    Page        metapage;
+    BrinMetaPageData *metadata;
+
+    metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
+    LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
+    metapage = BufferGetPage(metabuffer);
+    metadata = (BrinMetaPageData *) PageGetContents(metapage);
+
+    stats->pagesPerRange = metadata->pagesPerRange;
+    stats->revmapNumPages = metadata->lastRevmapPage - 1;
+
+    UnlockReleaseBuffer(metabuffer);
+}
+
+/*
+ * Initialize a BrinBuildState appropriate to create tuples on the given index.
+ */
+static BrinBuildState *
+initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
+                           BlockNumber pagesPerRange)
+{
+    BrinBuildState *state;
+
+    state = palloc_object(BrinBuildState);
+
+    state->bs_irel = idxRel;
+    state->bs_numtuples = 0;
+    state->bs_currentInsertBuf = InvalidBuffer;
+    state->bs_pagesPerRange = pagesPerRange;
+    state->bs_currRangeStart = 0;
+    state->bs_rmAccess = revmap;
+    state->bs_bdesc = brin_build_desc(idxRel);
+    state->bs_dtuple = brin_new_memtuple(state->bs_bdesc);
+
+    return state;
+}
+
+/*
+ * Release resources associated with a BrinBuildState.
+ */
+static void
+terminate_brin_buildstate(BrinBuildState *state)
+{
+    /*
+     * Release the last index buffer used. We might as well ensure that
+     * whatever free space remains in that page is available in FSM, too.
+     */
+    if (!BufferIsInvalid(state->bs_currentInsertBuf))
+    {
+        Page        page;
+        Size        freespace;
+        BlockNumber blk;
+
+        page = BufferGetPage(state->bs_currentInsertBuf);
+        freespace = PageGetFreeSpace(page);
+        blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
+        ReleaseBuffer(state->bs_currentInsertBuf);
+        RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
+        FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
+    }
+
+    brin_free_desc(state->bs_bdesc);
+    pfree(state->bs_dtuple);
+    pfree(state);
+}
+
+/*
+ * On the given BRIN index, summarize the heap page range that corresponds
+ * to the heap block number given.
+ *
+ * This routine can run in parallel with insertions into the heap. To avoid
+ * missing those values from the summary tuple, we first insert a placeholder
+ * index tuple into the index, then execute the heap scan; transactions
+ * concurrent with the scan update the placeholder tuple. After the scan, we
+ * union the placeholder tuple with the one computed by this routine. The
+ * update of the index value happens in a loop, so that if somebody updates
+ * the placeholder tuple after we read it, we detect the case and try again.
+ * This ensures that the concurrently inserted tuples are not lost.
+ *
+ * A further corner case is this routine being asked to summarize the partial
+ * range at the end of the table. heapNumBlocks is the (possibly outdated)
+ * table size; if we notice that the requested range lies beyond that size,
+ * we re-compute the table size after inserting the placeholder tuple, to
+ * avoid missing pages that were appended recently.
+ */
+static void
+summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
+                BlockNumber heapBlk, BlockNumber heapNumBlks)
+{
+    Buffer      phbuf;
+    BrinTuple  *phtup;
+    Size        phsz;
+    OffsetNumber offset;
+    BlockNumber scanNumBlks;
+
+    /*
+     * Insert the placeholder tuple
+     */
+    phbuf = InvalidBuffer;
+    phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);
+    offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,
+                           state->bs_rmAccess, &phbuf,
+                           heapBlk, phtup, phsz);
+
+    /*
+     * Compute range end. We hold ShareUpdateExclusive lock on table, so it
+     * cannot shrink concurrently (but it can grow).
+     */
+    Assert(heapBlk % state->bs_pagesPerRange == 0);
+    if (heapBlk + state->bs_pagesPerRange > heapNumBlks)
+    {
+        /*
+         * If we're asked to scan what we believe to be the final range on the
+         * table (i.e. a range that might be partial) we need to recompute our
+         * idea of what the latest page is after inserting the placeholder
+         * tuple. Anyone that grows the table later will update the
+         * placeholder tuple, so it doesn't matter that we won't scan these
+         * pages ourselves. Careful: the table might have been extended
+         * beyond the current range, so clamp our result.
+         *
+         * Fortunately, this should occur infrequently.
+         */
+        scanNumBlks = Min(RelationGetNumberOfBlocks(heapRel) - heapBlk,
+                          state->bs_pagesPerRange);
+    }
+    else
+    {
+        /* Easy case: range is known to be complete */
+        scanNumBlks = state->bs_pagesPerRange;
+    }
+
+    /*
+     * Execute the partial heap scan covering the heap blocks in the specified
+     * page range, summarizing the heap tuples in it. This scan stops just
+     * short of brinbuildCallback creating the new index entry.
+     *
+     * Note that it is critical we use the "any visible" mode of
+     * table_index_build_range_scan here: otherwise, we would miss tuples
+     * inserted by transactions that are still in progress, among other corner
+     * cases.
+     */
+    state->bs_currRangeStart = heapBlk;
+    table_index_build_range_scan(heapRel, state->bs_irel, indexInfo, false, true, false,
+                                 heapBlk, scanNumBlks,
+                                 brinbuildCallback, (void *) state, NULL);
+
+    /*
+     * Now we update the values obtained by the scan with the placeholder
+     * tuple. We do this in a loop which only terminates if we're able to
+     * update the placeholder tuple successfully; if we are not, this means
+     * somebody else modified the placeholder tuple after we read it.
+     */
+    for (;;)
+    {
+        BrinTuple  *newtup;
+        Size        newsize;
+        bool        didupdate;
+        bool        samepage;
+
+        CHECK_FOR_INTERRUPTS();
+
+        /*
+         * Update the summary tuple and try to update.
+         */
+        newtup = brin_form_tuple(state->bs_bdesc,
+                                 heapBlk, state->bs_dtuple, &newsize);
+        samepage = brin_can_do_samepage_update(phbuf, phsz, newsize);
+        didupdate =
+            brin_doupdate(state->bs_irel, state->bs_pagesPerRange,
+                          state->bs_rmAccess, heapBlk, phbuf, offset,
+                          phtup, phsz, newtup, newsize, samepage);
+        brin_free_tuple(phtup);
+        brin_free_tuple(newtup);
+
+        /* If the update succeeded, we're done. */
+        if (didupdate)
+            break;
+
+        /*
+         * If the update didn't work, it might be because somebody updated the
+         * placeholder tuple concurrently. Extract the new version, union it
+         * with the values we have from the scan, and start over. (There are
+         * other reasons for the update to fail, but it's simple to treat them
+         * the same.)
+         */
+        phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
+                                         &offset, &phsz, BUFFER_LOCK_SHARE,
+                                         NULL);
+        /* the placeholder tuple must exist */
+        if (phtup == NULL)
+            elog(ERROR, "missing placeholder tuple");
+        phtup = brin_copy_tuple(phtup, phsz, NULL, NULL);
+        LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);
+
+        /* merge it into the tuple from the heap scan */
+        union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);
+    }
+
+    ReleaseBuffer(phbuf);
+}
+
+/*
+ * Summarize page ranges that are not already summarized. If pageRange is
+ * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the
+ * page range containing the given heap page number is scanned.
+ * If include_partial is true, then the partial range at the end of the table
+ * is summarized, otherwise not.
+ *
+ * For each new index tuple inserted, *numSummarized (if not NULL) is
+ * incremented; for each existing tuple, *numExisting (if not NULL) is
+ * incremented.
+ */
+static void
+brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
+              bool include_partial, double *numSummarized, double *numExisting)
+{
+    BrinRevmap *revmap;
+    BrinBuildState *state = NULL;
+    IndexInfo  *indexInfo = NULL;
+    BlockNumber heapNumBlocks;
+    BlockNumber pagesPerRange;
+    Buffer      buf;
+    BlockNumber startBlk;
+
+    revmap = brinRevmapInitialize(index, &pagesPerRange, NULL);
+
+    /* determine range of pages to process */
+    heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
+    if (pageRange == BRIN_ALL_BLOCKRANGES)
+        startBlk = 0;
+    else
+    {
+        startBlk = (pageRange / pagesPerRange) * pagesPerRange;
+        heapNumBlocks = Min(heapNumBlocks, startBlk + pagesPerRange);
+    }
+    if (startBlk > heapNumBlocks)
+    {
+        /* Nothing to do if start point is beyond end of table */
+        brinRevmapTerminate(revmap);
+        return;
+    }
+
+    /*
+     * Scan the revmap to find unsummarized items.
+     */
+    buf = InvalidBuffer;
+    for (; startBlk < heapNumBlocks; startBlk += pagesPerRange)
+    {
+        BrinTuple  *tup;
+        OffsetNumber off;
+
+        /*
+         * Unless requested to summarize even a partial range, go away now if
+         * we think the next range is partial. Callers pass true when
+         * summarization typically runs once bulk data loading is done
+         * (brin_summarize_new_values), and false when it is typically the
+         * result of an arbitrarily-scheduled maintenance command (vacuuming).
+         */
+        if (!include_partial &&
+            (startBlk + pagesPerRange > heapNumBlocks))
+            break;
+
+        CHECK_FOR_INTERRUPTS();
+
+        tup = brinGetTupleForHeapBlock(revmap, startBlk, &buf, &off, NULL,
+                                       BUFFER_LOCK_SHARE, NULL);
+        if (tup == NULL)
+        {
+            /* no revmap entry for this heap range. Summarize it. */
+            if (state == NULL)
+            {
+                /* first time through */
+                Assert(!indexInfo);
+                state = initialize_brin_buildstate(index, revmap,
+                                                   pagesPerRange);
+                indexInfo = BuildIndexInfo(index);
+            }
+            summarize_range(indexInfo, state, heapRel, startBlk, heapNumBlocks);
+
+            /* and re-initialize state for the next range */
+            brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
+
+            if (numSummarized)
+                *numSummarized += 1.0;
+        }
+        else
+        {
+            if (numExisting)
+                *numExisting += 1.0;
+            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+        }
+    }
+
+    if (BufferIsValid(buf))
+        ReleaseBuffer(buf);
+
+    /* free resources */
+    brinRevmapTerminate(revmap);
+    if (state)
+    {
+        terminate_brin_buildstate(state);
+        pfree(indexInfo);
+    }
+}
+
+/*
+ * Given a deformed tuple in the build state, convert it into the on-disk
+ * format and insert it into the index, making the revmap point to it.
+ */
+static void
+form_and_insert_tuple(BrinBuildState *state)
+{
+    BrinTuple  *tup;
+    Size        size;
+
+    tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
+                          state->bs_dtuple, &size);
+    brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
+                  &state->bs_currentInsertBuf, state->bs_currRangeStart,
+                  tup, size);
+    state->bs_numtuples++;
+
+    pfree(tup);
+}
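[Editor's note] union_tuples below merges the summary extracted from a concurrently updated placeholder tuple into the summary computed by the scan, delegating the per-column merge to the opclass's BRIN_PROCNUM_UNION support procedure. As a rough intuition only, for a minmax-style opclass that merge amounts to widening one interval to cover the other; here is a hedged standalone sketch with plain ints standing in for Datums (not an actual opclass implementation):

#include <assert.h>

/* Hypothetical stand-in for a minmax column summary. */
typedef struct
{
    int min;
    int max;
} interval_t;

/* Widen interval "a" so it also covers interval "b". */
static void
interval_union(interval_t *a, const interval_t *b)
{
    if (b->min < a->min)
        a->min = b->min;
    if (b->max > a->max)
        a->max = b->max;
}

int
main(void)
{
    interval_t a = {10, 20};
    const interval_t b = {5, 15};

    interval_union(&a, &b);
    assert(a.min == 5 && a.max == 20);
    return 0;
}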
+/*
+ * Given two deformed tuples, adjust the first one so that it's consistent
+ * with the summary values in both.
+ */
+static void
+union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
+{
+    int         keyno;
+    BrinMemTuple *db;
+    MemoryContext cxt;
+    MemoryContext oldcxt;
+
+    /* Use our own memory context to avoid retail pfree */
+    cxt = AllocSetContextCreate(CurrentMemoryContext,
+                                "brin union",
+                                ALLOCSET_DEFAULT_SIZES);
+    oldcxt = MemoryContextSwitchTo(cxt);
+    db = brin_deform_tuple(bdesc, b, NULL);
+    MemoryContextSwitchTo(oldcxt);
+
+    /*
+     * Check if the ranges are empty.
+     *
+     * If at least one of them is empty, we don't need to call per-key union
+     * functions at all. If "b" is empty, we just use "a" as the result (it
+     * might be empty too, but that's fine). If "a" is empty but "b" is not,
+     * we use "b" as the result (but we have to copy the data into "a" first).
+     *
+     * Only when both ranges are non-empty, we actually do the per-key merge.
+     */
+
+    /* If "b" is empty - ignore it and just use "a" (even if it's empty etc.). */
+    if (db->bt_empty_range)
+    {
+        /* skip the per-key merge */
+        MemoryContextDelete(cxt);
+        return;
+    }
+
+    /*
+     * Now we know "b" is not empty. If "a" is empty, then "b" is the result.
+     * But we need to copy the data from "b" to "a" first, because that's how
+     * we pass the result out.
+     *
+     * We have to copy all the global/per-key flags etc. too.
+     */
+    if (a->bt_empty_range)
+    {
+        for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
+        {
+            int         i;
+            BrinValues *col_a = &a->bt_columns[keyno];
+            BrinValues *col_b = &db->bt_columns[keyno];
+            BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
+
+            col_a->bv_allnulls = col_b->bv_allnulls;
+            col_a->bv_hasnulls = col_b->bv_hasnulls;
+
+            /* If "b" has no data, we're done. */
+            if (col_b->bv_allnulls)
+                continue;
+
+            for (i = 0; i < opcinfo->oi_nstored; i++)
+                col_a->bv_values[i] =
+                    datumCopy(col_b->bv_values[i],
+                              opcinfo->oi_typcache[i]->typbyval,
+                              opcinfo->oi_typcache[i]->typlen);
+        }
+
+        /* "a" started empty, but "b" was not empty, so remember that */
+        a->bt_empty_range = false;
+
+        /* skip the per-key merge */
+        MemoryContextDelete(cxt);
+        return;
+    }
+
+    /* Now we know neither range is empty. */
+    for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
+    {
+        FmgrInfo   *unionFn;
+        BrinValues *col_a = &a->bt_columns[keyno];
+        BrinValues *col_b = &db->bt_columns[keyno];
+        BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
+
+        if (opcinfo->oi_regular_nulls)
+        {
+            /* Does the "b" summary represent any NULL values? */
+            bool        b_has_nulls = (col_b->bv_hasnulls || col_b->bv_allnulls);
+
+            /* Adjust "hasnulls". */
+            if (!col_a->bv_allnulls && b_has_nulls)
+                col_a->bv_hasnulls = true;
+
+            /* If there are no values in B, there's nothing left to do. */
+            if (col_b->bv_allnulls)
+                continue;
+
+            /*
+             * Adjust "allnulls". If A doesn't have values, just copy the
+             * values from B into A, and we're done. We cannot run the
+             * operators in this case, because values in A might contain
+             * garbage. Note we already established that B contains values.
+             *
+             * Also adjust "hasnulls" in order not to forget the summary
+             * represents NULL values. This is not redundant with the earlier
+             * update, because that only happens when allnulls=false.
+             */
+            if (col_a->bv_allnulls)
+            {
+                int         i;
+
+                col_a->bv_allnulls = false;
+                col_a->bv_hasnulls = true;
+
+                for (i = 0; i < opcinfo->oi_nstored; i++)
+                    col_a->bv_values[i] =
+                        datumCopy(col_b->bv_values[i],
+                                  opcinfo->oi_typcache[i]->typbyval,
+                                  opcinfo->oi_typcache[i]->typlen);
+
+                continue;
+            }
+        }
+
+        unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1,
+                                    BRIN_PROCNUM_UNION);
+        FunctionCall3Coll(unionFn,
+                          bdesc->bd_index->rd_indcollation[keyno],
+                          PointerGetDatum(bdesc),
+                          PointerGetDatum(col_a),
+                          PointerGetDatum(col_b));
+    }
+
+    MemoryContextDelete(cxt);
+}
+
+/*
+ * brin_vacuum_scan
+ *      Do a complete scan of the index during VACUUM.
+ *
+ * This routine scans the complete index looking for uncatalogued index pages,
+ * i.e. those that might have been lost due to a crash after index extension
+ * and such.
+ */
+static void
+brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
+{
+    BlockNumber nblocks;
+    BlockNumber blkno;
+
+    /*
+     * Scan the index in physical order, and clean up any possible mess in
+     * each page.
+     */
+    nblocks = RelationGetNumberOfBlocks(idxrel);
+    for (blkno = 0; blkno < nblocks; blkno++)
+    {
+        Buffer      buf;
+
+        CHECK_FOR_INTERRUPTS();
+
+        buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno,
+                                 RBM_NORMAL, strategy);
+
+        brin_page_cleanup(idxrel, buf);
+
+        ReleaseBuffer(buf);
+    }
+
+    /*
+     * Update all upper pages in the index's FSM, as well. This ensures not
+     * only that we propagate leaf-page FSM updates made by brin_page_cleanup,
+     * but also that any pre-existing damage or out-of-dateness is repaired.
+     */
+    FreeSpaceMapVacuum(idxrel);
+}
+
+static bool
+add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup,
+                    Datum *values, bool *nulls)
+{
+    int         keyno;
+
+    /* If the range starts empty, we're certainly going to modify it. */
+    bool        modified = dtup->bt_empty_range;
+
+    /*
+     * Compare the key values of the new tuple to the stored index values; our
+     * deformed tuple will get updated if the new tuple doesn't fit the
+     * original range (note this means we can't break out of the loop early).
+     * Make a note of whether this happens, so that we know to insert the
+     * modified tuple later.
+     */
+    for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
+    {
+        Datum       result;
+        BrinValues *bval;
+        FmgrInfo   *addValue;
+        bool        has_nulls;
+
+        bval = &dtup->bt_columns[keyno];
+
+        /*
+         * Does the range have actual NULL values? Either of the flags can be
+         * set, but we ignore the state before adding the first row.
+         *
+         * We have to remember this, because we'll modify the flags and we
+         * need to know if the range started as empty.
+         */
+        has_nulls = ((!dtup->bt_empty_range) &&
+                     (bval->bv_hasnulls || bval->bv_allnulls));
+
+        /*
+         * If the value we're adding is NULL, handle it locally. Otherwise
+         * call the BRIN_PROCNUM_ADDVALUE procedure.
+         */
+        if (bdesc->bd_info[keyno]->oi_regular_nulls && nulls[keyno])
+        {
+            /*
+             * If the new value is null, we record that we saw it if it's the
+             * first one; otherwise, there's nothing to do.
+             */
+            if (!bval->bv_hasnulls)
+            {
+                bval->bv_hasnulls = true;
+                modified = true;
+            }
+
+            continue;
+        }
+
+        addValue = index_getprocinfo(idxRel, keyno + 1,
+                                     BRIN_PROCNUM_ADDVALUE);
+        result = FunctionCall4Coll(addValue,
+                                   idxRel->rd_indcollation[keyno],
+                                   PointerGetDatum(bdesc),
+                                   PointerGetDatum(bval),
+                                   values[keyno],
+                                   nulls[keyno]);
+        /* if that returned true, we need to insert the updated tuple */
+        modified |= DatumGetBool(result);
+
+        /*
+         * If the range had actual NULL values (i.e. did not start empty),
+         * make sure we don't forget about the NULL values. Either the
+         * allnulls flag is still set to true, or (if the opclass cleared it)
+         * we need to set hasnulls=true.
+         *
+         * XXX This can only happen when the opclass modified the tuple, so
+         * the modified flag should be set.
+         */
+        if (has_nulls && !(bval->bv_hasnulls || bval->bv_allnulls))
+        {
+            Assert(modified);
+            bval->bv_hasnulls = true;
+        }
+    }
+
+    /*
+     * After updating summaries for all the keys, mark it as not empty.
+     *
+     * If we're actually changing the flag value (i.e. tuple started as
+     * empty), we should have modified the tuple. So we should not see an
+     * empty range that was not modified.
+     */
+    Assert(!dtup->bt_empty_range || modified);
+    dtup->bt_empty_range = false;
+
+    return modified;
+}
+
+static bool
+check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)
+{
+    int         keyno;
+
+    /*
+     * First check if there are any IS [NOT] NULL scan keys, and if we're
+     * violating them.
+     */
+    for (keyno = 0; keyno < nnullkeys; keyno++)
+    {
+        ScanKey     key = nullkeys[keyno];
+
+        Assert(key->sk_attno == bval->bv_attno);
+
+        /* Handle only IS NULL/IS NOT NULL tests */
+        if (!(key->sk_flags & SK_ISNULL))
+            continue;
+
+        if (key->sk_flags & SK_SEARCHNULL)
+        {
+            /* IS NULL scan key, but range has no NULLs */
+            if (!bval->bv_allnulls && !bval->bv_hasnulls)
+                return false;
+        }
+        else if (key->sk_flags & SK_SEARCHNOTNULL)
+        {
+            /*
+             * For IS NOT NULL, we can only skip ranges that are known to have
+             * only nulls.
+             */
+            if (bval->bv_allnulls)
+                return false;
+        }
+        else
+        {
+            /*
+             * Neither IS NULL nor IS NOT NULL was used; assume all indexable
+             * operators are strict and thus return false with NULL value in
+             * the scan key.
+             */
+            return false;
+        }
+    }
+
+    return true;
+}
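[Editor's note] The IS [NOT] NULL handling in check_null_keys reduces to a small decision table over the range's two null-summary flags. A hedged standalone restatement of that logic, with hypothetical helper names (the real function also falls through for keys that are neither kind, assuming strict operators):

#include <assert.h>
#include <stdbool.h>

/* Can a range with the given null summary still satisfy IS NULL?
 * Only if the range contains at least one NULL. */
static bool
range_may_match_is_null(bool allnulls, bool hasnulls)
{
    return allnulls || hasnulls;
}

/* Can it still satisfy IS NOT NULL? Only an all-nulls range is skippable. */
static bool
range_may_match_is_not_null(bool allnulls)
{
    return !allnulls;
}

int
main(void)
{
    assert(!range_may_match_is_null(false, false)); /* no NULLs at all */
    assert(range_may_match_is_null(false, true));   /* some NULLs present */
    assert(range_may_match_is_not_null(false));     /* non-null values exist */
    assert(!range_may_match_is_not_null(true));     /* nothing but NULLs */
    return 0;
}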