diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
commit | 46651ce6fe013220ed397add242004d764fc0153 (patch) | |
tree | 6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/access/brin | |
parent | Initial commit. (diff) | |
download | postgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip |
Adding upstream version 14.5. (refs: upstream/14.5, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/access/brin')
-rw-r--r-- | src/backend/access/brin/Makefile | 27 | ||||
-rw-r--r-- | src/backend/access/brin/README | 189 | ||||
-rw-r--r-- | src/backend/access/brin/brin.c | 1800 | ||||
-rw-r--r-- | src/backend/access/brin/brin_bloom.c | 809 | ||||
-rw-r--r-- | src/backend/access/brin/brin_inclusion.c | 657 | ||||
-rw-r--r-- | src/backend/access/brin/brin_minmax.c | 317 | ||||
-rw-r--r-- | src/backend/access/brin/brin_minmax_multi.c | 3163 | ||||
-rw-r--r-- | src/backend/access/brin/brin_pageops.c | 920 | ||||
-rw-r--r-- | src/backend/access/brin/brin_revmap.c | 664 | ||||
-rw-r--r-- | src/backend/access/brin/brin_tuple.c | 708 | ||||
-rw-r--r-- | src/backend/access/brin/brin_validate.c | 281 | ||||
-rw-r--r-- | src/backend/access/brin/brin_xlog.c | 367 |
12 files changed, 9902 insertions, 0 deletions
diff --git a/src/backend/access/brin/Makefile b/src/backend/access/brin/Makefile new file mode 100644 index 0000000..a386cb7 --- /dev/null +++ b/src/backend/access/brin/Makefile @@ -0,0 +1,27 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/brin +# +# IDENTIFICATION +# src/backend/access/brin/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/brin +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + brin.o \ + brin_bloom.o \ + brin_inclusion.o \ + brin_minmax.o \ + brin_minmax_multi.o \ + brin_pageops.o \ + brin_revmap.o \ + brin_tuple.o \ + brin_validate.o \ + brin_xlog.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/brin/README b/src/backend/access/brin/README new file mode 100644 index 0000000..636d965 --- /dev/null +++ b/src/backend/access/brin/README @@ -0,0 +1,189 @@ +Block Range Indexes (BRIN) +========================== + +BRIN indexes intend to enable very fast scanning of extremely large tables. + +The essential idea of a BRIN index is to keep track of summarizing values in +consecutive groups of heap pages (page ranges); for example, the minimum and +maximum values for datatypes with a btree opclass, or the bounding box for +geometric types. These values can be used to avoid scanning such pages +during a table scan, depending on query quals. + +The cost of this is having to update the stored summary values of each page +range as tuples are inserted into them. + + +Access Method Design +-------------------- + +Since item pointers are not stored inside indexes of this type, it is not +possible to support the amgettuple interface. Instead, we only provide +amgetbitmap support. The amgetbitmap routine returns a lossy TIDBitmap +comprising all pages in those page ranges that match the query +qualifications. 
The recheck step in the BitmapHeapScan node prunes tuples +that are not visible according to the query qualifications. + +An operator class must have the following entries: + +- generic support procedures (pg_amproc), identical to all opclasses: + * "opcinfo" (BRIN_PROCNUM_OPCINFO) initializes a structure for index + creation or scanning + * "addValue" (BRIN_PROCNUM_ADDVALUE) takes an index tuple and a heap item, + and possibly changes the index tuple so that it includes the heap item + values + * "consistent" (BRIN_PROCNUM_CONSISTENT) takes an index tuple and query + quals, and returns whether the index tuple values match the query quals. + * "union" (BRIN_PROCNUM_UNION) takes two index tuples and modifies the first + one so that it represents the union of the two. +Procedure numbers up to 10 are reserved for future expansion. + +Additionally, each opclass needs additional support functions: +- Minmax-style operator classes: + * Proc numbers 11-14 are used for the functions implementing inequality + operators for the type, in this order: less than, less or equal, + greater or equal, greater than. + +Opclasses using a different design will require different additional procedure +numbers. + +Operator classes also need to have operator (pg_amop) entries so that the +optimizer can choose the index to execute queries. +- Minmax-style operator classes: + * The same operators as btree (<=, <, =, >=, >) + +Each index tuple stores some NULL bits and some opclass-specified values, which +are stored in a single null bitmask of length twice the number of columns. 
The +generic NULL bits indicate, for each column: + * bt_hasnulls: Whether there's any NULL value at all in the page range + * bt_allnulls: Whether all values are NULLs in the page range + +The opclass-specified values are: +- Minmax-style operator classes + * minimum value across all tuples in the range + * maximum value across all tuples in the range + +Note that the addValue and Union support procedures must be careful to +datumCopy() the values they want to store in the in-memory BRIN tuple, and +must pfree() the old copies when replacing older ones. Since some values +referenced from the tuple persist and others go away, there is no +well-defined lifetime for a memory context that would make this automatic. + + +The Range Map +------------- + +To find the index tuple for a particular page range, we have an internal +structure we call the range map, or "revmap" for short. This stores one TID +per page range, which is the address of the index tuple summarizing that +range. Since the map entries are fixed size, it is possible to compute the +address of the range map entry for any given heap page by simple arithmetic. + +When a new heap tuple is inserted in a summarized page range, we compare the +existing index tuple with the new heap tuple. If the heap tuple is outside +the summarization data given by the index tuple for any indexed column (or +if the new heap tuple contains null values but the index tuple indicates +there are no nulls), the index is updated with the new values. In many +cases it is possible to update the index tuple in-place, but if the new +index tuple is larger than the old one and there's not enough space in the +page, it is necessary to create a new index tuple with the new values. The +range map can be updated quickly to point to it; the old index tuple is +removed. + +If the range map points to an invalid TID, the corresponding page range is +considered to be not summarized. 
When tuples are added to unsummarized +pages, nothing needs to happen. + +To scan a table following a BRIN index, we scan the range map sequentially. +This yields index tuples in ascending page range order. Query quals are +matched to each index tuple; if they match, each page within the page range +is returned as part of the output TID bitmap. If there's no match, they are +skipped. Range map entries returning invalid index TIDs, that is +unsummarized page ranges, are also returned in the TID bitmap. + +The revmap is stored in the first few blocks of the index main fork, +immediately following the metapage. Whenever the revmap needs to be +extended by another page, existing tuples in that page are moved to some +other page. + +Heap tuples can be removed from anywhere without restriction. It might be +useful to mark the corresponding index tuple somehow, if the heap tuple is +one of the constraining values of the summary data (i.e. either min or max +in the case of a btree-opclass-bearing datatype), so that in the future we +are aware of the need to re-execute summarization on that range, leading to +a possible tightening of the summary values. + +Summarization +------------- + +At index creation time, the whole table is scanned; for each page range the +summarizing values of each indexed column and nulls bitmap are collected and +stored in the index. The partially-filled page range at the end of the +table is also summarized. + +As new tuples get inserted at the end of the table, they may update the +index tuple that summarizes the partial page range at the end. Eventually +that page range is complete and new tuples belong in a new page range that +hasn't yet been summarized. Those insertions do not create a new index +entry; instead, the page range remains unsummarized until later. + +Whenever VACUUM is run on the table, all unsummarized page ranges are +summarized. This action can also be invoked by the user via +brin_summarize_new_values(). 
Both these procedures scan all the +unsummarized ranges, and create a summary tuple. Again, this includes the +partially-filled page range at the end of the table. + +Vacuuming +--------- + +Since no heap TIDs are stored in a BRIN index, it's not necessary to scan the +index when heap tuples are removed. It might be that some summary values can +be tightened if heap tuples have been deleted; but this would represent an +optimization opportunity only, not a correctness issue. It's simpler to +represent this as the need to re-run summarization on the affected page range +rather than "subtracting" values from the existing one. This is not +currently implemented. + +Note that if there are no indexes on the table other than the BRIN index, +usage of maintenance_work_mem by vacuum can be decreased significantly, because +no detailed index scan needs to take place (and thus it's not necessary for +vacuum to save TIDs to remove). It's unlikely that BRIN would be the only +indexes in a table, though, because primary keys can be btrees only, and so +we don't implement this optimization. + + +Optimizer +--------- + +The optimizer selects the index based on the operator class' pg_amop +entries for the column. + + +Future improvements +------------------- + +* Different-size page ranges? + In the current design, each "index entry" in a BRIN index covers the same + number of pages. There's no hard reason for this; it might make sense to + allow the index to self-tune so that some index entries cover smaller page + ranges, if this allows the summary values to be more compact. This would incur + larger BRIN overhead for the index itself, but might allow better pruning of + page ranges during scan. 
In the limit of one index tuple per page, the index + itself would occupy too much space, even though we would be able to skip + reading the most heap pages, because the summary values are tight; in the + opposite limit of a single tuple that summarizes the whole table, we wouldn't + be able to prune anything even though the index is very small. This can + probably be made to work by using the range map as an index in itself. + +* More compact representation for TIDBitmap? + TIDBitmap is the structure used to represent bitmap scans. The + representation of lossy page ranges is not optimal for our purposes, because + it uses a Bitmapset to represent pages in the range; since we're going to return + all pages in a large range, it might be more convenient to allow for a + struct that uses start and end page numbers to represent the range, instead. + +* Better vacuuming? + It might be useful to enable passing more useful info to BRIN indexes during + vacuuming about tuples that are deleted, i.e. do not require the callback to + pass each tuple's TID. For instance we might need a callback that passes a + block number instead of a TID. That would help determine when to re-run + summarization on blocks that have seen lots of tuple deletions. diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c new file mode 100644 index 0000000..21a2384 --- /dev/null +++ b/src/backend/access/brin/brin.c @@ -0,0 +1,1800 @@ +/* + * brin.c + * Implementation of BRIN indexes for Postgres + * + * See src/backend/access/brin/README for details. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin.c + * + * TODO + * * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY) + */ +#include "postgres.h" + +#include "access/brin.h" +#include "access/brin_page.h" +#include "access/brin_pageops.h" +#include "access/brin_xlog.h" +#include "access/relation.h" +#include "access/reloptions.h" +#include "access/relscan.h" +#include "access/table.h" +#include "access/tableam.h" +#include "access/xloginsert.h" +#include "catalog/index.h" +#include "catalog/pg_am.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/autovacuum.h" +#include "storage/bufmgr.h" +#include "storage/freespace.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/index_selfuncs.h" +#include "utils/memutils.h" +#include "utils/rel.h" + + +/* + * We use a BrinBuildState during initial construction of a BRIN index. + * The running state is kept in a BrinMemTuple. 
+ */ +typedef struct BrinBuildState +{ + Relation bs_irel; + int bs_numtuples; + Buffer bs_currentInsertBuf; + BlockNumber bs_pagesPerRange; + BlockNumber bs_currRangeStart; + BrinRevmap *bs_rmAccess; + BrinDesc *bs_bdesc; + BrinMemTuple *bs_dtuple; +} BrinBuildState; + +/* + * Struct used as "opaque" during index scans + */ +typedef struct BrinOpaque +{ + BlockNumber bo_pagesPerRange; + BrinRevmap *bo_rmAccess; + BrinDesc *bo_bdesc; +} BrinOpaque; + +#define BRIN_ALL_BLOCKRANGES InvalidBlockNumber + +static BrinBuildState *initialize_brin_buildstate(Relation idxRel, + BrinRevmap *revmap, BlockNumber pagesPerRange); +static void terminate_brin_buildstate(BrinBuildState *state); +static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange, + bool include_partial, double *numSummarized, double *numExisting); +static void form_and_insert_tuple(BrinBuildState *state); +static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a, + BrinTuple *b); +static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy); +static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc, + BrinMemTuple *dtup, Datum *values, bool *nulls); +static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys); + +/* + * BRIN handler function: return IndexAmRoutine with access method parameters + * and callbacks. 
+ */ +Datum +brinhandler(PG_FUNCTION_ARGS) +{ + IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); + + amroutine->amstrategies = 0; + amroutine->amsupport = BRIN_LAST_OPTIONAL_PROCNUM; + amroutine->amoptsprocnum = BRIN_PROCNUM_OPTIONS; + amroutine->amcanorder = false; + amroutine->amcanorderbyop = false; + amroutine->amcanbackward = false; + amroutine->amcanunique = false; + amroutine->amcanmulticol = true; + amroutine->amoptionalkey = true; + amroutine->amsearcharray = false; + amroutine->amsearchnulls = true; + amroutine->amstorage = true; + amroutine->amclusterable = false; + amroutine->ampredlocks = false; + amroutine->amcanparallel = false; + amroutine->amcaninclude = false; + amroutine->amusemaintenanceworkmem = false; + amroutine->amparallelvacuumoptions = + VACUUM_OPTION_PARALLEL_CLEANUP; + amroutine->amkeytype = InvalidOid; + + amroutine->ambuild = brinbuild; + amroutine->ambuildempty = brinbuildempty; + amroutine->aminsert = brininsert; + amroutine->ambulkdelete = brinbulkdelete; + amroutine->amvacuumcleanup = brinvacuumcleanup; + amroutine->amcanreturn = NULL; + amroutine->amcostestimate = brincostestimate; + amroutine->amoptions = brinoptions; + amroutine->amproperty = NULL; + amroutine->ambuildphasename = NULL; + amroutine->amvalidate = brinvalidate; + amroutine->amadjustmembers = NULL; + amroutine->ambeginscan = brinbeginscan; + amroutine->amrescan = brinrescan; + amroutine->amgettuple = NULL; + amroutine->amgetbitmap = bringetbitmap; + amroutine->amendscan = brinendscan; + amroutine->ammarkpos = NULL; + amroutine->amrestrpos = NULL; + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; + + PG_RETURN_POINTER(amroutine); +} + +/* + * A tuple in the heap is being inserted. To keep a brin index up to date, + * we need to obtain the relevant index tuple and compare its stored values + * with those of the new tuple. 
If the tuple values are not consistent with + * the summary tuple, we need to update the index tuple. + * + * If autosummarization is enabled, check if we need to summarize the previous + * page range. + * + * If the range is not currently summarized (i.e. the revmap returns NULL for + * it), there's nothing to do for this tuple. + */ +bool +brininsert(Relation idxRel, Datum *values, bool *nulls, + ItemPointer heaptid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + IndexInfo *indexInfo) +{ + BlockNumber pagesPerRange; + BlockNumber origHeapBlk; + BlockNumber heapBlk; + BrinDesc *bdesc = (BrinDesc *) indexInfo->ii_AmCache; + BrinRevmap *revmap; + Buffer buf = InvalidBuffer; + MemoryContext tupcxt = NULL; + MemoryContext oldcxt = CurrentMemoryContext; + bool autosummarize = BrinGetAutoSummarize(idxRel); + + revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL); + + /* + * origHeapBlk is the block number where the insertion occurred. heapBlk + * is the first block in the corresponding page range. + */ + origHeapBlk = ItemPointerGetBlockNumber(heaptid); + heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange; + + for (;;) + { + bool need_insert = false; + OffsetNumber off; + BrinTuple *brtup; + BrinMemTuple *dtup; + + CHECK_FOR_INTERRUPTS(); + + /* + * If auto-summarization is enabled and we just inserted the first + * tuple into the first block of a new non-first page range, request a + * summarization run of the previous range. 
+ */ + if (autosummarize && + heapBlk > 0 && + heapBlk == origHeapBlk && + ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber) + { + BlockNumber lastPageRange = heapBlk - 1; + BrinTuple *lastPageTuple; + + lastPageTuple = + brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off, + NULL, BUFFER_LOCK_SHARE, NULL); + if (!lastPageTuple) + { + bool recorded; + + recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange, + RelationGetRelid(idxRel), + lastPageRange); + if (!recorded) + ereport(LOG, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded", + RelationGetRelationName(idxRel), + lastPageRange))); + } + else + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + + brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, + NULL, BUFFER_LOCK_SHARE, NULL); + + /* if range is unsummarized, there's nothing to do */ + if (!brtup) + break; + + /* First time through in this statement? */ + if (bdesc == NULL) + { + MemoryContextSwitchTo(indexInfo->ii_Context); + bdesc = brin_build_desc(idxRel); + indexInfo->ii_AmCache = (void *) bdesc; + MemoryContextSwitchTo(oldcxt); + } + /* First time through in this brininsert call? */ + if (tupcxt == NULL) + { + tupcxt = AllocSetContextCreate(CurrentMemoryContext, + "brininsert cxt", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(tupcxt); + } + + dtup = brin_deform_tuple(bdesc, brtup, NULL); + + need_insert = add_values_to_range(idxRel, bdesc, dtup, values, nulls); + + if (!need_insert) + { + /* + * The tuple is consistent with the new values, so there's nothing + * to do. + */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + else + { + Page page = BufferGetPage(buf); + ItemId lp = PageGetItemId(page, off); + Size origsz; + BrinTuple *origtup; + Size newsz; + BrinTuple *newtup; + bool samepage; + + /* + * Make a copy of the old tuple, so that we can compare it after + * re-acquiring the lock. 
+ */ + origsz = ItemIdGetLength(lp); + origtup = brin_copy_tuple(brtup, origsz, NULL, NULL); + + /* + * Before releasing the lock, check if we can attempt a same-page + * update. Another process could insert a tuple concurrently in + * the same page though, so downstream we must be prepared to cope + * if this turns out to not be possible after all. + */ + newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz); + samepage = brin_can_do_samepage_update(buf, origsz, newsz); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + /* + * Try to update the tuple. If this doesn't work for whatever + * reason, we need to restart from the top; the revmap might be + * pointing at a different tuple for this block now, so we need to + * recompute to ensure both our new heap tuple and the other + * inserter's are covered by the combined tuple. It might be that + * we don't need to update at all. + */ + if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk, + buf, off, origtup, origsz, newtup, newsz, + samepage)) + { + /* no luck; start over */ + MemoryContextResetAndDeleteChildren(tupcxt); + continue; + } + } + + /* success! */ + break; + } + + brinRevmapTerminate(revmap); + if (BufferIsValid(buf)) + ReleaseBuffer(buf); + MemoryContextSwitchTo(oldcxt); + if (tupcxt != NULL) + MemoryContextDelete(tupcxt); + + return false; +} + +/* + * Initialize state for a BRIN index scan. + * + * We read the metapage here to determine the pages-per-range number that this + * index was built with. Note that since this cannot be changed while we're + * holding lock on index, it's not necessary to recompute it during brinrescan. 
+ */ +IndexScanDesc +brinbeginscan(Relation r, int nkeys, int norderbys) +{ + IndexScanDesc scan; + BrinOpaque *opaque; + + scan = RelationGetIndexScan(r, nkeys, norderbys); + + opaque = (BrinOpaque *) palloc(sizeof(BrinOpaque)); + opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange, + scan->xs_snapshot); + opaque->bo_bdesc = brin_build_desc(r); + scan->opaque = opaque; + + return scan; +} + +/* + * Execute the index scan. + * + * This works by reading index TIDs from the revmap, and obtaining the index + * tuples pointed to by them; the summary values in the index tuples are + * compared to the scan keys. We return into the TID bitmap all the pages in + * ranges corresponding to index tuples that match the scan keys. + * + * If a TID from the revmap is read as InvalidTID, we know that range is + * unsummarized. Pages in those ranges need to be returned regardless of scan + * keys. + */ +int64 +bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm) +{ + Relation idxRel = scan->indexRelation; + Buffer buf = InvalidBuffer; + BrinDesc *bdesc; + Oid heapOid; + Relation heapRel; + BrinOpaque *opaque; + BlockNumber nblocks; + BlockNumber heapBlk; + int totalpages = 0; + FmgrInfo *consistentFn; + MemoryContext oldcxt; + MemoryContext perRangeCxt; + BrinMemTuple *dtup; + BrinTuple *btup = NULL; + Size btupsz = 0; + ScanKey **keys, + **nullkeys; + int *nkeys, + *nnullkeys; + int keyno; + char *ptr; + Size len; + char *tmp PG_USED_FOR_ASSERTS_ONLY; + + opaque = (BrinOpaque *) scan->opaque; + bdesc = opaque->bo_bdesc; + pgstat_count_index_scan(idxRel); + + /* + * We need to know the size of the table so that we know how long to + * iterate on the revmap. + */ + heapOid = IndexGetRelation(RelationGetRelid(idxRel), false); + heapRel = table_open(heapOid, AccessShareLock); + nblocks = RelationGetNumberOfBlocks(heapRel); + table_close(heapRel, AccessShareLock); + + /* + * Make room for the consistent support procedures of indexed columns. 
We + * don't look them up here; we do that lazily the first time we see a scan + * key reference each of them. We rely on zeroing fn_oid to InvalidOid. + */ + consistentFn = palloc0(sizeof(FmgrInfo) * bdesc->bd_tupdesc->natts); + + /* + * Make room for per-attribute lists of scan keys that we'll pass to the + * consistent support procedure. We don't know which attributes have scan + * keys, so we allocate space for all attributes. That may use more memory + * but it's probably cheaper than determining which attributes are used. + * + * We keep null and regular keys separate, so that we can pass just the + * regular keys to the consistent function easily. + * + * To reduce the allocation overhead, we allocate one big chunk and then + * carve it into smaller arrays ourselves. All the pieces have exactly the + * same lifetime, so that's OK. + * + * XXX The widest index can have 32 attributes, so the amount of wasted + * memory is negligible. We could invent a more compact approach (with + * just space for used attributes) but that would make the matching more + * complex so it's not a good trade-off. 
+ */ + len = + MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* regular keys */ + MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts + + MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts) + + MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* NULL keys */ + MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts + + MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts); + + ptr = palloc(len); + tmp = ptr; + + keys = (ScanKey **) ptr; + ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts); + + nullkeys = (ScanKey **) ptr; + ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts); + + nkeys = (int *) ptr; + ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts); + + nnullkeys = (int *) ptr; + ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts); + + for (int i = 0; i < bdesc->bd_tupdesc->natts; i++) + { + keys[i] = (ScanKey *) ptr; + ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys); + + nullkeys[i] = (ScanKey *) ptr; + ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys); + } + + Assert(tmp + len == ptr); + + /* zero the number of keys */ + memset(nkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts); + memset(nnullkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts); + + /* Preprocess the scan keys - split them into per-attribute arrays. */ + for (keyno = 0; keyno < scan->numberOfKeys; keyno++) + { + ScanKey key = &scan->keyData[keyno]; + AttrNumber keyattno = key->sk_attno; + + /* + * The collation of the scan key must match the collation used in the + * index column (but only if the search is not IS NULL/ IS NOT NULL). + * Otherwise we shouldn't be using this index ... + */ + Assert((key->sk_flags & SK_ISNULL) || + (key->sk_collation == + TupleDescAttr(bdesc->bd_tupdesc, + keyattno - 1)->attcollation)); + + /* + * First time we see this index attribute, so init as needed. 
+ * + * This is a bit of an overkill - we don't know how many scan keys are + * there for this attribute, so we simply allocate the largest number + * possible (as if all keys were for this attribute). This may waste a + * bit of memory, but we only expect small number of scan keys in + * general, so this should be negligible, and repeated repalloc calls + * are not free either. + */ + if (consistentFn[keyattno - 1].fn_oid == InvalidOid) + { + FmgrInfo *tmp; + + /* First time we see this attribute, so no key/null keys. */ + Assert(nkeys[keyattno - 1] == 0); + Assert(nnullkeys[keyattno - 1] == 0); + + tmp = index_getprocinfo(idxRel, keyattno, + BRIN_PROCNUM_CONSISTENT); + fmgr_info_copy(&consistentFn[keyattno - 1], tmp, + CurrentMemoryContext); + } + + /* Add key to the proper per-attribute array. */ + if (key->sk_flags & SK_ISNULL) + { + nullkeys[keyattno - 1][nnullkeys[keyattno - 1]] = key; + nnullkeys[keyattno - 1]++; + } + else + { + keys[keyattno - 1][nkeys[keyattno - 1]] = key; + nkeys[keyattno - 1]++; + } + } + + /* allocate an initial in-memory tuple, out of the per-range memcxt */ + dtup = brin_new_memtuple(bdesc); + + /* + * Setup and use a per-range memory context, which is reset every time we + * loop below. This avoids having to free the tuples within the loop. + */ + perRangeCxt = AllocSetContextCreate(CurrentMemoryContext, + "bringetbitmap cxt", + ALLOCSET_DEFAULT_SIZES); + oldcxt = MemoryContextSwitchTo(perRangeCxt); + + /* + * Now scan the revmap. We start by querying for heap page 0, + * incrementing by the number of pages per range; this gives us a full + * view of the table. 
+ */ + for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange) + { + bool addrange; + bool gottuple = false; + BrinTuple *tup; + OffsetNumber off; + Size size; + + CHECK_FOR_INTERRUPTS(); + + MemoryContextResetAndDeleteChildren(perRangeCxt); + + tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf, + &off, &size, BUFFER_LOCK_SHARE, + scan->xs_snapshot); + if (tup) + { + gottuple = true; + btup = brin_copy_tuple(tup, size, btup, &btupsz); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + + /* + * For page ranges with no indexed tuple, we must return the whole + * range; otherwise, compare it to the scan keys. + */ + if (!gottuple) + { + addrange = true; + } + else + { + dtup = brin_deform_tuple(bdesc, btup, dtup); + if (dtup->bt_placeholder) + { + /* + * Placeholder tuples are always returned, regardless of the + * values stored in them. + */ + addrange = true; + } + else + { + int attno; + + /* + * Compare scan keys with summary values stored for the range. + * If scan keys are matched, the page range must be added to + * the bitmap. We initially assume the range needs to be + * added; in particular this serves the case where there are + * no keys. + */ + addrange = true; + for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++) + { + BrinValues *bval; + Datum add; + Oid collation; + + /* + * skip attributes without any scan keys (both regular and + * IS [NOT] NULL) + */ + if (nkeys[attno - 1] == 0 && nnullkeys[attno - 1] == 0) + continue; + + bval = &dtup->bt_columns[attno - 1]; + + /* + * First check if there are any IS [NOT] NULL scan keys, + * and if we're violating them. In that case we can + * terminate early, without invoking the support function. + * + * As there may be more keys, we can only determine + * mismatch within this loop. 
+ */ + if (bdesc->bd_info[attno - 1]->oi_regular_nulls && + !check_null_keys(bval, nullkeys[attno - 1], + nnullkeys[attno - 1])) + { + /* + * If any of the IS [NOT] NULL keys failed, the page + * range as a whole can't pass. So terminate the loop. + */ + addrange = false; + break; + } + + /* + * So either there are no IS [NOT] NULL keys, or all + * passed. If there are no regular scan keys, we're done - + * the page range matches. If there are regular keys, but + * the page range is marked as 'all nulls' it can't + * possibly pass (we're assuming the operators are + * strict). + */ + + /* No regular scan keys - page range as a whole passes. */ + if (!nkeys[attno - 1]) + continue; + + Assert((nkeys[attno - 1] > 0) && + (nkeys[attno - 1] <= scan->numberOfKeys)); + + /* If it is all nulls, it cannot possibly be consistent. */ + if (bval->bv_allnulls) + { + addrange = false; + break; + } + + /* + * Collation from the first key (has to be the same for + * all keys for the same attribute). + */ + collation = keys[attno - 1][0]->sk_collation; + + /* + * Check whether the scan key is consistent with the page + * range values; if so, have the pages in the range added + * to the output bitmap. + * + * The opclass may or may not support processing of + * multiple scan keys. We can determine that based on the + * number of arguments - functions with extra parameter + * (number of scan keys) do support this, otherwise we + * have to simply pass the scan keys one by one. 
+ */ + if (consistentFn[attno - 1].fn_nargs >= 4) + { + /* Check all keys at once */ + add = FunctionCall4Coll(&consistentFn[attno - 1], + collation, + PointerGetDatum(bdesc), + PointerGetDatum(bval), + PointerGetDatum(keys[attno - 1]), + Int32GetDatum(nkeys[attno - 1])); + addrange = DatumGetBool(add); + } + else + { + /* + * Check keys one by one + * + * When there are multiple scan keys, failure to meet + * the criteria for a single one of them is enough to + * discard the range as a whole, so break out of the + * loop as soon as a false return value is obtained. + */ + int keyno; + + for (keyno = 0; keyno < nkeys[attno - 1]; keyno++) + { + add = FunctionCall3Coll(&consistentFn[attno - 1], + keys[attno - 1][keyno]->sk_collation, + PointerGetDatum(bdesc), + PointerGetDatum(bval), + PointerGetDatum(keys[attno - 1][keyno])); + addrange = DatumGetBool(add); + if (!addrange) + break; + } + } + } + } + } + + /* add the pages in the range to the output bitmap, if needed */ + if (addrange) + { + BlockNumber pageno; + + for (pageno = heapBlk; + pageno <= Min(nblocks, heapBlk + opaque->bo_pagesPerRange) - 1; + pageno++) + { + MemoryContextSwitchTo(oldcxt); + tbm_add_page(tbm, pageno); + totalpages++; + MemoryContextSwitchTo(perRangeCxt); + } + } + } + + MemoryContextSwitchTo(oldcxt); + MemoryContextDelete(perRangeCxt); + + if (buf != InvalidBuffer) + ReleaseBuffer(buf); + + /* + * XXX We have an approximation of the number of *pages* that our scan + * returns, but we don't have a precise idea of the number of heap tuples + * involved. 
+ */ + return totalpages * 10; +} + +/* + * Re-initialize state for a BRIN index scan + */ +void +brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, + ScanKey orderbys, int norderbys) +{ + /* + * Other index AMs preprocess the scan keys at this point, or sometime + * early during the scan; this lets them optimize by removing redundant + * keys, or doing early returns when they are impossible to satisfy; see + * _bt_preprocess_keys for an example. Something like that could be added + * here someday, too. + */ + + if (scankey && scan->numberOfKeys > 0) + memmove(scan->keyData, scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); +} + +/* + * Close down a BRIN index scan + */ +void +brinendscan(IndexScanDesc scan) +{ + BrinOpaque *opaque = (BrinOpaque *) scan->opaque; + + brinRevmapTerminate(opaque->bo_rmAccess); + brin_free_desc(opaque->bo_bdesc); + pfree(opaque); +} + +/* + * Per-heap-tuple callback for table_index_build_scan. + * + * Note we don't worry about the page range at the end of the table here; it is + * present in the build state struct after we're called the last time, but not + * inserted into the index. Caller must ensure to do so, if appropriate. + */ +static void +brinbuildCallback(Relation index, + ItemPointer tid, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *brstate) +{ + BrinBuildState *state = (BrinBuildState *) brstate; + BlockNumber thisblock; + + thisblock = ItemPointerGetBlockNumber(tid); + + /* + * If we're in a block that belongs to a future range, summarize what + * we've got and start afresh. Note the scan might have skipped many + * pages, if they were devoid of live tuples; make sure to insert index + * tuples for those too. 
+ */ + while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1) + { + + BRIN_elog((DEBUG2, + "brinbuildCallback: completed a range: %u--%u", + state->bs_currRangeStart, + state->bs_currRangeStart + state->bs_pagesPerRange)); + + /* create the index tuple and insert it */ + form_and_insert_tuple(state); + + /* set state to correspond to the next range */ + state->bs_currRangeStart += state->bs_pagesPerRange; + + /* re-initialize state for it */ + brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc); + } + + /* Accumulate the current tuple into the running state */ + (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple, + values, isnull); +} + +/* + * brinbuild() -- build a new BRIN index. + */ +IndexBuildResult * +brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) +{ + IndexBuildResult *result; + double reltuples; + double idxtuples; + BrinRevmap *revmap; + BrinBuildState *state; + Buffer meta; + BlockNumber pagesPerRange; + + /* + * We expect to be called exactly once for any index relation. + */ + if (RelationGetNumberOfBlocks(index) != 0) + elog(ERROR, "index \"%s\" already contains data", + RelationGetRelationName(index)); + + /* + * Critical section not required, because on error the creation of the + * whole relation will be rolled back. 
+ */ + + meta = ReadBuffer(index, P_NEW); + Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO); + LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE); + + brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index), + BRIN_CURRENT_VERSION); + MarkBufferDirty(meta); + + if (RelationNeedsWAL(index)) + { + xl_brin_createidx xlrec; + XLogRecPtr recptr; + Page page; + + xlrec.version = BRIN_CURRENT_VERSION; + xlrec.pagesPerRange = BrinGetPagesPerRange(index); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx); + XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT | REGBUF_STANDARD); + + recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX); + + page = BufferGetPage(meta); + PageSetLSN(page, recptr); + } + + UnlockReleaseBuffer(meta); + + /* + * Initialize our state, including the deformed tuple state. + */ + revmap = brinRevmapInitialize(index, &pagesPerRange, NULL); + state = initialize_brin_buildstate(index, revmap, pagesPerRange); + + /* + * Now scan the relation. No syncscan allowed here because we want the + * heap blocks in physical order. + */ + reltuples = table_index_build_scan(heap, index, indexInfo, false, true, + brinbuildCallback, (void *) state, NULL); + + /* process the final batch */ + form_and_insert_tuple(state); + + /* release resources */ + idxtuples = state->bs_numtuples; + brinRevmapTerminate(state->bs_rmAccess); + terminate_brin_buildstate(state); + + /* + * Return statistics + */ + result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + + result->heap_tuples = reltuples; + result->index_tuples = idxtuples; + + return result; +} + +void +brinbuildempty(Relation index) +{ + Buffer metabuf; + + /* An empty BRIN index has a metapage only. */ + metabuf = + ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* Initialize and xlog metabuffer. 
*/ + START_CRIT_SECTION(); + brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index), + BRIN_CURRENT_VERSION); + MarkBufferDirty(metabuf); + log_newpage_buffer(metabuf, true); + END_CRIT_SECTION(); + + UnlockReleaseBuffer(metabuf); +} + +/* + * brinbulkdelete + * Since there are no per-heap-tuple index tuples in BRIN indexes, + * there's not a lot we can do here. + * + * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap + * tuple is deleted), meaning the need to re-run summarization on the affected + * range. Would need to add an extra flag in brintuples for that. + */ +IndexBulkDeleteResult * +brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state) +{ + /* allocate stats if first time through, else re-use existing struct */ + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + + return stats; +} + +/* + * This routine is in charge of "vacuuming" a BRIN index: we just summarize + * ranges that are currently unsummarized. 
+ */ +IndexBulkDeleteResult * +brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) +{ + Relation heapRel; + + /* No-op in ANALYZE ONLY mode: the incoming stats pointer is returned as is */ + if (info->analyze_only) + return stats; + + if (!stats) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + stats->num_pages = RelationGetNumberOfBlocks(info->index); + /* + * rest of stats is initialized by zeroing; num_index_tuples is then + * accumulated by the brinsummarize call below, which is handed the same + * counter for both of its output arguments + */ + + heapRel = table_open(IndexGetRelation(RelationGetRelid(info->index), false), + AccessShareLock); + + brin_vacuum_scan(info->index, info->strategy); + + brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, false, + &stats->num_index_tuples, &stats->num_index_tuples); + + table_close(heapRel, AccessShareLock); + + return stats; +} + +/* + * reloptions processor for BRIN indexes + * + * Describes the pages_per_range (integer) and autosummarize (boolean) + * options by their offsets within BrinOptions and passes that table, + * together with sizeof(BrinOptions), to build_reloptions(); the result is + * returned as a bytea pointer. + */ +bytea * +brinoptions(Datum reloptions, bool validate) +{ + static const relopt_parse_elt tab[] = { + {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)}, + {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)} + }; + + return (bytea *) build_reloptions(reloptions, validate, + RELOPT_KIND_BRIN, + sizeof(BrinOptions), + tab, lengthof(tab)); +} + +/* + * SQL-callable function to scan through an index and summarize all ranges + * that are not currently summarized. + * + * Implemented by calling brin_summarize_range with the caller's first + * argument and BRIN_ALL_BLOCKRANGES as the block number. + */ +Datum +brin_summarize_new_values(PG_FUNCTION_ARGS) +{ + Datum relation = PG_GETARG_DATUM(0); + + return DirectFunctionCall2(brin_summarize_range, + relation, + Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES)); +} + +/* + * SQL-callable function to summarize the indicated page range, if not already + * summarized. If the second argument is BRIN_ALL_BLOCKRANGES, all + * unsummarized ranges are summarized.
+ */ +Datum +brin_summarize_range(PG_FUNCTION_ARGS) +{ + Oid indexoid = PG_GETARG_OID(0); + int64 heapBlk64 = PG_GETARG_INT64(1); + BlockNumber heapBlk; + Oid heapoid; + Relation indexRel; + Relation heapRel; + Oid save_userid; + int save_sec_context; + int save_nestlevel; + double numSummarized = 0; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("BRIN control functions cannot be executed during recovery."))); + + if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0) + { + char *blk = psprintf(INT64_FORMAT, heapBlk64); + + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("block number out of range: %s", blk))); + } + heapBlk = (BlockNumber) heapBlk64; + + /* + * We must lock table before index to avoid deadlocks. However, if the + * passed indexoid isn't an index then IndexGetRelation() will fail. + * Rather than emitting a not-very-helpful error message, postpone + * complaining, expecting that the is-it-an-index test below will fail. + * An invalid heapoid therefore just leaves heapRel NULL (else branch + * below). + */ + heapoid = IndexGetRelation(indexoid, true); + if (OidIsValid(heapoid)) + { + heapRel = table_open(heapoid, ShareUpdateExclusiveLock); + + /* + * Autovacuum calls us. For its benefit, switch to the table owner's + * userid, so that any index functions are run as that user. Also + * lock down security-restricted operations and arrange to make GUC + * variable changes local to this command. This is harmless, albeit + * unnecessary, when called from SQL, because we fail shortly if the + * user does not own the index.
+ */ + GetUserIdAndSecContext(&save_userid, &save_sec_context); + SetUserIdAndSecContext(heapRel->rd_rel->relowner, + save_sec_context | SECURITY_RESTRICTED_OPERATION); + save_nestlevel = NewGUCNestLevel(); + } + else + { + heapRel = NULL; + /* Set these just to suppress "uninitialized variable" warnings */ + save_userid = InvalidOid; + save_sec_context = -1; + save_nestlevel = -1; + } + + indexRel = index_open(indexoid, ShareUpdateExclusiveLock); + + /* Must be a BRIN index */ + if (indexRel->rd_rel->relkind != RELKIND_INDEX || + indexRel->rd_rel->relam != BRIN_AM_OID) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is not a BRIN index", + RelationGetRelationName(indexRel)))); + + /* + * User must own the index (comparable to privileges needed for VACUUM). + * The check uses save_userid, i.e. the user id that was in effect before + * the switch to the table owner above. + */ + if (heapRel != NULL && !pg_class_ownercheck(indexoid, save_userid)) + aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX, + RelationGetRelationName(indexRel)); + + /* + * Since we did the IndexGetRelation call above without any lock, it's + * barely possible that a race against an index drop/recreation could have + * netted us the wrong table. Recheck.
+ */ + if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_TABLE), + errmsg("could not open parent table of index \"%s\"", + RelationGetRelationName(indexRel)))); + + /* + * OK, do it: partial ranges are included, and only the count of newly + * summarized ranges is collected (the existing-ranges counter is NULL) + */ + brinsummarize(indexRel, heapRel, heapBlk, true, &numSummarized, NULL); + + /* Roll back any GUC changes executed by index functions */ + AtEOXact_GUC(false, save_nestlevel); + + /* Restore userid and security context */ + SetUserIdAndSecContext(save_userid, save_sec_context); + + relation_close(indexRel, ShareUpdateExclusiveLock); + relation_close(heapRel, ShareUpdateExclusiveLock); + + PG_RETURN_INT32((int32) numSummarized); +} + +/* + * SQL-callable interface to mark a range as no longer summarized + * + * Takes the index OID and a heap block number, which must lie between 0 and + * MaxBlockNumber; returns void. + */ +Datum +brin_desummarize_range(PG_FUNCTION_ARGS) +{ + Oid indexoid = PG_GETARG_OID(0); + int64 heapBlk64 = PG_GETARG_INT64(1); + BlockNumber heapBlk; + Oid heapoid; + Relation heapRel; + Relation indexRel; + bool done; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("BRIN control functions cannot be executed during recovery."))); + + if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0) + { + char *blk = psprintf(INT64_FORMAT, heapBlk64); + + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("block number out of range: %s", blk))); + } + heapBlk = (BlockNumber) heapBlk64; + + /* + * We must lock table before index to avoid deadlocks. However, if the + * passed indexoid isn't an index then IndexGetRelation() will fail. + * Rather than emitting a not-very-helpful error message, postpone + * complaining, expecting that the is-it-an-index test below will fail. + * + * Unlike brin_summarize_range(), autovacuum never calls this. Hence, we + * don't switch userid.
+ */ + heapoid = IndexGetRelation(indexoid, true); + if (OidIsValid(heapoid)) + heapRel = table_open(heapoid, ShareUpdateExclusiveLock); + else + heapRel = NULL; + + indexRel = index_open(indexoid, ShareUpdateExclusiveLock); + + /* Must be a BRIN index */ + if (indexRel->rd_rel->relkind != RELKIND_INDEX || + indexRel->rd_rel->relam != BRIN_AM_OID) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is not a BRIN index", + RelationGetRelationName(indexRel)))); + + /* User must own the index (comparable to privileges needed for VACUUM) */ + if (!pg_class_ownercheck(indexoid, GetUserId())) + aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX, + RelationGetRelationName(indexRel)); + + /* + * Since we did the IndexGetRelation call above without any lock, it's + * barely possible that a race against an index drop/recreation could have + * netted us the wrong table. Recheck. + */ + if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_TABLE), + errmsg("could not open parent table of index \"%s\"", + RelationGetRelationName(indexRel)))); + + /* the revmap does the hard work; keep calling until it reports done */ + do + { + done = brinRevmapDesummarizeRange(indexRel, heapBlk); + } + while (!done); + + relation_close(indexRel, ShareUpdateExclusiveLock); + relation_close(heapRel, ShareUpdateExclusiveLock); + + PG_RETURN_VOID(); +} + +/* + * Build a BrinDesc used to create or scan a BRIN index + * + * A new memory context (child of CurrentMemoryContext) is made current while + * the descriptor is built and is recorded in bd_context; brin_free_desc + * releases the descriptor by deleting that context. + */ +BrinDesc * +brin_build_desc(Relation rel) +{ + BrinOpcInfo **opcinfo; + BrinDesc *bdesc; + TupleDesc tupdesc; + int totalstored = 0; + int keyno; + long totalsize; + MemoryContext cxt; + MemoryContext oldcxt; + + cxt = AllocSetContextCreate(CurrentMemoryContext, + "brin desc cxt", + ALLOCSET_SMALL_SIZES); + oldcxt = MemoryContextSwitchTo(cxt); + tupdesc = RelationGetDescr(rel); + + /* + * Obtain BrinOpcInfo for each indexed column. While at it, accumulate + * the number of columns stored, since the number is opclass-defined.
+ */ + opcinfo = (BrinOpcInfo **) palloc(sizeof(BrinOpcInfo *) * tupdesc->natts); + for (keyno = 0; keyno < tupdesc->natts; keyno++) + { + FmgrInfo *opcInfoFn; + Form_pg_attribute attr = TupleDescAttr(tupdesc, keyno); + + opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO); + + opcinfo[keyno] = (BrinOpcInfo *) + DatumGetPointer(FunctionCall1(opcInfoFn, attr->atttypid)); + totalstored += opcinfo[keyno]->oi_nstored; + } + + /* + * Allocate our result struct and fill it in; its size covers the fixed + * part up to bd_info plus one BrinOpcInfo pointer per indexed column + */ + totalsize = offsetof(BrinDesc, bd_info) + + sizeof(BrinOpcInfo *) * tupdesc->natts; + + bdesc = palloc(totalsize); + bdesc->bd_context = cxt; + bdesc->bd_index = rel; + bdesc->bd_tupdesc = tupdesc; + bdesc->bd_disktdesc = NULL; /* generated lazily */ + bdesc->bd_totalstored = totalstored; + + for (keyno = 0; keyno < tupdesc->natts; keyno++) + bdesc->bd_info[keyno] = opcinfo[keyno]; + pfree(opcinfo); + + MemoryContextSwitchTo(oldcxt); + + return bdesc; +} + +void +brin_free_desc(BrinDesc *bdesc) +{ + /* make sure the tupdesc is still valid */ + Assert(bdesc->bd_tupdesc->tdrefcount >= 1); + /* no need for retail pfree: the descriptor was allocated in bd_context */ + MemoryContextDelete(bdesc->bd_context); +} + +/* + * Fetch index's statistical data into *stats + * + * Reads pagesPerRange and lastRevmapPage from the metapage while holding a + * share lock on it; revmapNumPages is reported as lastRevmapPage - 1. + */ +void +brinGetStats(Relation index, BrinStatsData *stats) +{ + Buffer metabuffer; + Page metapage; + BrinMetaPageData *metadata; + + metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO); + LockBuffer(metabuffer, BUFFER_LOCK_SHARE); + metapage = BufferGetPage(metabuffer); + metadata = (BrinMetaPageData *) PageGetContents(metapage); + + stats->pagesPerRange = metadata->pagesPerRange; + stats->revmapNumPages = metadata->lastRevmapPage - 1; + + UnlockReleaseBuffer(metabuffer); +} + +/* + * Initialize a BrinBuildState appropriate to create tuples on the given index.
+ */ +static BrinBuildState * +initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap, + BlockNumber pagesPerRange) +{ + BrinBuildState *state; + + state = palloc(sizeof(BrinBuildState)); + + state->bs_irel = idxRel; + state->bs_numtuples = 0; + state->bs_currentInsertBuf = InvalidBuffer; + state->bs_pagesPerRange = pagesPerRange; + state->bs_currRangeStart = 0; + state->bs_rmAccess = revmap; + state->bs_bdesc = brin_build_desc(idxRel); + state->bs_dtuple = brin_new_memtuple(state->bs_bdesc); + + brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc); + + return state; +} + +/* + * Release resources associated with a BrinBuildState. + * + * Frees the BrinDesc, the deformed tuple and the state struct itself. The + * revmap in bs_rmAccess is left alone here (brinbuild and brinsummarize + * terminate it themselves). + */ +static void +terminate_brin_buildstate(BrinBuildState *state) +{ + /* + * Release the last index buffer used. We might as well ensure that + * whatever free space remains in that page is available in FSM, too. + * The page's free space and block number are read before the buffer is + * released. + */ + if (!BufferIsInvalid(state->bs_currentInsertBuf)) + { + Page page; + Size freespace; + BlockNumber blk; + + page = BufferGetPage(state->bs_currentInsertBuf); + freespace = PageGetFreeSpace(page); + blk = BufferGetBlockNumber(state->bs_currentInsertBuf); + ReleaseBuffer(state->bs_currentInsertBuf); + RecordPageWithFreeSpace(state->bs_irel, blk, freespace); + FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1); + } + + brin_free_desc(state->bs_bdesc); + pfree(state->bs_dtuple); + pfree(state); +} + +/* + * On the given BRIN index, summarize the heap page range that corresponds + * to the heap block number given. + * + * This routine can run in parallel with insertions into the heap. To avoid + * missing those values from the summary tuple, we first insert a placeholder + * index tuple into the index, then execute the heap scan; transactions + * concurrent with the scan update the placeholder tuple. After the scan, we + * union the placeholder tuple with the one computed by this routine.
The + * update of the index value happens in a loop, so that if somebody updates + * the placeholder tuple after we read it, we detect the case and try again. + * This ensures that the concurrently inserted tuples are not lost. + * + * A further corner case is this routine being asked to summarize the partial + * range at the end of the table. heapNumBlocks is the (possibly outdated) + * table size; if we notice that the requested range lies beyond that size, + * we re-compute the table size after inserting the placeholder tuple, to + * avoid missing pages that were appended recently. + */ +static void +summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel, + BlockNumber heapBlk, BlockNumber heapNumBlks) +{ + Buffer phbuf; + BrinTuple *phtup; + Size phsz; + OffsetNumber offset; + BlockNumber scanNumBlks; + + /* + * Insert the placeholder tuple. phbuf starts out invalid and is passed + * by address to brin_doinsert; the buffer is released at the end of this + * function. + */ + phbuf = InvalidBuffer; + phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz); + offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange, + state->bs_rmAccess, &phbuf, + heapBlk, phtup, phsz); + + /* + * Compute range end. We hold ShareUpdateExclusive lock on table, so it + * cannot shrink concurrently (but it can grow). + */ + Assert(heapBlk % state->bs_pagesPerRange == 0); + if (heapBlk + state->bs_pagesPerRange > heapNumBlks) + { + /* + * If we're asked to scan what we believe to be the final range on the + * table (i.e. a range that might be partial) we need to recompute our + * idea of what the latest page is after inserting the placeholder + * tuple. Anyone that grows the table later will update the + * placeholder tuple, so it doesn't matter that we won't scan these + * pages ourselves. Careful: the table might have been extended + * beyond the current range, so clamp our result. + * + * Fortunately, this should occur infrequently.
+ */ + scanNumBlks = Min(RelationGetNumberOfBlocks(heapRel) - heapBlk, + state->bs_pagesPerRange); + } + else + { + /* Easy case: range is known to be complete, scan all of its pages */ + scanNumBlks = state->bs_pagesPerRange; + } + + /* + * Execute the partial heap scan covering the heap blocks in the specified + * page range, summarizing the heap tuples in it. This scan stops just + * short of brinbuildCallback creating the new index entry. + * + * Note that it is critical we use the "any visible" mode of + * table_index_build_range_scan here: otherwise, we would miss tuples + * inserted by transactions that are still in progress, among other corner + * cases. + */ + state->bs_currRangeStart = heapBlk; + table_index_build_range_scan(heapRel, state->bs_irel, indexInfo, false, true, false, + heapBlk, scanNumBlks, + brinbuildCallback, (void *) state, NULL); + + /* + * Now we update the values obtained by the scan with the placeholder + * tuple. We do this in a loop which only terminates if we're able to + * update the placeholder tuple successfully; if we are not, this means + * somebody else modified the placeholder tuple after we read it. + */ + for (;;) + { + BrinTuple *newtup; + Size newsize; + bool didupdate; + bool samepage; + + CHECK_FOR_INTERRUPTS(); + + /* + * Update the summary tuple and try to update. Both the placeholder + * copy and the newly formed tuple are freed right after the attempt, + * whether or not it succeeded. + */ + newtup = brin_form_tuple(state->bs_bdesc, + heapBlk, state->bs_dtuple, &newsize); + samepage = brin_can_do_samepage_update(phbuf, phsz, newsize); + didupdate = + brin_doupdate(state->bs_irel, state->bs_pagesPerRange, + state->bs_rmAccess, heapBlk, phbuf, offset, + phtup, phsz, newtup, newsize, samepage); + brin_free_tuple(phtup); + brin_free_tuple(newtup); + + /* If the update succeeded, we're done. */ + if (didupdate) + break; + + /* + * If the update didn't work, it might be because somebody updated the + * placeholder tuple concurrently. Extract the new version, union it + * with the values we have from the scan, and start over.
(There are + * other reasons for the update to fail, but it's simple to treat them + * the same.) + */ + phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf, + &offset, &phsz, BUFFER_LOCK_SHARE, + NULL); + /* + * the placeholder tuple must exist; it is copied below before the + * buffer lock is released + */ + if (phtup == NULL) + elog(ERROR, "missing placeholder tuple"); + phtup = brin_copy_tuple(phtup, phsz, NULL, NULL); + LockBuffer(phbuf, BUFFER_LOCK_UNLOCK); + + /* merge it into the tuple from the heap scan */ + union_tuples(state->bs_bdesc, state->bs_dtuple, phtup); + } + + ReleaseBuffer(phbuf); +} + +/* + * Summarize page ranges that are not already summarized. If pageRange is + * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the + * page range containing the given heap page number is scanned. + * If include_partial is true, then the partial range at the end of the table + * is summarized, otherwise not. + * + * For each new index tuple inserted, *numSummarized (if not NULL) is + * incremented; for each existing tuple, *numExisting (if not NULL) is + * incremented. + * + * The build state and the IndexInfo are created lazily, only once the first + * unsummarized range is found, and are released again before returning. + */ +static void +brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange, + bool include_partial, double *numSummarized, double *numExisting) +{ + BrinRevmap *revmap; + BrinBuildState *state = NULL; + IndexInfo *indexInfo = NULL; + BlockNumber heapNumBlocks; + BlockNumber pagesPerRange; + Buffer buf; + BlockNumber startBlk; + + revmap = brinRevmapInitialize(index, &pagesPerRange, NULL); + + /* + * determine range of pages to process: for a single range, round + * pageRange down to a multiple of pagesPerRange and cap the end of the + * scan at the end of that range + */ + heapNumBlocks = RelationGetNumberOfBlocks(heapRel); + if (pageRange == BRIN_ALL_BLOCKRANGES) + startBlk = 0; + else + { + startBlk = (pageRange / pagesPerRange) * pagesPerRange; + heapNumBlocks = Min(heapNumBlocks, startBlk + pagesPerRange); + } + if (startBlk > heapNumBlocks) + { + /* Nothing to do if start point is beyond end of table */ + brinRevmapTerminate(revmap); + return; + } + + /* + * Scan the revmap to find unsummarized items.
+ */ + buf = InvalidBuffer; + for (; startBlk < heapNumBlocks; startBlk += pagesPerRange) + { + BrinTuple *tup; + OffsetNumber off; + + /* + * Unless requested to summarize even a partial range, go away now if + * we think the next range is partial. Caller would pass true when it + * is typically run once bulk data loading is done + * (brin_summarize_new_values), and false when it is typically the + * result of arbitrarily-scheduled maintenance command (vacuuming). + */ + if (!include_partial && + (startBlk + pagesPerRange > heapNumBlocks)) + break; + + CHECK_FOR_INTERRUPTS(); + + tup = brinGetTupleForHeapBlock(revmap, startBlk, &buf, &off, NULL, + BUFFER_LOCK_SHARE, NULL); + if (tup == NULL) + { + /* no revmap entry for this heap range. Summarize it. */ + if (state == NULL) + { + /* first time through */ + Assert(!indexInfo); + state = initialize_brin_buildstate(index, revmap, + pagesPerRange); + indexInfo = BuildIndexInfo(index); + } + summarize_range(indexInfo, state, heapRel, startBlk, heapNumBlocks); + + /* and re-initialize state for the next range */ + brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc); + + if (numSummarized) + *numSummarized += 1.0; + } + else + { + if (numExisting) + *numExisting += 1.0; + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + } + + if (BufferIsValid(buf)) + ReleaseBuffer(buf); + + /* free resources */ + brinRevmapTerminate(revmap); + if (state) + { + terminate_brin_buildstate(state); + pfree(indexInfo); + } +} + +/* + * Given a deformed tuple in the build state, convert it into the on-disk + * format and insert it into the index, making the revmap point to it.
+ */ +static void +form_and_insert_tuple(BrinBuildState *state) +{ + BrinTuple *tup; + Size size; + + tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart, + state->bs_dtuple, &size); + brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess, + &state->bs_currentInsertBuf, state->bs_currRangeStart, + tup, size); + state->bs_numtuples++; + + pfree(tup); +} + +/* + * Given two deformed tuples, adjust the first one so that it's consistent + * with the summary values in both. + * + * Note that b is actually passed in on-disk (BrinTuple) form; it is deformed + * here, inside a private memory context that is deleted before returning. + */ +static void +union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b) +{ + int keyno; + BrinMemTuple *db; + MemoryContext cxt; + MemoryContext oldcxt; + + /* Use our own memory context to avoid retail pfree */ + cxt = AllocSetContextCreate(CurrentMemoryContext, + "brin union", + ALLOCSET_DEFAULT_SIZES); + oldcxt = MemoryContextSwitchTo(cxt); + db = brin_deform_tuple(bdesc, b, NULL); + MemoryContextSwitchTo(oldcxt); + + for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) + { + FmgrInfo *unionFn; + BrinValues *col_a = &a->bt_columns[keyno]; + BrinValues *col_b = &db->bt_columns[keyno]; + BrinOpcInfo *opcinfo = bdesc->bd_info[keyno]; + + if (opcinfo->oi_regular_nulls) + { + /* Adjust "hasnulls". */ + if (!col_a->bv_hasnulls && col_b->bv_hasnulls) + col_a->bv_hasnulls = true; + + /* If there are no values in B, there's nothing left to do. */ + if (col_b->bv_allnulls) + continue; + + /* + * Adjust "allnulls". If A doesn't have values, just copy the + * values from B into A, and we're done. We cannot run the + * operators in this case, because values in A might contain + * garbage. Note we already established that B contains values.
+ */ + if (col_a->bv_allnulls) + { + int i; + + col_a->bv_allnulls = false; + + for (i = 0; i < opcinfo->oi_nstored; i++) + col_a->bv_values[i] = + datumCopy(col_b->bv_values[i], + opcinfo->oi_typcache[i]->typbyval, + opcinfo->oi_typcache[i]->typlen); + + continue; + } + } + + unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1, + BRIN_PROCNUM_UNION); + FunctionCall3Coll(unionFn, + bdesc->bd_index->rd_indcollation[keyno], + PointerGetDatum(bdesc), + PointerGetDatum(col_a), + PointerGetDatum(col_b)); + } + + MemoryContextDelete(cxt); +} + +/* + * brin_vacuum_scan + * Do a complete scan of the index during VACUUM. + * + * This routine scans the complete index looking for uncatalogued index pages, + * i.e. those that might have been lost due to a crash after index extension + * and such. + */ +static void +brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy) +{ + BlockNumber nblocks; + BlockNumber blkno; + + /* + * Scan the index in physical order, and clean up any possible mess in + * each page. Pages are read using the caller-supplied buffer access + * strategy. + */ + nblocks = RelationGetNumberOfBlocks(idxrel); + for (blkno = 0; blkno < nblocks; blkno++) + { + Buffer buf; + + CHECK_FOR_INTERRUPTS(); + + buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno, + RBM_NORMAL, strategy); + + brin_page_cleanup(idxrel, buf); + + ReleaseBuffer(buf); + } + + /* + * Update all upper pages in the index's FSM, as well. This ensures not + * only that we propagate leaf-page FSM updates made by brin_page_cleanup, + * but also that any pre-existing damage or out-of-dateness is repaired. + */ + FreeSpaceMapVacuum(idxrel); +} + +static bool +add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup, + Datum *values, bool *nulls) +{ + int keyno; + bool modified = false; + + /* + * Compare the key values of the new tuple to the stored index values; our + * deformed tuple will get updated if the new tuple doesn't fit the + * original range (note this means we can't break out of the loop early).
+ * Make a note of whether this happens, so that we know to insert the + * modified tuple later. + * + * For columns whose opclass sets oi_regular_nulls, a NULL input is only + * recorded in bv_hasnulls and the opclass add-value procedure is not + * called for it. + */ + for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) + { + Datum result; + BrinValues *bval; + FmgrInfo *addValue; + + bval = &dtup->bt_columns[keyno]; + + if (bdesc->bd_info[keyno]->oi_regular_nulls && nulls[keyno]) + { + /* + * If the new value is null, we record that we saw it if it's the + * first one; otherwise, there's nothing to do. + */ + if (!bval->bv_hasnulls) + { + bval->bv_hasnulls = true; + modified = true; + } + + continue; + } + + addValue = index_getprocinfo(idxRel, keyno + 1, + BRIN_PROCNUM_ADDVALUE); + result = FunctionCall4Coll(addValue, + idxRel->rd_indcollation[keyno], + PointerGetDatum(bdesc), + PointerGetDatum(bval), + values[keyno], + nulls[keyno]); + /* if that returned true, we need to insert the updated tuple */ + modified |= DatumGetBool(result); + } + + return modified; +} + +static bool +check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys) +{ + int keyno; + + /* + * First check if there are any IS [NOT] NULL scan keys, and if we're + * violating them. Returns false as soon as one key rules the range out, + * and true if no key does. + */ + for (keyno = 0; keyno < nnullkeys; keyno++) + { + ScanKey key = nullkeys[keyno]; + + Assert(key->sk_attno == bval->bv_attno); + + /* Handle only IS NULL/IS NOT NULL tests */ + if (!(key->sk_flags & SK_ISNULL)) + continue; + + if (key->sk_flags & SK_SEARCHNULL) + { + /* IS NULL scan key, but range has no NULLs */ + if (!bval->bv_allnulls && !bval->bv_hasnulls) + return false; + } + else if (key->sk_flags & SK_SEARCHNOTNULL) + { + /* + * For IS NOT NULL, we can only skip ranges that are known to have + * only nulls. + */ + if (bval->bv_allnulls) + return false; + } + else + { + /* + * Neither IS NULL nor IS NOT NULL was used; assume all indexable + * operators are strict and thus return false with NULL value in + * the scan key.
+ */ + return false; + } + } + + return true; +} diff --git a/src/backend/access/brin/brin_bloom.c b/src/backend/access/brin/brin_bloom.c new file mode 100644 index 0000000..2c8a20a --- /dev/null +++ b/src/backend/access/brin/brin_bloom.c @@ -0,0 +1,809 @@ +/* + * brin_bloom.c + * Implementation of Bloom opclass for BRIN + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * A BRIN opclass summarizing page range into a bloom filter. + * + * Bloom filters allow efficient testing whether a given page range contains + * a particular value. Therefore, if we summarize each page range into a small + * bloom filter, we can easily (and cheaply) test whether it contains values + * we get later. + * + * The index only supports equality operators, similarly to hash indexes. + * Bloom indexes are however much smaller, and support only bitmap scans. + * + * Note: Don't confuse this with bloom indexes, implemented in a contrib + * module. That extension implements an entirely new AM, building a bloom + * filter on multiple columns in a single row. This opclass works with an + * existing AM (BRIN) and builds bloom filter on a column. + * + * + * values vs. hashes + * ----------------- + * + * The original column values are not used directly, but are first hashed + * using the regular type-specific hash function, producing a uint32 hash. + * And this hash value is then added to the summary - i.e. it's hashed + * again and added to the bloom filter. + * + * This allows the code to treat all data types (byval/byref/...) the same + * way, with only minimal space requirements, because we're working with + * hashes and not the original values. Everything is uint32. + * + * Of course, this assumes the built-in hash function is reasonably good, + * without too many collisions etc. But that does seem to be the case, at + * least based on past experience.
After all, the same hash functions are + * used for hash indexes, hash partitioning and so on. + * + * + * hashing scheme + * -------------- + * + * Bloom filters require a number of independent hash functions. There are + * different schemes for constructing them - for example we might use + * hash_uint32_extended with random seeds, but that seems fairly expensive. + * We use a scheme requiring only two functions described in this paper: + * + * Less Hashing, Same Performance: Building a Better Bloom Filter + * Adam Kirsch, Michael Mitzenmacher, Harvard School of Engineering and + * Applied Sciences, Cambridge, Massachusetts [DOI 10.1002/rsa.20208] + * + * The two hash functions h1 and h2 are calculated using hard-coded seeds, + * and then combined using (h1 + i * h2) to generate the hash functions. + * + * + * sizing the bloom filter + * ----------------------- + * + * Size of a bloom filter depends on the number of distinct values we will + * store in it, and the desired false positive rate. The higher the number + * of distinct values and/or the lower the false positive rate, the larger + * the bloom filter. On the other hand, we want to keep the index as small + * as possible - that's one of the basic advantages of BRIN indexes. + * + * Although the number of distinct elements (in a page range) depends on + * the data, we can consider it fixed. This simplifies the trade-off to + * just false positive rate vs. size. + * + * At the page range level, the false positive rate is the probability that + * the bloom filter matches a random value. For the whole index (with sufficiently + * many page ranges) it represents the fraction of the index ranges (and + * thus fraction of the table to be scanned) matching the random value. + * + * Furthermore, the size of the bloom filter is subject to implementation + * limits - it has to fit onto a single index page (8kB by default).
As + * the bitmap is inherently random (when "full" about half the bits are set + * to 1, randomly), compression can't help very much. + * + * To reduce the size of a filter (to fit on a page), we have to either + * accept a higher false positive rate (undesirable), or reduce the number + * of distinct items to be stored in the filter. We can't alter the input + * data, of course, but we may make the BRIN page ranges smaller - instead + * of the default 128 pages (1MB) we may build the index with 16-page ranges, + * or something like that. This should reduce the number of distinct values + * in the page range, making the filter smaller (with a fixed false positive + * rate). Even for random data sets this should help, as the number of rows + * per heap page is limited (to ~290 with very narrow tables, likely ~20 + * in practice). + * + * Of course, good sizing decisions depend on having the necessary data, + * i.e. number of distinct values in a page range (of a given size) and + * table size (to estimate cost change due to change in false positive + * rate due to having larger index vs. scanning larger indexes). We may + * not have that data - for example when building an index on an empty table + * it's not really possible. And for some data we only have estimates for + * the whole table and we can only estimate per-range values (ndistinct). + * + * Another challenge is that while the bloom filter is per-column, it's + * the whole index tuple that has to fit into a page. And for multi-column + * indexes that may include pieces we have no control over (not necessarily + * bloom filters, the other columns may use other BRIN opclasses). So it's + * not entirely clear how to distribute the space between those columns. + * + * The current logic, implemented in brin_bloom_get_ndistinct, attempts to + * make some basic sizing decisions, based on the size of BRIN ranges, and + * the maximum number of rows per range.
+ * + * + * IDENTIFICATION + * src/backend/access/brin/brin_bloom.c + */ +#include "postgres.h" + +#include "access/genam.h" +#include "access/brin.h" +#include "access/brin_internal.h" +#include "access/brin_page.h" +#include "access/brin_tuple.h" +#include "access/hash.h" +#include "access/htup_details.h" +#include "access/reloptions.h" +#include "access/stratnum.h" +#include "catalog/pg_type.h" +#include "catalog/pg_amop.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" +#include "utils/syscache.h" + +#include <math.h> + +#define BloomEqualStrategyNumber 1 + +/* + * Additional SQL level support functions. We only have one, which is + * used to calculate hash of the input value. + * + * Procedure numbers must not use values reserved for BRIN itself; see + * brin_internal.h. + */ +#define BLOOM_MAX_PROCNUMS 1 /* maximum support procs we need */ +#define PROCNUM_HASH 11 /* required */ + +/* + * Subtract this from procnum to obtain index in BloomOpaque arrays + * (Must be equal to minimum of private procnums). + */ +#define PROCNUM_BASE 11 + +/* + * Storage type for BRIN's reloptions. + */ +typedef struct BloomOptions +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + double nDistinctPerRange; /* number of distinct values per range */ + double falsePositiveRate; /* false positive for bloom filter */ +} BloomOptions; + +/* + * The current min value (16) is somewhat arbitrary, but it's based + * on the fact that the filter header is ~20B alone, which is about + * the same as the filter bitmap for 16 distinct items with 1% false + * positive rate. So by allowing lower values we'd not gain much. In + * any case, the min should not be larger than MaxHeapTuplesPerPage + * (~290), which is the theoretical maximum for single-page ranges. + */ +#define BLOOM_MIN_NDISTINCT_PER_RANGE 16 + +/* + * Used to determine number of distinct items, based on the number of rows + * in a page range. 
The 10% is somewhat similar to what estimate_num_groups + * does, so we use the same factor here. + */ +#define BLOOM_DEFAULT_NDISTINCT_PER_RANGE -0.1 /* 10% of values */ + +/* + * Allowed range and default value for the false positive range. The exact + * values are somewhat arbitrary, but were chosen considering the various + * parameters (size of filter vs. page size, etc.). + * + * The lower the false-positive rate, the more accurate the filter is, but + * it also gets larger - at some point this eliminates the main advantage + * of BRIN indexes, which is the tiny size. At 0.01% the index is about + * 10% of the table (assuming 290 distinct values per 8kB page). + * + * On the other hand, as the false-positive rate increases, larger part of + * the table has to be scanned due to mismatches - at 25% we're probably + * close to sequential scan being cheaper. + */ +#define BLOOM_MIN_FALSE_POSITIVE_RATE 0.0001 /* 0.01% fp rate */ +#define BLOOM_MAX_FALSE_POSITIVE_RATE 0.25 /* 25% fp rate */ +#define BLOOM_DEFAULT_FALSE_POSITIVE_RATE 0.01 /* 1% fp rate */ + +#define BloomGetNDistinctPerRange(opts) \ + ((opts) && (((BloomOptions *) (opts))->nDistinctPerRange != 0) ? \ + (((BloomOptions *) (opts))->nDistinctPerRange) : \ + BLOOM_DEFAULT_NDISTINCT_PER_RANGE) + +#define BloomGetFalsePositiveRate(opts) \ + ((opts) && (((BloomOptions *) (opts))->falsePositiveRate != 0.0) ? \ + (((BloomOptions *) (opts))->falsePositiveRate) : \ + BLOOM_DEFAULT_FALSE_POSITIVE_RATE) + +/* + * And estimate of the largest bloom we can fit onto a page. This is not + * a perfect guarantee, for a couple of reasons. For example, the row may + * be larger because the index has multiple columns. 
+ */ +#define BloomMaxFilterSize \ + MAXALIGN_DOWN(BLCKSZ - \ + (MAXALIGN(SizeOfPageHeaderData + \ + sizeof(ItemIdData)) + \ + MAXALIGN(sizeof(BrinSpecialSpace)) + \ + SizeOfBrinTuple)) + +/* + * Seeds used to calculate two hash functions h1 and h2, which are then used + * to generate k hashes using the (h1 + i * h2) scheme. + */ +#define BLOOM_SEED_1 0x71d924af +#define BLOOM_SEED_2 0xba48b314 + +/* + * Bloom Filter + * + * Represents a bloom filter, built on hashes of the indexed values. That is, + * we compute a uint32 hash of the value, and then store this hash into the + * bloom filter (and compute additional hashes on it). + * + * XXX We could implement "sparse" bloom filters, keeping only the bytes that + * are not entirely 0. But while indexes don't support TOAST, the varlena can + * still be compressed. So this seems unnecessary, because the compression + * should do the same job. + * + * XXX We can also watch the number of bits set in the bloom filter, and then + * stop using it (and not store the bitmap, to save space) when the false + * positive rate gets too high. But even if the false positive rate exceeds the + * desired value, it still can eliminate some page ranges. + */ +typedef struct BloomFilter +{ + /* varlena header (do not touch directly!) */ + int32 vl_len_; + + /* space for various flags (unused for now) */ + uint16 flags; + + /* fields for the HASHED phase */ + uint8 nhashes; /* number of hash functions */ + uint32 nbits; /* number of bits in the bitmap (size) */ + uint32 nbits_set; /* number of bits set to 1 */ + + /* data of the bloom filter */ + char data[FLEXIBLE_ARRAY_MEMBER]; + +} BloomFilter; + + +/* + * bloom_init + * Initialize the Bloom Filter, allocate all the memory. + * + * The filter is initialized with optimal size for ndistinct expected values + * and the requested false positive rate. The filter is stored as varlena. 
+ */ +static BloomFilter * +bloom_init(int ndistinct, double false_positive_rate) +{ + Size len; + BloomFilter *filter; + + int nbits; /* size of filter / number of bits */ + int nbytes; /* size of filter / number of bytes */ + + double k; /* number of hash functions */ + + Assert(ndistinct > 0); + Assert((false_positive_rate >= BLOOM_MIN_FALSE_POSITIVE_RATE) && + (false_positive_rate < BLOOM_MAX_FALSE_POSITIVE_RATE)); + + /* sizing bloom filter: -(n * ln(p)) / (ln(2))^2 */ + nbits = ceil(-(ndistinct * log(false_positive_rate)) / pow(log(2.0), 2)); + + /* round m to whole bytes */ + nbytes = ((nbits + 7) / 8); + nbits = nbytes * 8; + + /* + * Reject filters that are obviously too large to store on a page. + * + * Initially the bloom filter is just zeroes and so very compressible, but + * as we add values it gets more and more random, and so less and less + * compressible. So initially everything fits on the page, but we might + * get surprising failures later - we want to prevent that, so we reject + * bloom filter that are obviously too large. + * + * XXX It's not uncommon to oversize the bloom filter a bit, to defend + * against unexpected data anomalies (parts of table with more distinct + * values per range etc.). But we still need to make sure even the + * oversized filter fits on page, if such need arises. + * + * XXX This check is not perfect, because the index may have multiple + * filters that are small individually, but too large when combined. + */ + if (nbytes > BloomMaxFilterSize) + elog(ERROR, "the bloom filter is too large (%d > %zu)", nbytes, + BloomMaxFilterSize); + + /* + * round(log(2.0) * m / ndistinct), but assume round() may not be + * available on Windows + */ + k = log(2.0) * nbits / ndistinct; + k = (k - floor(k) >= 0.5) ? ceil(k) : floor(k); + + /* + * We allocate the whole filter. Most of it is going to be 0 bits, so the + * varlena is easy to compress. 
+ */ + len = offsetof(BloomFilter, data) + nbytes; + + filter = (BloomFilter *) palloc0(len); + + filter->flags = 0; + filter->nhashes = (int) k; + filter->nbits = nbits; + + SET_VARSIZE(filter, len); + + return filter; +} + + +/* + * bloom_add_value + * Add value to the bloom filter. + */ +static BloomFilter * +bloom_add_value(BloomFilter *filter, uint32 value, bool *updated) +{ + int i; + uint64 h1, + h2; + + /* compute the hashes, used for the bloom filter */ + h1 = hash_bytes_uint32_extended(value, BLOOM_SEED_1) % filter->nbits; + h2 = hash_bytes_uint32_extended(value, BLOOM_SEED_2) % filter->nbits; + + /* compute the requested number of hashes */ + for (i = 0; i < filter->nhashes; i++) + { + /* h1 + h2 + f(i) */ + uint32 h = (h1 + i * h2) % filter->nbits; + uint32 byte = (h / 8); + uint32 bit = (h % 8); + + /* if the bit is not set, set it and remember we did that */ + if (!(filter->data[byte] & (0x01 << bit))) + { + filter->data[byte] |= (0x01 << bit); + filter->nbits_set++; + if (updated) + *updated = true; + } + } + + return filter; +} + + +/* + * bloom_contains_value + * Check if the bloom filter contains a particular value. 
+ */ +static bool +bloom_contains_value(BloomFilter *filter, uint32 value) +{ + int i; + uint64 h1, + h2; + + /* calculate the two hashes */ + h1 = hash_bytes_uint32_extended(value, BLOOM_SEED_1) % filter->nbits; + h2 = hash_bytes_uint32_extended(value, BLOOM_SEED_2) % filter->nbits; + + /* compute the requested number of hashes */ + for (i = 0; i < filter->nhashes; i++) + { + /* i-th hash is (h1 + i * h2) modulo the filter size */ + uint32 h = (h1 + i * h2) % filter->nbits; + uint32 byte = (h / 8); + uint32 bit = (h % 8); + + /* if the bit is not set, the value is not there */ + if (!(filter->data[byte] & (0x01 << bit))) + return false; + } + + /* all hashes found in bloom filter */ + return true; +} + +typedef struct BloomOpaque +{ + /* + * XXX At this point we only need a single proc (to compute the hash), but + * let's keep the array just like inclusion and minmax opclasses, for + * consistency. We may need additional procs in the future. + */ + FmgrInfo extra_procinfos[BLOOM_MAX_PROCNUMS]; + bool extra_proc_missing[BLOOM_MAX_PROCNUMS]; +} BloomOpaque; + +static FmgrInfo *bloom_get_procinfo(BrinDesc *bdesc, uint16 attno, + uint16 procnum); + + +Datum +brin_bloom_opcinfo(PG_FUNCTION_ARGS) +{ + BrinOpcInfo *result; + + /* + * opaque->extra_procinfos is initialized lazily; here it is set to + * all-uninitialized by palloc0 which sets fn_oid to InvalidOid. + * + * The BloomOpaque struct is placed in the same allocation, at the first + * MAXALIGN'd offset after the BrinOpcInfo itself. + * + * bloom indexes only store the filter as a single BYTEA column + */ + + result = palloc0(MAXALIGN(SizeofBrinOpcInfo(1)) + + sizeof(BloomOpaque)); + result->oi_nstored = 1; + result->oi_regular_nulls = true; + result->oi_opaque = (BloomOpaque *) + MAXALIGN((char *) result + SizeofBrinOpcInfo(1)); + result->oi_typcache[0] = lookup_type_cache(PG_BRIN_BLOOM_SUMMARYOID, 0); + + PG_RETURN_POINTER(result); +} + +/* + * brin_bloom_get_ndistinct + * Determine the ndistinct value used to size bloom filter. + * + * Adjust the ndistinct value based on the pagesPerRange value.
First, + * if it's negative, it's assumed to be relative to maximum number of + * tuples in the range (assuming each page gets MaxHeapTuplesPerPage + * tuples, which is likely a significant over-estimate). We also clamp + * the value, not to over-size the bloom filter unnecessarily. + * + * XXX We can only do this when the pagesPerRange value was supplied. + * If it wasn't, it has to be a read-only access to the index, in which + * case we don't really care. But perhaps we should fall-back to the + * default pagesPerRange value? + * + * XXX We might also fetch info about ndistinct estimate for the column, + * and compute the expected number of distinct values in a range. But + * that may be tricky due to data being sorted in various ways, so it + * seems better to rely on the upper estimate. + * + * XXX We might also calculate a better estimate of rows per BRIN range, + * instead of using MaxHeapTuplesPerPage (which probably produces values + * much higher than reality). + */ +static int +brin_bloom_get_ndistinct(BrinDesc *bdesc, BloomOptions *opts) +{ + double ndistinct; + double maxtuples; + BlockNumber pagesPerRange; + + pagesPerRange = BrinGetPagesPerRange(bdesc->bd_index); + ndistinct = BloomGetNDistinctPerRange(opts); + + Assert(BlockNumberIsValid(pagesPerRange)); + + maxtuples = MaxHeapTuplesPerPage * pagesPerRange; + + /* + * Similarly to n_distinct, negative values are relative - in this case to + * maximum number of tuples in the page range (maxtuples). + */ + if (ndistinct < 0) + ndistinct = (-ndistinct) * maxtuples; + + /* + * Positive values are to be used directly, but we still apply a couple of + * safeties to avoid using unreasonably small bloom filters. + */ + ndistinct = Max(ndistinct, BLOOM_MIN_NDISTINCT_PER_RANGE); + + /* + * And don't use more than the maximum possible number of tuples, in the + * range, which would be entirely wasteful. 
+ */ + ndistinct = Min(ndistinct, maxtuples); + + return (int) ndistinct; +} + +/* + * Examine the given index tuple (which contains partial status of a certain + * page range) by comparing it to the given value that comes from another heap + * tuple. If the new value is outside the bloom filter specified by the + * existing tuple values, update the index tuple and return true. Otherwise, + * return false and do not modify in this case. + */ +Datum +brin_bloom_add_value(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); + Datum newval = PG_GETARG_DATUM(2); + /* argument 3 is the isnull flag; fetch it as a bool, it is only asserted */ + bool isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_BOOL(3); + BloomOptions *opts = (BloomOptions *) PG_GET_OPCLASS_OPTIONS(); + Oid colloid = PG_GET_COLLATION(); + FmgrInfo *hashFn; + uint32 hashValue; + bool updated = false; + AttrNumber attno; + BloomFilter *filter; + + Assert(!isnull); + + attno = column->bv_attno; + + /* + * If this is the first non-null value, we need to initialize the bloom + * filter. Otherwise just extract the existing bloom filter from + * BrinValues. + */ + if (column->bv_allnulls) + { + filter = bloom_init(brin_bloom_get_ndistinct(bdesc, opts), + BloomGetFalsePositiveRate(opts)); + column->bv_values[0] = PointerGetDatum(filter); + column->bv_allnulls = false; + updated = true; + } + else + filter = (BloomFilter *) PG_DETOAST_DATUM(column->bv_values[0]); + + /* + * Compute the hash of the new value, using the supplied hash function, + * and then add the hash value to the bloom filter.
+ */ + hashFn = bloom_get_procinfo(bdesc, attno, PROCNUM_HASH); + + hashValue = DatumGetUInt32(FunctionCall1Coll(hashFn, colloid, newval)); + + filter = bloom_add_value(filter, hashValue, &updated); + + column->bv_values[0] = PointerGetDatum(filter); + + PG_RETURN_BOOL(updated); +} + +/* + * Given an index tuple corresponding to a certain page range and a scan key, + * return whether the scan key is consistent with the index tuple's bloom + * filter. Return true if so, false otherwise. + * + * All scan keys must match (they are ANDed); the loop stops at the first key + * the filter rules out. + */ +Datum +brin_bloom_consistent(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); + ScanKey *keys = (ScanKey *) PG_GETARG_POINTER(2); + int nkeys = PG_GETARG_INT32(3); + Oid colloid = PG_GET_COLLATION(); + AttrNumber attno; + Datum value; + bool matches; + FmgrInfo *finfo; + uint32 hashValue; + BloomFilter *filter; + int keyno; + + filter = (BloomFilter *) PG_DETOAST_DATUM(column->bv_values[0]); + + Assert(filter); + + matches = true; + + for (keyno = 0; keyno < nkeys; keyno++) + { + ScanKey key = keys[keyno]; + + /* NULL keys are handled and filtered-out in bringetbitmap */ + Assert(!(key->sk_flags & SK_ISNULL)); + + attno = key->sk_attno; + value = key->sk_argument; + + switch (key->sk_strategy) + { + case BloomEqualStrategyNumber: + + /* + * In the equality case (WHERE col = someval), hash the scan + * key value with the opclass hash function and keep the + * current page range only if the bloom filter may contain + * that hash.
+ */ + finfo = bloom_get_procinfo(bdesc, attno, PROCNUM_HASH); + + hashValue = DatumGetUInt32(FunctionCall1Coll(finfo, colloid, value)); + matches &= bloom_contains_value(filter, hashValue); + + break; + default: + /* shouldn't happen */ + elog(ERROR, "invalid strategy number %d", key->sk_strategy); + matches = false; + break; + } + + if (!matches) + break; + } + + PG_RETURN_BOOL(matches); +} + +/* + * Given two BrinValues, update the first of them as a union of the summary + * values contained in both. The second one is untouched. + * + * XXX We assume the bloom filters have the same parameters for now. In the + * future we should have 'can union' function, to decide if we can combine + * two particular bloom filters. + */ +Datum +brin_bloom_union(PG_FUNCTION_ARGS) +{ + int i; + int nbytes; + BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1); + BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2); + BloomFilter *filter_a; + BloomFilter *filter_b; + + Assert(col_a->bv_attno == col_b->bv_attno); + Assert(!col_a->bv_allnulls && !col_b->bv_allnulls); + + filter_a = (BloomFilter *) PG_DETOAST_DATUM(col_a->bv_values[0]); + filter_b = (BloomFilter *) PG_DETOAST_DATUM(col_b->bv_values[0]); + + /* make sure the filters use the same parameters */ + Assert(filter_a && filter_b); + Assert(filter_a->nbits == filter_b->nbits); + Assert(filter_a->nhashes == filter_b->nhashes); + Assert((filter_a->nbits > 0) && (filter_a->nbits % 8 == 0)); + + nbytes = (filter_a->nbits) / 8; + + /* simply OR the bitmaps */ + for (i = 0; i < nbytes; i++) + filter_a->data[i] |= filter_b->data[i]; + + /* + * Recount the set bits: the OR above may have set bits that were only + * present in filter_b, which would otherwise leave nbits_set stale. + */ + filter_a->nbits_set = 0; + for (i = 0; i < nbytes; i++) + { + unsigned char b = (unsigned char) filter_a->data[i]; + + while (b) + { + b &= (unsigned char) (b - 1); + filter_a->nbits_set++; + } + } + + PG_RETURN_VOID(); +} + +/* + * Cache and return bloom opclass support procedure + * + * Return the procedure corresponding to the given function support number + * or null if it does not exist.
+ */ +static FmgrInfo * +bloom_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum) +{ + BloomOpaque *opaque; + uint16 basenum = procnum - PROCNUM_BASE; + + /* + * We cache these in the opaque struct, to avoid repetitive syscache + * lookups. + */ + opaque = (BloomOpaque *) bdesc->bd_info[attno - 1]->oi_opaque; + + /* + * If we already searched for this proc and didn't find it, don't bother + * searching again. + */ + if (opaque->extra_proc_missing[basenum]) + return NULL; + + if (opaque->extra_procinfos[basenum].fn_oid == InvalidOid) + { + if (RegProcedureIsValid(index_getprocid(bdesc->bd_index, attno, + procnum))) + { + fmgr_info_copy(&opaque->extra_procinfos[basenum], + index_getprocinfo(bdesc->bd_index, attno, procnum), + bdesc->bd_context); + } + else + { + opaque->extra_proc_missing[basenum] = true; + return NULL; + } + } + + return &opaque->extra_procinfos[basenum]; +} + +Datum +brin_bloom_options(PG_FUNCTION_ARGS) +{ + local_relopts *relopts = (local_relopts *) PG_GETARG_POINTER(0); + + init_local_reloptions(relopts, sizeof(BloomOptions)); + + add_local_real_reloption(relopts, "n_distinct_per_range", + "number of distinct items expected in a BRIN page range", + BLOOM_DEFAULT_NDISTINCT_PER_RANGE, + -1.0, INT_MAX, offsetof(BloomOptions, nDistinctPerRange)); + + add_local_real_reloption(relopts, "false_positive_rate", + "desired false-positive rate for the bloom filters", + BLOOM_DEFAULT_FALSE_POSITIVE_RATE, + BLOOM_MIN_FALSE_POSITIVE_RATE, + BLOOM_MAX_FALSE_POSITIVE_RATE, + offsetof(BloomOptions, falsePositiveRate)); + + PG_RETURN_VOID(); +} + +/* + * brin_bloom_summary_in + * - input routine for type brin_bloom_summary. + * + * brin_bloom_summary is only used internally to represent summaries + * in BRIN bloom indexes, so it has no operations of its own, and we + * disallow input too. 
+ */ +Datum +brin_bloom_summary_in(PG_FUNCTION_ARGS) +{ + /* + * brin_bloom_summary stores the data in binary form and parsing text + * input is not needed, so disallow this. + */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_brin_bloom_summary"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + + +/* + * brin_bloom_summary_out + * - output routine for type brin_bloom_summary. + * + * BRIN bloom summaries are serialized into a bytea value, but we want + * to output something nicer humans can understand. + */ +Datum +brin_bloom_summary_out(PG_FUNCTION_ARGS) +{ + BloomFilter *filter; + StringInfoData str; + + /* detoast the data to get value with a full 4B header */ + filter = (BloomFilter *) PG_DETOAST_DATUM(PG_GETARG_BYTEA_PP(0)); + + initStringInfo(&str); + appendStringInfoChar(&str, '{'); + + appendStringInfo(&str, "mode: hashed nhashes: %u nbits: %u nbits_set: %u", + filter->nhashes, filter->nbits, filter->nbits_set); + + appendStringInfoChar(&str, '}'); + + PG_RETURN_CSTRING(str.data); +} + +/* + * brin_bloom_summary_recv + * - binary input routine for type brin_bloom_summary. + */ +Datum +brin_bloom_summary_recv(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_brin_bloom_summary"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ +} + +/* + * brin_bloom_summary_send + * - binary output routine for type brin_bloom_summary. + * + * BRIN bloom summaries are serialized in a bytea value (although the + * type is named differently), so let's just send that. 
+ */ +Datum +brin_bloom_summary_send(PG_FUNCTION_ARGS) +{ + return byteasend(fcinfo); +} diff --git a/src/backend/access/brin/brin_inclusion.c b/src/backend/access/brin/brin_inclusion.c new file mode 100644 index 0000000..0b384c0 --- /dev/null +++ b/src/backend/access/brin/brin_inclusion.c @@ -0,0 +1,657 @@ +/* + * brin_inclusion.c + * Implementation of inclusion opclasses for BRIN + * + * This module provides framework BRIN support functions for the "inclusion" + * operator classes. A few SQL-level support functions are also required for + * each opclass. + * + * The "inclusion" BRIN strategy is useful for types that support R-Tree + * operations. This implementation is a straight mapping of those operations + * to the block-range nature of BRIN, with two exceptions: (a) we explicitly + * support "empty" elements: at least with range types, we need to consider + * emptiness separately from regular R-Tree strategies; and (b) we need to + * consider "unmergeable" elements, that is, a set of elements for whose union + * no representation exists. The only case where that happens as of this + * writing is the INET type, where IPv6 values cannot be merged with IPv4 + * values. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin_inclusion.c + */ +#include "postgres.h" + +#include "access/brin_internal.h" +#include "access/brin_tuple.h" +#include "access/genam.h" +#include "access/skey.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" +#include "utils/syscache.h" + + +/* + * Additional SQL level support functions + * + * Procedure numbers must not use values reserved for BRIN itself; see + * brin_internal.h. 
+ */ +#define INCLUSION_MAX_PROCNUMS 4 /* maximum support procs we need */ +#define PROCNUM_MERGE 11 /* required */ +#define PROCNUM_MERGEABLE 12 /* optional */ +#define PROCNUM_CONTAINS 13 /* optional */ +#define PROCNUM_EMPTY 14 /* optional */ + + +/* + * Subtract this from procnum to obtain index in InclusionOpaque arrays + * (Must be equal to minimum of private procnums). + */ +#define PROCNUM_BASE 11 + +/*- + * The values stored in the bv_values arrays correspond to: + * + * INCLUSION_UNION + * the union of the values in the block range + * INCLUSION_UNMERGEABLE + * whether the values in the block range cannot be merged + * (e.g. an IPv6 address amidst IPv4 addresses) + * INCLUSION_CONTAINS_EMPTY + * whether an empty value is present in any tuple + * in the block range + */ +#define INCLUSION_UNION 0 +#define INCLUSION_UNMERGEABLE 1 +#define INCLUSION_CONTAINS_EMPTY 2 + + +typedef struct InclusionOpaque +{ + FmgrInfo extra_procinfos[INCLUSION_MAX_PROCNUMS]; + bool extra_proc_missing[INCLUSION_MAX_PROCNUMS]; + Oid cached_subtype; + FmgrInfo strategy_procinfos[RTMaxStrategyNumber]; +} InclusionOpaque; + +static FmgrInfo *inclusion_get_procinfo(BrinDesc *bdesc, uint16 attno, + uint16 procnum); +static FmgrInfo *inclusion_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno, + Oid subtype, uint16 strategynum); + + +/* + * BRIN inclusion OpcInfo function + */ +Datum +brin_inclusion_opcinfo(PG_FUNCTION_ARGS) +{ + Oid typoid = PG_GETARG_OID(0); + BrinOpcInfo *result; + TypeCacheEntry *bool_typcache = lookup_type_cache(BOOLOID, 0); + + /* + * All members of opaque are initialized lazily; both procinfo arrays + * start out as non-initialized by having fn_oid be InvalidOid, and + * "missing" to false, by zeroing here. strategy_procinfos elements can + * be invalidated when cached_subtype changes by zeroing fn_oid. 
+ * extra_procinfo entries are never invalidated, but if a lookup fails + * (which is expected), extra_proc_missing is set to true, indicating not + * to look it up again. + */ + result = palloc0(MAXALIGN(SizeofBrinOpcInfo(3)) + sizeof(InclusionOpaque)); + result->oi_nstored = 3; + result->oi_regular_nulls = true; + result->oi_opaque = (InclusionOpaque *) + MAXALIGN((char *) result + SizeofBrinOpcInfo(3)); + + /* the union */ + result->oi_typcache[INCLUSION_UNION] = + lookup_type_cache(typoid, 0); + + /* includes elements that are not mergeable */ + result->oi_typcache[INCLUSION_UNMERGEABLE] = bool_typcache; + + /* includes the empty element */ + result->oi_typcache[INCLUSION_CONTAINS_EMPTY] = bool_typcache; + + PG_RETURN_POINTER(result); +} + +/* + * BRIN inclusion add value function + * + * Examine the given index tuple (which contains partial status of a certain + * page range) by comparing it to the given value that comes from another heap + * tuple. If the new value is outside the union specified by the existing + * tuple values, update the index tuple and return true. Otherwise, return + * false and do not modify in this case. + */ +Datum +brin_inclusion_add_value(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); + Datum newval = PG_GETARG_DATUM(2); + bool isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_BOOL(3); + Oid colloid = PG_GET_COLLATION(); + FmgrInfo *finfo; + Datum result; + bool new = false; + AttrNumber attno; + Form_pg_attribute attr; + + Assert(!isnull); + + attno = column->bv_attno; + attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1); + + /* + * If the recorded value is null, copy the new value (which we know to be + * not null), and we're almost done. 
+ */ + if (column->bv_allnulls) + { + column->bv_values[INCLUSION_UNION] = + datumCopy(newval, attr->attbyval, attr->attlen); + column->bv_values[INCLUSION_UNMERGEABLE] = BoolGetDatum(false); + column->bv_values[INCLUSION_CONTAINS_EMPTY] = BoolGetDatum(false); + column->bv_allnulls = false; + new = true; + } + + /* + * No need for further processing if the block range is marked as + * containing unmergeable values. + */ + if (DatumGetBool(column->bv_values[INCLUSION_UNMERGEABLE])) + PG_RETURN_BOOL(false); + + /* + * If the opclass supports the concept of empty values, test the passed + * new value for emptiness; if it returns true, we need to set the + * "contains empty" flag in the element (unless already set). + */ + finfo = inclusion_get_procinfo(bdesc, attno, PROCNUM_EMPTY); + if (finfo != NULL && DatumGetBool(FunctionCall1Coll(finfo, colloid, newval))) + { + if (!DatumGetBool(column->bv_values[INCLUSION_CONTAINS_EMPTY])) + { + column->bv_values[INCLUSION_CONTAINS_EMPTY] = BoolGetDatum(true); + PG_RETURN_BOOL(true); + } + + PG_RETURN_BOOL(false); + } + + if (new) + PG_RETURN_BOOL(true); + + /* Check if the new value is already contained. */ + finfo = inclusion_get_procinfo(bdesc, attno, PROCNUM_CONTAINS); + if (finfo != NULL && + DatumGetBool(FunctionCall2Coll(finfo, colloid, + column->bv_values[INCLUSION_UNION], + newval))) + PG_RETURN_BOOL(false); + + /* + * Check if the new value is mergeable to the existing union. If it is + * not, mark the value as containing unmergeable elements and get out. + * + * Note: at this point we could remove the value from the union, since + * it's not going to be used any longer. However, the BRIN framework + * doesn't allow for the value not being present. Improve someday. 
+ */ + finfo = inclusion_get_procinfo(bdesc, attno, PROCNUM_MERGEABLE); + if (finfo != NULL && + !DatumGetBool(FunctionCall2Coll(finfo, colloid, + column->bv_values[INCLUSION_UNION], + newval))) + { + column->bv_values[INCLUSION_UNMERGEABLE] = BoolGetDatum(true); + PG_RETURN_BOOL(true); + } + + /* Finally, merge the new value to the existing union. */ + finfo = inclusion_get_procinfo(bdesc, attno, PROCNUM_MERGE); + Assert(finfo != NULL); + result = FunctionCall2Coll(finfo, colloid, + column->bv_values[INCLUSION_UNION], newval); + if (!attr->attbyval && + DatumGetPointer(result) != DatumGetPointer(column->bv_values[INCLUSION_UNION])) + { + pfree(DatumGetPointer(column->bv_values[INCLUSION_UNION])); + + if (result == newval) + result = datumCopy(result, attr->attbyval, attr->attlen); + } + column->bv_values[INCLUSION_UNION] = result; + + PG_RETURN_BOOL(true); +} + +/* + * BRIN inclusion consistent function + * + * We're no longer dealing with NULL keys in the consistent function, that is + * now handled by the AM code. That means we should not get any all-NULL ranges + * either, because those can't be consistent with regular (not [IS] NULL) keys. + * + * All of the strategies are optional. + */ +Datum +brin_inclusion_consistent(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); + ScanKey key = (ScanKey) PG_GETARG_POINTER(2); + Oid colloid = PG_GET_COLLATION(), + subtype; + Datum unionval; + AttrNumber attno; + Datum query; + FmgrInfo *finfo; + Datum result; + + /* This opclass uses the old signature with only three arguments. */ + Assert(PG_NARGS() == 3); + + /* Should not be dealing with all-NULL ranges. */ + Assert(!column->bv_allnulls); + + /* It has to be checked, if it contains elements that are not mergeable. 
*/ + if (DatumGetBool(column->bv_values[INCLUSION_UNMERGEABLE])) + PG_RETURN_BOOL(true); + + attno = key->sk_attno; + subtype = key->sk_subtype; + query = key->sk_argument; + unionval = column->bv_values[INCLUSION_UNION]; + switch (key->sk_strategy) + { + /* + * Placement strategies + * + * These are implemented by logically negating the result of the + * converse placement operator; for this to work, the converse + * operator must be part of the opclass. An error will be thrown + * by inclusion_get_strategy_procinfo() if the required strategy + * is not part of the opclass. + * + * These all return false if either argument is empty, so there is + * no need to check for empty elements. + */ + + case RTLeftStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTOverRightStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + case RTOverLeftStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTRightStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + case RTOverRightStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTLeftStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + case RTRightStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTOverLeftStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + case RTBelowStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTOverAboveStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + case RTOverBelowStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTAboveStrategyNumber); + result = 
FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + case RTOverAboveStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTBelowStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + case RTAboveStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTOverBelowStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + /* + * Overlap and contains strategies + * + * These strategies are simple enough that we can simply call the + * operator and return its result. Empty elements don't change + * the result. + */ + + case RTOverlapStrategyNumber: + case RTContainsStrategyNumber: + case RTContainsElemStrategyNumber: + case RTSubStrategyNumber: + case RTSubEqualStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + key->sk_strategy); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_DATUM(result); + + /* + * Contained by strategies + * + * We cannot just call the original operator for the contained by + * strategies because some elements can be contained even though + * the union is not; instead we use the overlap operator. + * + * We check for empty elements separately as they are not merged + * to the union but contained by everything. + */ + + case RTContainedByStrategyNumber: + case RTSuperStrategyNumber: + case RTSuperEqualStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTOverlapStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + if (DatumGetBool(result)) + PG_RETURN_BOOL(true); + + PG_RETURN_DATUM(column->bv_values[INCLUSION_CONTAINS_EMPTY]); + + /* + * Adjacent strategy + * + * We test for overlap first but to be safe we need to call the + * actual adjacent operator also. 
+ * + * An empty element cannot be adjacent to any other, so there is + * no need to check for it. + */ + + case RTAdjacentStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTOverlapStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + if (DatumGetBool(result)) + PG_RETURN_BOOL(true); + + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTAdjacentStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_DATUM(result); + + /* + * Basic comparison strategies + * + * It is straightforward to support the equality strategies with + * the contains operator. Generally, inequality strategies do not + * make much sense for the types which will be used with the + * inclusion BRIN family of opclasses, but it is possible to + * implement them with logical negation of the left-of and + * right-of operators. + * + * NB: These strategies cannot be used with geometric datatypes + * that use comparison of areas! The only exception is the "same" + * strategy. + * + * Empty elements are considered to be less than the others. We + * cannot use the empty support function to check the query is an + * empty element, because the query can be another data type than + * the empty support function argument. So we will return true, + * if there is a possibility that empty elements will change the + * result. 
+ */ + + case RTLessStrategyNumber: + case RTLessEqualStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTRightStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + if (!DatumGetBool(result)) + PG_RETURN_BOOL(true); + + PG_RETURN_DATUM(column->bv_values[INCLUSION_CONTAINS_EMPTY]); + + case RTSameStrategyNumber: + case RTEqualStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTContainsStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + if (DatumGetBool(result)) + PG_RETURN_BOOL(true); + + PG_RETURN_DATUM(column->bv_values[INCLUSION_CONTAINS_EMPTY]); + + case RTGreaterEqualStrategyNumber: + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTLeftStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + if (!DatumGetBool(result)) + PG_RETURN_BOOL(true); + + PG_RETURN_DATUM(column->bv_values[INCLUSION_CONTAINS_EMPTY]); + + case RTGreaterStrategyNumber: + /* no need to check for empty elements */ + finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype, + RTLeftStrategyNumber); + result = FunctionCall2Coll(finfo, colloid, unionval, query); + PG_RETURN_BOOL(!DatumGetBool(result)); + + default: + /* shouldn't happen */ + elog(ERROR, "invalid strategy number %d", key->sk_strategy); + PG_RETURN_BOOL(false); + } +} + +/* + * BRIN inclusion union function + * + * Given two BrinValues, update the first of them as a union of the summary + * values contained in both. The second one is untouched. 
+ */ +Datum +brin_inclusion_union(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1); + BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2); + Oid colloid = PG_GET_COLLATION(); + AttrNumber attno; + Form_pg_attribute attr; + FmgrInfo *finfo; + Datum result; + + Assert(col_a->bv_attno == col_b->bv_attno); + Assert(!col_a->bv_allnulls && !col_b->bv_allnulls); + + attno = col_a->bv_attno; + attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1); + + /* If B includes empty elements, mark A similarly, if needed. */ + if (!DatumGetBool(col_a->bv_values[INCLUSION_CONTAINS_EMPTY]) && + DatumGetBool(col_b->bv_values[INCLUSION_CONTAINS_EMPTY])) + col_a->bv_values[INCLUSION_CONTAINS_EMPTY] = BoolGetDatum(true); + + /* Check if A includes elements that are not mergeable. */ + if (DatumGetBool(col_a->bv_values[INCLUSION_UNMERGEABLE])) + PG_RETURN_VOID(); + + /* If B includes elements that are not mergeable, mark A similarly. */ + if (DatumGetBool(col_b->bv_values[INCLUSION_UNMERGEABLE])) + { + col_a->bv_values[INCLUSION_UNMERGEABLE] = BoolGetDatum(true); + PG_RETURN_VOID(); + } + + /* Check if A and B are mergeable; if not, mark A unmergeable. */ + finfo = inclusion_get_procinfo(bdesc, attno, PROCNUM_MERGEABLE); + if (finfo != NULL && + !DatumGetBool(FunctionCall2Coll(finfo, colloid, + col_a->bv_values[INCLUSION_UNION], + col_b->bv_values[INCLUSION_UNION]))) + { + col_a->bv_values[INCLUSION_UNMERGEABLE] = BoolGetDatum(true); + PG_RETURN_VOID(); + } + + /* Finally, merge B to A. 
*/ + finfo = inclusion_get_procinfo(bdesc, attno, PROCNUM_MERGE); + Assert(finfo != NULL); + result = FunctionCall2Coll(finfo, colloid, + col_a->bv_values[INCLUSION_UNION], + col_b->bv_values[INCLUSION_UNION]); + if (!attr->attbyval && + DatumGetPointer(result) != DatumGetPointer(col_a->bv_values[INCLUSION_UNION])) + { + pfree(DatumGetPointer(col_a->bv_values[INCLUSION_UNION])); + + if (result == col_b->bv_values[INCLUSION_UNION]) + result = datumCopy(result, attr->attbyval, attr->attlen); + } + col_a->bv_values[INCLUSION_UNION] = result; + + PG_RETURN_VOID(); +} + +/* + * Cache and return inclusion opclass support procedure + * + * Return the procedure corresponding to the given support function number, + * or NULL if the opclass does not provide it. Both outcomes are remembered + * in the column's InclusionOpaque, in slots indexed by procnum - PROCNUM_BASE, + * so that repeated calls can reuse the earlier result. + */ +static FmgrInfo * +inclusion_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum) +{ + InclusionOpaque *opaque; + uint16 basenum = procnum - PROCNUM_BASE; + + /* + * We cache these in the opaque struct, to avoid repetitive syscache + * lookups. + */ + opaque = (InclusionOpaque *) bdesc->bd_info[attno - 1]->oi_opaque; + + /* + * If we already searched for this proc and didn't find it, don't bother + * searching again. + */ + if (opaque->extra_proc_missing[basenum]) + return NULL; + + if (opaque->extra_procinfos[basenum].fn_oid == InvalidOid) + { + if (RegProcedureIsValid(index_getprocid(bdesc->bd_index, attno, + procnum))) + { + fmgr_info_copy(&opaque->extra_procinfos[basenum], + index_getprocinfo(bdesc->bd_index, attno, procnum), + bdesc->bd_context); + } + else + { + opaque->extra_proc_missing[basenum] = true; + return NULL; + } + } + + return &opaque->extra_procinfos[basenum]; +} + +/* + * Cache and return the procedure of the given strategy + * + * Return the procedure corresponding to the given sub-type and strategy + * number. The data type of the index will be used as the left hand side of + * the operator and the given sub-type will be used as the right hand side. 
+ * Throws an error if the pg_amop row does not exist, but that should not + * happen with a properly configured opclass. + * + * It always throws an error when the data type of the opclass is different + * from the data type of the column or the expression. That happens when the + * column data type has implicit cast to the opclass data type. We don't + * bother casting types, because this situation can easily be avoided by + * setting storage data type to that of the opclass. The same problem does not + * apply to the data type of the right hand side, because the type in the + * ScanKey always matches the opclass' one. + * + * Note: this function mirrors minmax_get_strategy_procinfo; if changes are + * made here, see that function too. + */ +static FmgrInfo * +inclusion_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno, Oid subtype, + uint16 strategynum) +{ + InclusionOpaque *opaque; + + Assert(strategynum >= 1 && + strategynum <= RTMaxStrategyNumber); + + opaque = (InclusionOpaque *) bdesc->bd_info[attno - 1]->oi_opaque; + + /* + * We cache the procedures for the last sub-type in the opaque struct, to + * avoid repetitive syscache lookups. If the sub-type is changed, + * invalidate all the cached entries. 
+ */ + if (opaque->cached_subtype != subtype) + { + uint16 i; + + for (i = 1; i <= RTMaxStrategyNumber; i++) + opaque->strategy_procinfos[i - 1].fn_oid = InvalidOid; + opaque->cached_subtype = subtype; + } + + if (opaque->strategy_procinfos[strategynum - 1].fn_oid == InvalidOid) + { + Form_pg_attribute attr; + HeapTuple tuple; + Oid opfamily, + oprid; + bool isNull; + + opfamily = bdesc->bd_index->rd_opfamily[attno - 1]; + attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1); + tuple = SearchSysCache4(AMOPSTRATEGY, ObjectIdGetDatum(opfamily), + ObjectIdGetDatum(attr->atttypid), + ObjectIdGetDatum(subtype), + Int16GetDatum(strategynum)); + + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "missing operator %d(%u,%u) in opfamily %u", + strategynum, attr->atttypid, subtype, opfamily); + + oprid = DatumGetObjectId(SysCacheGetAttr(AMOPSTRATEGY, tuple, + Anum_pg_amop_amopopr, &isNull)); + ReleaseSysCache(tuple); + Assert(!isNull && RegProcedureIsValid(oprid)); + + fmgr_info_cxt(get_opcode(oprid), + &opaque->strategy_procinfos[strategynum - 1], + bdesc->bd_context); + } + + return &opaque->strategy_procinfos[strategynum - 1]; +} diff --git a/src/backend/access/brin/brin_minmax.c b/src/backend/access/brin/brin_minmax.c new file mode 100644 index 0000000..798f06c --- /dev/null +++ b/src/backend/access/brin/brin_minmax.c @@ -0,0 +1,317 @@ +/* + * brin_minmax.c + * Implementation of Min/Max opclass for BRIN + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin_minmax.c + */ +#include "postgres.h" + +#include "access/brin_internal.h" +#include "access/brin_tuple.h" +#include "access/genam.h" +#include "access/stratnum.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" +#include "utils/syscache.h" + +typedef struct 
MinmaxOpaque +{ + Oid cached_subtype; + FmgrInfo strategy_procinfos[BTMaxStrategyNumber]; +} MinmaxOpaque; + +static FmgrInfo *minmax_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno, + Oid subtype, uint16 strategynum); + + +/* + * Build the BrinOpcInfo for the minmax opclass: two stored values (minimum + * and maximum) of the indexed type, with the MinmaxOpaque cache placed in + * the same allocation right after the MAXALIGN'd BrinOpcInfo. + */ +Datum +brin_minmax_opcinfo(PG_FUNCTION_ARGS) +{ + Oid typoid = PG_GETARG_OID(0); + BrinOpcInfo *result; + + /* + * opaque->strategy_procinfos is initialized lazily; here it is set to + * all-uninitialized by palloc0 which sets fn_oid to InvalidOid. + */ + + result = palloc0(MAXALIGN(SizeofBrinOpcInfo(2)) + + sizeof(MinmaxOpaque)); + result->oi_nstored = 2; + result->oi_regular_nulls = true; + result->oi_opaque = (MinmaxOpaque *) + MAXALIGN((char *) result + SizeofBrinOpcInfo(2)); + result->oi_typcache[0] = result->oi_typcache[1] = + lookup_type_cache(typoid, 0); + + PG_RETURN_POINTER(result); +} + +/* + * Examine the given index tuple (which contains partial status of a certain + * page range) by comparing it to the given value that comes from another heap + * tuple. If the new value is outside the min/max range specified by the + * existing tuple values, update the index tuple and return true. Otherwise, + * return false and leave the index tuple unmodified. + * + * The fourth argument (isnull) is a boolean and is only inspected by an + * assertion: the new value is expected to be non-NULL. + */ +Datum +brin_minmax_add_value(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); + Datum newval = PG_GETARG_DATUM(2); + bool isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_BOOL(3); + Oid colloid = PG_GET_COLLATION(); + FmgrInfo *cmpFn; + Datum compar; + bool updated = false; + Form_pg_attribute attr; + AttrNumber attno; + + Assert(!isnull); + + attno = column->bv_attno; + attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1); + + /* + * If the recorded value is null, store the new value (which we know to be + * not null) as both minimum and maximum, and we're done. 
+ */ + if (column->bv_allnulls) + { + column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen); + column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen); + column->bv_allnulls = false; + PG_RETURN_BOOL(true); + } + + /* + * Otherwise, need to compare the new value with the existing boundaries + * and update them accordingly. First check if it's less than the + * existing minimum. + */ + cmpFn = minmax_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTLessStrategyNumber); + compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[0]); + if (DatumGetBool(compar)) + { + if (!attr->attbyval) + pfree(DatumGetPointer(column->bv_values[0])); + column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen); + updated = true; + } + + /* + * And now compare it to the existing maximum. + */ + cmpFn = minmax_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTGreaterStrategyNumber); + compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[1]); + if (DatumGetBool(compar)) + { + if (!attr->attbyval) + pfree(DatumGetPointer(column->bv_values[1])); + column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen); + updated = true; + } + + PG_RETURN_BOOL(updated); +} + +/* + * Given an index tuple corresponding to a certain page range and a scan key, + * return whether the scan key is consistent with the index tuple's min/max + * values. Return true if so, false otherwise. + * + * We're no longer dealing with NULL keys in the consistent function, that is + * now handled by the AM code. That means we should not get any all-NULL ranges + * either, because those can't be consistent with regular (not [IS] NULL) keys. 
+ */ +Datum +brin_minmax_consistent(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); + ScanKey key = (ScanKey) PG_GETARG_POINTER(2); + Oid colloid = PG_GET_COLLATION(), + subtype; + AttrNumber attno; + Datum value; + Datum matches; + FmgrInfo *finfo; + + /* This opclass uses the old signature with only three arguments. */ + Assert(PG_NARGS() == 3); + + /* Should not be dealing with all-NULL ranges. */ + Assert(!column->bv_allnulls); + + attno = key->sk_attno; + subtype = key->sk_subtype; + value = key->sk_argument; + switch (key->sk_strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + finfo = minmax_get_strategy_procinfo(bdesc, attno, subtype, + key->sk_strategy); + matches = FunctionCall2Coll(finfo, colloid, column->bv_values[0], + value); + break; + case BTEqualStrategyNumber: + + /* + * In the equality case (WHERE col = someval), we want to return + * the current page range if the minimum value in the range <= + * scan key, and the maximum value >= scan key. 
+ */ + finfo = minmax_get_strategy_procinfo(bdesc, attno, subtype, + BTLessEqualStrategyNumber); + matches = FunctionCall2Coll(finfo, colloid, column->bv_values[0], + value); + if (!DatumGetBool(matches)) + break; + /* max() >= scankey */ + finfo = minmax_get_strategy_procinfo(bdesc, attno, subtype, + BTGreaterEqualStrategyNumber); + matches = FunctionCall2Coll(finfo, colloid, column->bv_values[1], + value); + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + finfo = minmax_get_strategy_procinfo(bdesc, attno, subtype, + key->sk_strategy); + matches = FunctionCall2Coll(finfo, colloid, column->bv_values[1], + value); + break; + default: + /* shouldn't happen */ + elog(ERROR, "invalid strategy number %d", key->sk_strategy); + matches = 0; + break; + } + + PG_RETURN_DATUM(matches); +} + +/* + * Given two BrinValues, update the first of them as a union of the summary + * values contained in both. The second one is untouched. + */ +Datum +brin_minmax_union(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1); + BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2); + Oid colloid = PG_GET_COLLATION(); + AttrNumber attno; + Form_pg_attribute attr; + FmgrInfo *finfo; + bool needsadj; + + Assert(col_a->bv_attno == col_b->bv_attno); + Assert(!col_a->bv_allnulls && !col_b->bv_allnulls); + + attno = col_a->bv_attno; + attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1); + + /* Adjust minimum, if B's min is less than A's min */ + finfo = minmax_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTLessStrategyNumber); + needsadj = FunctionCall2Coll(finfo, colloid, col_b->bv_values[0], + col_a->bv_values[0]); + if (needsadj) + { + if (!attr->attbyval) + pfree(DatumGetPointer(col_a->bv_values[0])); + col_a->bv_values[0] = datumCopy(col_b->bv_values[0], + attr->attbyval, attr->attlen); + } + + /* Adjust maximum, if B's max is greater than A's max */ + finfo = 
minmax_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTGreaterStrategyNumber); + needsadj = FunctionCall2Coll(finfo, colloid, col_b->bv_values[1], + col_a->bv_values[1]); + if (needsadj) + { + if (!attr->attbyval) + pfree(DatumGetPointer(col_a->bv_values[1])); + col_a->bv_values[1] = datumCopy(col_b->bv_values[1], + attr->attbyval, attr->attlen); + } + + PG_RETURN_VOID(); +} + +/* + * Cache and return the procedure for the given strategy. + * + * Note: this function mirrors inclusion_get_strategy_procinfo; see notes + * there. If changes are made here, see that function too. + */ +static FmgrInfo * +minmax_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno, Oid subtype, + uint16 strategynum) +{ + MinmaxOpaque *opaque; + + Assert(strategynum >= 1 && + strategynum <= BTMaxStrategyNumber); + + opaque = (MinmaxOpaque *) bdesc->bd_info[attno - 1]->oi_opaque; + + /* + * We cache the procedures for the previous subtype in the opaque struct, + * to avoid repetitive syscache lookups. If the subtype changed, + * invalidate all the cached entries. 
+ */ + if (opaque->cached_subtype != subtype) + { + uint16 i; + + for (i = 1; i <= BTMaxStrategyNumber; i++) + opaque->strategy_procinfos[i - 1].fn_oid = InvalidOid; + opaque->cached_subtype = subtype; + } + + if (opaque->strategy_procinfos[strategynum - 1].fn_oid == InvalidOid) + { + Form_pg_attribute attr; + HeapTuple tuple; + Oid opfamily, + oprid; + bool isNull; + + opfamily = bdesc->bd_index->rd_opfamily[attno - 1]; + attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1); + tuple = SearchSysCache4(AMOPSTRATEGY, ObjectIdGetDatum(opfamily), + ObjectIdGetDatum(attr->atttypid), + ObjectIdGetDatum(subtype), + Int16GetDatum(strategynum)); + + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "missing operator %d(%u,%u) in opfamily %u", + strategynum, attr->atttypid, subtype, opfamily); + + oprid = DatumGetObjectId(SysCacheGetAttr(AMOPSTRATEGY, tuple, + Anum_pg_amop_amopopr, &isNull)); + ReleaseSysCache(tuple); + Assert(!isNull && RegProcedureIsValid(oprid)); + + fmgr_info_cxt(get_opcode(oprid), + &opaque->strategy_procinfos[strategynum - 1], + bdesc->bd_context); + } + + return &opaque->strategy_procinfos[strategynum - 1]; +} diff --git a/src/backend/access/brin/brin_minmax_multi.c b/src/backend/access/brin/brin_minmax_multi.c new file mode 100644 index 0000000..5200916 --- /dev/null +++ b/src/backend/access/brin/brin_minmax_multi.c @@ -0,0 +1,3163 @@ +/* + * brin_minmax_multi.c + * Implementation of Multi Min/Max opclass for BRIN + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * Implements a variant of minmax opclass, where the summary is composed of + * multiple smaller intervals. This allows us to handle outliers, which + * usually make the simple minmax opclass inefficient. + * + * Consider for example page range with simple minmax interval [1000,2000], + * and assume a new row gets inserted into the range with value 1000000. 
+ * As a result, the interval becomes [1000,1000000]. I.e. the minmax interval + * got 1000x wider and won't be useful to eliminate scan keys between 2001 + * and 1000000. + * + * With minmax-multi opclass, we may have [1000,2000] interval initially, + * but after adding the new row we start tracking it as two intervals: + * + * [1000,2000] and [1000000,1000000] + * + * This allows us to still eliminate the page range when the scan keys hit + * the gap between 2000 and 1000000, making it useful in cases when the + * simple minmax opclass gets inefficient. + * + * The number of intervals tracked per page range is somewhat flexible. + * What is restricted is the number of values per page range, and the limit + * is currently 32 (see values_per_range reloption). Collapsed intervals + * (with equal minimum and maximum value) are stored as a single value, + * while regular intervals require two values. + * + * When the number of values gets too high (by adding new values to the + * summary), we merge some of the intervals to free space for more values. + * This is done in a greedy way - we simply pick the two closest intervals, + * merge them, and repeat this until the number of values to store gets + * sufficiently low (below 50% of maximum values; that is a mostly + * arbitrary threshold and may be changed easily). + * + * To pick the closest intervals we use the "distance" support procedure, + * which measures space between two ranges (i.e. the length of an interval). + * The computed value may be an approximation - in the worst case we will + * merge two ranges that are slightly less optimal at that step, but the + * index should still produce correct results. + * + * Compaction (reducing the number of values) is fairly expensive, as + * it requires calling the distance functions, sorting etc. So when building + * the summary, we use a significantly larger buffer, and only enforce the + * exact limit at the very end. 
This improves performance, and it also helps + * with building better ranges (due to the greedy approach). + * + * + * IDENTIFICATION + * src/backend/access/brin/brin_minmax_multi.c + */ +#include "postgres.h" + +/* needed for PGSQL_AF_INET */ +#include <sys/socket.h> + +#include "access/genam.h" +#include "access/brin.h" +#include "access/brin_internal.h" +#include "access/brin_tuple.h" +#include "access/reloptions.h" +#include "access/stratnum.h" +#include "access/htup_details.h" +#include "catalog/pg_type.h" +#include "catalog/pg_am.h" +#include "catalog/pg_amop.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/date.h" +#include "utils/datum.h" +#include "utils/float.h" +#include "utils/inet.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/numeric.h" +#include "utils/pg_lsn.h" +#include "utils/rel.h" +#include "utils/syscache.h" +#include "utils/timestamp.h" +#include "utils/uuid.h" + +/* + * Additional SQL level support functions + * + * Procedure numbers must not use values reserved for BRIN itself; see + * brin_internal.h. + */ +#define MINMAX_MAX_PROCNUMS 1 /* maximum support procs we need */ +#define PROCNUM_DISTANCE 11 /* required, distance between values */ + +/* + * Subtract this from procnum to obtain index in MinmaxMultiOpaque arrays + * (Must be equal to minimum of private procnums). + */ +#define PROCNUM_BASE 11 + +/* + * Sizing the insert buffer - we use 10x the number of values specified + * in the reloption, but we cap it to 8192 not to get too large. When + * the buffer gets full, we reduce the number of values by half. 
+ */ +#define MINMAX_BUFFER_FACTOR 10 +#define MINMAX_BUFFER_MIN 256 +#define MINMAX_BUFFER_MAX 8192 +#define MINMAX_BUFFER_LOAD_FACTOR 0.5 + +typedef struct MinmaxMultiOpaque +{ + FmgrInfo extra_procinfos[MINMAX_MAX_PROCNUMS]; + bool extra_proc_missing[MINMAX_MAX_PROCNUMS]; + Oid cached_subtype; + FmgrInfo strategy_procinfos[BTMaxStrategyNumber]; +} MinmaxMultiOpaque; + +/* + * Storage type for BRIN's minmax reloptions + */ +typedef struct MinMaxMultiOptions +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + int valuesPerRange; /* number of values per range */ +} MinMaxMultiOptions; + +#define MINMAX_MULTI_DEFAULT_VALUES_PER_PAGE 32 + +#define MinMaxMultiGetValuesPerRange(opts) \ + ((opts) && (((MinMaxMultiOptions *) (opts))->valuesPerRange != 0) ? \ + ((MinMaxMultiOptions *) (opts))->valuesPerRange : \ + MINMAX_MULTI_DEFAULT_VALUES_PER_PAGE) + +#define SAMESIGN(a,b) (((a) < 0) == ((b) < 0)) + +/* + * The summary of minmax-multi indexes has two representations - Ranges for + * convenient processing, and SerializedRanges for storage in bytea value. + * + * The Ranges struct stores the boundary values in a single array, but we + * treat regular and single-point ranges differently to save space. For + * regular ranges (with different boundary values) we have to store both + * values, while for "single-point ranges" we only need to save one value. + * + * The 'values' array stores boundary values for regular ranges first (there + * are 2*nranges values to store), and then the nvalues boundary values for + * single-point ranges. That is, we have (2*nranges + nvalues) boundary + * values in the array. + * + * +---------------------------------+-------------------------------+ + * | ranges (sorted pairs of values) | sorted values (single points) | + * +---------------------------------+-------------------------------+ + * + * This allows us to quickly add new values, and store outliers without + * making the other ranges very wide. 
+ * + * We never store more than maxvalues values (as set by values_per_range + * reloption). If needed we merge some of the ranges. + * + * To minimize palloc overhead, we always allocate the full array with + * space for maxvalues elements. This should be fine as long as the + * maxvalues is reasonably small (64 seems fine), which is the case + * thanks to values_per_range reloption being limited to 256. + */ +typedef struct Ranges +{ + /* Cache information that we need quite often. */ + Oid typid; + Oid colloid; + AttrNumber attno; + FmgrInfo *cmp; + + /* (2*nranges + nvalues) <= maxvalues */ + int nranges; /* number of ranges in the array (stored) */ + int nsorted; /* number of sorted values (ranges + points) */ + int nvalues; /* number of values in the data array (all) */ + int maxvalues; /* maximum number of values (reloption) */ + + /* + * We simply add the values into a large buffer, without any expensive + * steps (sorting, deduplication, ...). The buffer is a multiple of the + * target number of values, so the compaction happens less often, + * amortizing the costs. We keep the actual target and compact to the + * requested number of values at the very end, before serializing to + * on-disk representation. + */ + /* requested number of values */ + int target_maxvalues; + + /* values stored for this range - either raw values, or ranges */ + Datum values[FLEXIBLE_ARRAY_MEMBER]; +} Ranges; + +/* + * On-disk the summary is stored as a bytea value, with a simple header + * with basic metadata, followed by the boundary values. It has a varlena + * header, so can be treated as varlena directly. + * + * See range_serialize/range_deserialize for serialization details. + */ +typedef struct SerializedRanges +{ + /* varlena header (do not touch directly!) 
*/ + int32 vl_len_; + + /* type of values stored in the data array */ + Oid typid; + + /* (2*nranges + nvalues) <= maxvalues */ + int nranges; /* number of ranges in the array (stored) */ + int nvalues; /* number of values in the data array (all) */ + int maxvalues; /* maximum number of values (reloption) */ + + /* contains the actual data */ + char data[FLEXIBLE_ARRAY_MEMBER]; +} SerializedRanges; + +static SerializedRanges *range_serialize(Ranges *range); + +static Ranges *range_deserialize(int maxvalues, SerializedRanges *range); + + +/* + * Used to represent ranges expanded to make merging and combining easier. + * + * Each expanded range is essentially an interval, represented by min/max + * values, along with a flag whether it's a collapsed range (in which case + * the min and max values are equal). We have the flag to handle by-ref + * data types - we can't simply compare the datums, and this saves some + * calls to the type-specific comparator function. + */ +typedef struct ExpandedRange +{ + Datum minval; /* lower boundary */ + Datum maxval; /* upper boundary */ + bool collapsed; /* true if minval==maxval */ +} ExpandedRange; + +/* + * Represents a distance between two ranges (identified by index into + * an array of extended ranges). + */ +typedef struct DistanceValue +{ + int index; + double value; +} DistanceValue; + + +/* Cache for support and strategy procedures. */ + +static FmgrInfo *minmax_multi_get_procinfo(BrinDesc *bdesc, uint16 attno, + uint16 procnum); + +static FmgrInfo *minmax_multi_get_strategy_procinfo(BrinDesc *bdesc, + uint16 attno, Oid subtype, + uint16 strategynum); + +typedef struct compare_context +{ + FmgrInfo *cmpFn; + Oid colloid; +} compare_context; + +static int compare_values(const void *a, const void *b, void *arg); + + +#ifdef USE_ASSERT_CHECKING +/* + * Check that the order of the array values is correct, using the cmp + * function (which should be BTLessStrategyNumber). 
+ */ +static void +AssertArrayOrder(FmgrInfo *cmp, Oid colloid, Datum *values, int nvalues) +{ + int i; + Datum lt; + + for (i = 0; i < (nvalues - 1); i++) + { + lt = FunctionCall2Coll(cmp, colloid, values[i], values[i + 1]); + Assert(DatumGetBool(lt)); + } +} +#endif + +/* + * Comprehensive check of the Ranges structure. + */ +static void +AssertCheckRanges(Ranges *ranges, FmgrInfo *cmpFn, Oid colloid) +{ +#ifdef USE_ASSERT_CHECKING + int i; + + /* some basic sanity checks */ + Assert(ranges->nranges >= 0); + Assert(ranges->nsorted >= 0); + Assert(ranges->nvalues >= ranges->nsorted); + Assert(ranges->maxvalues >= 2 * ranges->nranges + ranges->nvalues); + Assert(ranges->typid != InvalidOid); + + /* + * First the ranges - there are 2*nranges boundary values, and the values + * have to be strictly ordered (equal values would mean the range is + * collapsed, and should be stored as a point). This also guarantees that + * the ranges do not overlap. + */ + AssertArrayOrder(cmpFn, colloid, ranges->values, 2 * ranges->nranges); + + /* then the single-point ranges (with nvalues boundar values ) */ + AssertArrayOrder(cmpFn, colloid, &ranges->values[2 * ranges->nranges], + ranges->nsorted); + + /* + * Check that none of the values are not covered by ranges (both sorted + * and unsorted) + */ + for (i = 0; i < ranges->nvalues; i++) + { + Datum compar; + int start, + end; + Datum minvalue, + maxvalue; + + Datum value = ranges->values[2 * ranges->nranges + i]; + + if (ranges->nranges == 0) + break; + + minvalue = ranges->values[0]; + maxvalue = ranges->values[2 * ranges->nranges - 1]; + + /* + * Is the value smaller than the minval? If yes, we'll recurse to the + * left side of range array. + */ + compar = FunctionCall2Coll(cmpFn, colloid, value, minvalue); + + /* smaller than the smallest value in the first range */ + if (DatumGetBool(compar)) + continue; + + /* + * Is the value greater than the maxval? If yes, we'll recurse to the + * right side of range array. 
+ */ + compar = FunctionCall2Coll(cmpFn, colloid, maxvalue, value); + + /* larger than the largest value in the last range */ + if (DatumGetBool(compar)) + continue; + + start = 0; /* first range */ + end = ranges->nranges - 1; /* last range */ + while (true) + { + int midpoint = (start + end) / 2; + + /* this means we ran out of ranges in the last step */ + if (start > end) + break; + + /* copy the min/max values from the ranges */ + minvalue = ranges->values[2 * midpoint]; + maxvalue = ranges->values[2 * midpoint + 1]; + + /* + * Is the value smaller than the minval? If yes, we'll recurse to + * the left side of range array. + */ + compar = FunctionCall2Coll(cmpFn, colloid, value, minvalue); + + /* smaller than the smallest value in this range */ + if (DatumGetBool(compar)) + { + end = (midpoint - 1); + continue; + } + + /* + * Is the value greater than the minval? If yes, we'll recurse to + * the right side of range array. + */ + compar = FunctionCall2Coll(cmpFn, colloid, maxvalue, value); + + /* larger than the largest value in this range */ + if (DatumGetBool(compar)) + { + start = (midpoint + 1); + continue; + } + + /* hey, we found a matching range */ + Assert(false); + } + } + + /* and values in the unsorted part must not be in sorted part */ + for (i = ranges->nsorted; i < ranges->nvalues; i++) + { + compare_context cxt; + Datum value = ranges->values[2 * ranges->nranges + i]; + + if (ranges->nsorted == 0) + break; + + cxt.colloid = ranges->colloid; + cxt.cmpFn = ranges->cmp; + + Assert(bsearch_arg(&value, &ranges->values[2 * ranges->nranges], + ranges->nsorted, sizeof(Datum), + compare_values, (void *) &cxt) == NULL); + } +#endif +} + +/* + * Check that the expanded ranges (built when reducing the number of ranges + * by combining some of them) are correctly sorted and do not overlap. 
+ */ +static void +AssertCheckExpandedRanges(BrinDesc *bdesc, Oid colloid, AttrNumber attno, + Form_pg_attribute attr, ExpandedRange *ranges, + int nranges) +{ +#ifdef USE_ASSERT_CHECKING + int i; + FmgrInfo *eq; + FmgrInfo *lt; + + eq = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTEqualStrategyNumber); + + lt = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTLessStrategyNumber); + + /* + * Each range independently should be valid, i.e. that for the boundary + * values (lower <= upper). + */ + for (i = 0; i < nranges; i++) + { + Datum r; + Datum minval = ranges[i].minval; + Datum maxval = ranges[i].maxval; + + if (ranges[i].collapsed) /* collapsed: minval == maxval */ + r = FunctionCall2Coll(eq, colloid, minval, maxval); + else /* non-collapsed: minval < maxval */ + r = FunctionCall2Coll(lt, colloid, minval, maxval); + + Assert(DatumGetBool(r)); + } + + /* + * And the ranges should be ordered and must not overlap, i.e. upper < + * lower for boundaries of consecutive ranges. + */ + for (i = 0; i < nranges - 1; i++) + { + Datum r; + Datum maxval = ranges[i].maxval; + Datum minval = ranges[i + 1].minval; + + r = FunctionCall2Coll(lt, colloid, maxval, minval); + + Assert(DatumGetBool(r)); + } +#endif +} + + +/* + * minmax_multi_init + * Initialize the deserialized range list, allocate all the memory. + * + * This is only in-memory representation of the ranges, so we allocate + * enough space for the maximum number of values (so as not to have to do + * repallocs as the ranges grow). + */ +static Ranges * +minmax_multi_init(int maxvalues) +{ + Size len; + Ranges *ranges; + + Assert(maxvalues > 0); + + len = offsetof(Ranges, values); /* fixed header */ + len += maxvalues * sizeof(Datum); /* Datum values */ + + ranges = (Ranges *) palloc0(len); + + ranges->maxvalues = maxvalues; + + return ranges; +} + + +/* + * range_deduplicate_values + * Deduplicate the part with values in the simple points. 
+ * + * This is meant to be a cheaper way of reducing the size of the ranges. It + * does not touch the ranges, and only sorts the other values - it does not + * call the distance functions, which may be quite expensive, etc. + * + * We do know the values are not duplicate with the ranges, because we check + * that before adding a new value. Same for the sorted part of values. + */ +static void +range_deduplicate_values(Ranges *range) +{ + int i, + n; + int start; + compare_context cxt; + + /* + * If there are no unsorted values, we're done (this probably can't + * happen, as we're adding values to unsorted part). + */ + if (range->nsorted == range->nvalues) + return; + + /* sort the values */ + cxt.colloid = range->colloid; + cxt.cmpFn = range->cmp; + + /* the values start right after the ranges (which are always sorted) */ + start = 2 * range->nranges; + + /* + * XXX This might do a merge sort, to leverage that the first part of the + * array is already sorted. If the sorted part is large, it might be quite + * a bit faster. + */ + qsort_arg(&range->values[start], + range->nvalues, sizeof(Datum), + compare_values, (void *) &cxt); + + n = 1; + for (i = 1; i < range->nvalues; i++) + { + /* same as preceding value, so store it */ + if (compare_values(&range->values[start + i - 1], + &range->values[start + i], + (void *) &cxt) == 0) + continue; + + range->values[start + n] = range->values[start + i]; + + n++; + } + + /* now all the values are sorted */ + range->nvalues = n; + range->nsorted = n; + + AssertCheckRanges(range, range->cmp, range->colloid); +} + + +/* + * range_serialize + * Serialize the in-memory representation into a compact varlena value. + * + * Simply copy the header and then also the individual values, as stored + * in the in-memory value array. 
+ */ +static SerializedRanges * +range_serialize(Ranges *range) +{ + Size len; + int nvalues; + SerializedRanges *serialized; + Oid typid; + int typlen; + bool typbyval; + + int i; + char *ptr; + + /* simple sanity checks */ + Assert(range->nranges >= 0); + Assert(range->nsorted >= 0); + Assert(range->nvalues >= 0); + Assert(range->maxvalues > 0); + Assert(range->target_maxvalues > 0); + + /* at this point the range should be compacted to the target size */ + Assert(2 * range->nranges + range->nvalues <= range->target_maxvalues); + + Assert(range->target_maxvalues <= range->maxvalues); + + /* range boundaries are always sorted */ + Assert(range->nvalues >= range->nsorted); + + /* deduplicate values, if there's unsorted part */ + range_deduplicate_values(range); + + /* see how many Datum values we actually have */ + nvalues = 2 * range->nranges + range->nvalues; + + typid = range->typid; + typbyval = get_typbyval(typid); + typlen = get_typlen(typid); + + /* header is always needed */ + len = offsetof(SerializedRanges, data); + + /* + * The space needed depends on data type - for fixed-length data types + * (by-value and some by-reference) it's pretty simple, just multiply + * (attlen * nvalues) and we're done. For variable-length by-reference + * types we need to actually walk all the values and sum the lengths. + */ + if (typlen == -1) /* varlena */ + { + int i; + + for (i = 0; i < nvalues; i++) + { + len += VARSIZE_ANY(range->values[i]); + } + } + else if (typlen == -2) /* cstring */ + { + int i; + + for (i = 0; i < nvalues; i++) + { + /* don't forget to include the null terminator ;-) */ + len += strlen(DatumGetCString(range->values[i])) + 1; + } + } + else /* fixed-length types (even by-reference) */ + { + Assert(typlen > 0); + len += nvalues * typlen; + } + + /* + * Allocate the serialized object, copy the basic information. The + * serialized object is a varlena, so update the header. 
+ */ + serialized = (SerializedRanges *) palloc0(len); + SET_VARSIZE(serialized, len); + + serialized->typid = typid; + serialized->nranges = range->nranges; + serialized->nvalues = range->nvalues; + serialized->maxvalues = range->target_maxvalues; + + /* + * And now copy also the boundary values (like the length calculation this + * depends on the particular data type). + */ + ptr = serialized->data; /* start of the serialized data */ + + for (i = 0; i < nvalues; i++) + { + if (typbyval) /* simple by-value data types */ + { + Datum tmp; + + /* + * For byval types, we need to copy just the significant bytes - + * we can't use memcpy directly, as that assumes little-endian + * behavior. store_att_byval does almost what we need, but it + * requires a properly aligned buffer - the output buffer does not + * guarantee that. So we simply use a local Datum variable (which + * guarantees proper alignment), and then copy the value from it. + */ + store_att_byval(&tmp, range->values[i], typlen); + + memcpy(ptr, &tmp, typlen); + ptr += typlen; + } + else if (typlen > 0) /* fixed-length by-ref types */ + { + memcpy(ptr, DatumGetPointer(range->values[i]), typlen); + ptr += typlen; + } + else if (typlen == -1) /* varlena */ + { + int tmp = VARSIZE_ANY(DatumGetPointer(range->values[i])); + + memcpy(ptr, DatumGetPointer(range->values[i]), tmp); + ptr += tmp; + } + else if (typlen == -2) /* cstring */ + { + int tmp = strlen(DatumGetCString(range->values[i])) + 1; + + memcpy(ptr, DatumGetCString(range->values[i]), tmp); + ptr += tmp; + } + + /* make sure we haven't overflown the buffer end */ + Assert(ptr <= ((char *) serialized + len)); + } + + /* exact size */ + Assert(ptr == ((char *) serialized + len)); + + return serialized; +} + +/* + * range_deserialize + * Serialize the in-memory representation into a compact varlena value. + * + * Simply copy the header and then also the individual values, as stored + * in the in-memory value array. 
+ */ +static Ranges * +range_deserialize(int maxvalues, SerializedRanges *serialized) +{ + int i, + nvalues; + char *ptr, + *dataptr; + bool typbyval; + int typlen; + Size datalen; + + Ranges *range; + + Assert(serialized->nranges >= 0); + Assert(serialized->nvalues >= 0); + Assert(serialized->maxvalues > 0); + + nvalues = 2 * serialized->nranges + serialized->nvalues; + + Assert(nvalues <= serialized->maxvalues); + Assert(serialized->maxvalues <= maxvalues); + + range = minmax_multi_init(maxvalues); + + /* copy the header info */ + range->nranges = serialized->nranges; + range->nvalues = serialized->nvalues; + range->nsorted = serialized->nvalues; + range->maxvalues = maxvalues; + range->target_maxvalues = serialized->maxvalues; + + range->typid = serialized->typid; + + typbyval = get_typbyval(serialized->typid); + typlen = get_typlen(serialized->typid); + + /* + * And now deconstruct the values into Datum array. We have to copy the + * data because the serialized representation ignores alignment, and we + * don't want to rely on it being kept around anyway. + */ + ptr = serialized->data; + + /* + * We don't want to allocate many pieces, so we just allocate everything + * in one chunk. How much space will we need? + * + * XXX We don't need to copy simple by-value data types. + */ + datalen = 0; + dataptr = NULL; + for (i = 0; (i < nvalues) && (!typbyval); i++) + { + if (typlen > 0) /* fixed-length by-ref types */ + datalen += MAXALIGN(typlen); + else if (typlen == -1) /* varlena */ + { + datalen += MAXALIGN(VARSIZE_ANY(DatumGetPointer(ptr))); + ptr += VARSIZE_ANY(DatumGetPointer(ptr)); + } + else if (typlen == -2) /* cstring */ + { + Size slen = strlen(DatumGetCString(ptr)) + 1; + + datalen += MAXALIGN(slen); + ptr += slen; + } + } + + if (datalen > 0) + dataptr = palloc(datalen); + + /* + * Restore the source pointer (might have been modified when calculating + * the space we need to allocate). 
+ */ + ptr = serialized->data; + + for (i = 0; i < nvalues; i++) + { + if (typbyval) /* simple by-value data types */ + { + Datum v = 0; + + memcpy(&v, ptr, typlen); + + range->values[i] = fetch_att(&v, true, typlen); + ptr += typlen; + } + else if (typlen > 0) /* fixed-length by-ref types */ + { + range->values[i] = PointerGetDatum(dataptr); + + memcpy(dataptr, ptr, typlen); + dataptr += MAXALIGN(typlen); + + ptr += typlen; + } + else if (typlen == -1) /* varlena */ + { + range->values[i] = PointerGetDatum(dataptr); + + memcpy(dataptr, ptr, VARSIZE_ANY(ptr)); + dataptr += MAXALIGN(VARSIZE_ANY(ptr)); + ptr += VARSIZE_ANY(ptr); + } + else if (typlen == -2) /* cstring */ + { + Size slen = strlen(ptr) + 1; + + range->values[i] = PointerGetDatum(dataptr); + + memcpy(dataptr, ptr, slen); + dataptr += MAXALIGN(slen); + ptr += slen; + } + + /* make sure we haven't overflown the buffer end */ + Assert(ptr <= ((char *) serialized + VARSIZE_ANY(serialized))); + } + + /* should have consumed the whole input value exactly */ + Assert(ptr == ((char *) serialized + VARSIZE_ANY(serialized))); + + /* return the deserialized value */ + return range; +} + +/* + * compare_expanded_ranges + * Compare the expanded ranges - first by minimum, then by maximum. + * + * We do guarantee that ranges in a single Ranges object do not overlap, so it + * may seem strange that we don't order just by minimum. But when merging two + * Ranges (which happens in the union function), the ranges may in fact + * overlap. So we do compare both. 
+ */ +static int +compare_expanded_ranges(const void *a, const void *b, void *arg) +{ + ExpandedRange *ra = (ExpandedRange *) a; + ExpandedRange *rb = (ExpandedRange *) b; + Datum r; + + compare_context *cxt = (compare_context *) arg; + + /* first compare minvals */ + r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, ra->minval, rb->minval); + + if (DatumGetBool(r)) + return -1; + + r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, rb->minval, ra->minval); + + if (DatumGetBool(r)) + return 1; + + /* then compare maxvals */ + r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, ra->maxval, rb->maxval); + + if (DatumGetBool(r)) + return -1; + + r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, rb->maxval, ra->maxval); + + if (DatumGetBool(r)) + return 1; + + return 0; +} + +/* + * compare_values + * Compare the values. + */ +static int +compare_values(const void *a, const void *b, void *arg) +{ + Datum *da = (Datum *) a; + Datum *db = (Datum *) b; + Datum r; + + compare_context *cxt = (compare_context *) arg; + + r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, *da, *db); + + if (DatumGetBool(r)) + return -1; + + r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, *db, *da); + + if (DatumGetBool(r)) + return 1; + + return 0; +} + +/* + * Check if the new value matches one of the existing ranges. + */ +static bool +has_matching_range(BrinDesc *bdesc, Oid colloid, Ranges *ranges, + Datum newval, AttrNumber attno, Oid typid) +{ + Datum compar; + + Datum minvalue = ranges->values[0]; + Datum maxvalue = ranges->values[2 * ranges->nranges - 1]; + + FmgrInfo *cmpLessFn; + FmgrInfo *cmpGreaterFn; + + /* binary search on ranges */ + int start, + end; + + if (ranges->nranges == 0) + return false; + + /* + * Otherwise, need to compare the new value with boundaries of all the + * ranges. First check if it's less than the absolute minimum, which is + * the first value in the array. 
+ */ + cmpLessFn = minmax_multi_get_strategy_procinfo(bdesc, attno, typid, + BTLessStrategyNumber); + compar = FunctionCall2Coll(cmpLessFn, colloid, newval, minvalue); + + /* smaller than the smallest value in the range list */ + if (DatumGetBool(compar)) + return false; + + /* + * And now compare it to the existing maximum (last value in the data + * array). But only if we haven't already ruled out a possible match in + * the minvalue check. + */ + cmpGreaterFn = minmax_multi_get_strategy_procinfo(bdesc, attno, typid, + BTGreaterStrategyNumber); + compar = FunctionCall2Coll(cmpGreaterFn, colloid, newval, maxvalue); + + if (DatumGetBool(compar)) + return false; + + /* + * So we know it's in the general min/max, the question is whether it + * falls in one of the ranges or gaps. We'll do a binary search on + * individual ranges - for each range we check equality (value falls into + * the range), and then check ranges either above or below the current + * range. + */ + start = 0; /* first range */ + end = (ranges->nranges - 1); /* last range */ + while (true) + { + int midpoint = (start + end) / 2; + + /* this means we ran out of ranges in the last step */ + if (start > end) + return false; + + /* copy the min/max values from the ranges */ + minvalue = ranges->values[2 * midpoint]; + maxvalue = ranges->values[2 * midpoint + 1]; + + /* + * Is the value smaller than the minval? If yes, we'll recurse to the + * left side of range array. + */ + compar = FunctionCall2Coll(cmpLessFn, colloid, newval, minvalue); + + /* smaller than the smallest value in this range */ + if (DatumGetBool(compar)) + { + end = (midpoint - 1); + continue; + } + + /* + * Is the value greater than the minval? If yes, we'll recurse to the + * right side of range array. 
+ */ + compar = FunctionCall2Coll(cmpGreaterFn, colloid, newval, maxvalue); + + /* larger than the largest value in this range */ + if (DatumGetBool(compar)) + { + start = (midpoint + 1); + continue; + } + + /* hey, we found a matching range */ + return true; + } + + return false; +} + + +/* + * range_contains_value + * See if the new value is already contained in the range list. + * + * We first inspect the list of intervals. We use a small trick - we check + * the value against min/max of the whole range (min of the first interval, + * max of the last one) first, and only inspect the individual intervals if + * this passes. + * + * If the value matches none of the intervals, we check the exact values. + * We simply loop through them and invoke equality operator on them. + * + * The last parameter (full) determines whether we need to search all the + * values, including the unsorted part. With full=false, the unsorted part + * is not searched, which may produce false negatives and duplicate values + * (in the unsorted part only), but when we're building the range that's + * fine - we'll deduplicate before serialization, and it can only happen + * if there already are unsorted values (so it was already modified). + * + * Serialized ranges don't have any unsorted values, so this can't cause + * false negatives during querying. + */ +static bool +range_contains_value(BrinDesc *bdesc, Oid colloid, + AttrNumber attno, Form_pg_attribute attr, + Ranges *ranges, Datum newval, bool full) +{ + int i; + FmgrInfo *cmpEqualFn; + Oid typid = attr->atttypid; + + /* + * First inspect the ranges, if there are any. We first check the whole + * range, and only when there's still a chance of getting a match we + * inspect the individual ranges. 
+ */ + if (has_matching_range(bdesc, colloid, ranges, newval, attno, typid)) + return true; + + cmpEqualFn = minmax_multi_get_strategy_procinfo(bdesc, attno, typid, + BTEqualStrategyNumber); + + /* + * There is no matching range, so let's inspect the sorted values. + * + * We do a sequential search for small numbers of values, and binary + * search once we have more than 16 values. This threshold is somewhat + * arbitrary, as it depends on how expensive the comparison function is. + * + * XXX If we use the threshold here, maybe we should do the same thing in + * has_matching_range? Or maybe we should do the bin search all the time? + * + * XXX We could use the same optimization as for ranges, to check if the + * value is between min/max, to maybe rule out all sorted values without + * having to inspect all of them. + */ + if (ranges->nsorted >= 16) + { + compare_context cxt; + + cxt.colloid = ranges->colloid; + cxt.cmpFn = ranges->cmp; + + if (bsearch_arg(&newval, &ranges->values[2 * ranges->nranges], + ranges->nsorted, sizeof(Datum), + compare_values, (void *) &cxt) != NULL) + return true; + } + else + { + for (i = 2 * ranges->nranges; i < 2 * ranges->nranges + ranges->nsorted; i++) + { + Datum compar; + + compar = FunctionCall2Coll(cmpEqualFn, colloid, newval, ranges->values[i]); + + /* found an exact match */ + if (DatumGetBool(compar)) + return true; + } + } + + /* If not asked to inspect the unsorted part, we're done. */ + if (!full) + return false; + + /* Inspect the unsorted part. */ + for (i = 2 * ranges->nranges + ranges->nsorted; i < 2 * ranges->nranges + ranges->nvalues; i++) + { + Datum compar; + + compar = FunctionCall2Coll(cmpEqualFn, colloid, newval, ranges->values[i]); + + /* found an exact match */ + if (DatumGetBool(compar)) + return true; + } + + /* the value is not covered by this BRIN tuple */ + return false; +} + +/* + * Expand ranges from Ranges into ExpandedRange array. 
This expects the + * eranges to be pre-allocated and with the correct size - there needs to be + * (nranges + nvalues) elements. + * + * The order of expanded ranges is arbitrary. We do expand the ranges first, + * and this part is sorted. But then we expand the values, and this part may + * be unsorted. + */ +static void +fill_expanded_ranges(ExpandedRange *eranges, int neranges, Ranges *ranges) +{ + int idx; + int i; + + /* Check that the output array has the right size. */ + Assert(neranges == (ranges->nranges + ranges->nvalues)); + + idx = 0; + for (i = 0; i < ranges->nranges; i++) + { + eranges[idx].minval = ranges->values[2 * i]; + eranges[idx].maxval = ranges->values[2 * i + 1]; + eranges[idx].collapsed = false; + idx++; + + Assert(idx <= neranges); + } + + for (i = 0; i < ranges->nvalues; i++) + { + eranges[idx].minval = ranges->values[2 * ranges->nranges + i]; + eranges[idx].maxval = ranges->values[2 * ranges->nranges + i]; + eranges[idx].collapsed = true; + idx++; + + Assert(idx <= neranges); + } + + /* Did we produce the expected number of elements? */ + Assert(idx == neranges); + + return; +} + +/* + * Sort and deduplicate expanded ranges. + * + * The ranges may be deduplicated - we're simply appending values, without + * checking for duplicates etc. So maybe the deduplication will reduce the + * number of ranges enough, and we won't have to compute the distances etc. + * + * Returns the number of expanded ranges. + */ +static int +sort_expanded_ranges(FmgrInfo *cmp, Oid colloid, + ExpandedRange *eranges, int neranges) +{ + int n; + int i; + compare_context cxt; + + Assert(neranges > 0); + + /* sort the values */ + cxt.colloid = colloid; + cxt.cmpFn = cmp; + + /* + * XXX We do qsort on all the values, but we could also leverage the fact + * that some of the input data is already sorted (all the ranges and maybe + * some of the points) and do merge sort. 
+ */ + qsort_arg(eranges, neranges, sizeof(ExpandedRange), + compare_expanded_ranges, (void *) &cxt); + + /* + * Deduplicate the ranges - simply compare each range to the preceding + * one, and skip the duplicate ones. + */ + n = 1; + for (i = 1; i < neranges; i++) + { + /* if the current range is equal to the preceding one, do nothing */ + if (!compare_expanded_ranges(&eranges[i - 1], &eranges[i], (void *) &cxt)) + continue; + + /* otherwise, copy it to n-th place (if not already there) */ + if (i != n) + memcpy(&eranges[n], &eranges[i], sizeof(ExpandedRange)); + + n++; + } + + Assert((n > 0) && (n <= neranges)); + + return n; +} + +/* + * When combining multiple Range values (in union function), some of the + * ranges may overlap. We simply merge the overlapping ranges to fix that. + * + * XXX This assumes the expanded ranges were previously sorted (by minval + * and then maxval). We leverage this when detecting overlap. + */ +static int +merge_overlapping_ranges(FmgrInfo *cmp, Oid colloid, + ExpandedRange *eranges, int neranges) +{ + int idx; + + /* Merge ranges (idx) and (idx+1) if they overlap. */ + idx = 0; + while (idx < (neranges - 1)) + { + Datum r; + + /* + * comparing [?,maxval] vs. [minval,?] - the ranges overlap if (minval + * < maxval) + */ + r = FunctionCall2Coll(cmp, colloid, + eranges[idx].maxval, + eranges[idx + 1].minval); + + /* + * Nope, maxval < minval, so no overlap. And we know the ranges are + * ordered, so there are no more overlaps, because all the remaining + * ranges have greater or equal minval. + */ + if (DatumGetBool(r)) + { + /* proceed to the next range */ + idx += 1; + continue; + } + + /* + * So ranges 'idx' and 'idx+1' do overlap, but we don't know if + * 'idx+1' is contained in 'idx', or if they overlap only partially. + * So compare the upper bounds and keep the larger one. 
+ */ + r = FunctionCall2Coll(cmp, colloid, + eranges[idx].maxval, + eranges[idx + 1].maxval); + + if (DatumGetBool(r)) + eranges[idx].maxval = eranges[idx + 1].maxval; + + /* + * The range certainly is no longer collapsed (irrespectively of the + * previous state). + */ + eranges[idx].collapsed = false; + + /* + * Now get rid of the (idx+1) range entirely by shifting the remaining + * ranges by 1. There are neranges elements, and we need to move + * elements from (idx+2). That means the number of elements to move is + * [ncranges - (idx+2)]. + */ + memmove(&eranges[idx + 1], &eranges[idx + 2], + (neranges - (idx + 2)) * sizeof(ExpandedRange)); + + /* + * Decrease the number of ranges, and repeat (with the same range, as + * it might overlap with additional ranges thanks to the merge). + */ + neranges--; + } + + return neranges; +} + +/* + * Simple comparator for distance values, comparing the double value. + * This is intentionally sorting the distances in descending order, i.e. + * the longer gaps will be at the front. + */ +static int +compare_distances(const void *a, const void *b) +{ + DistanceValue *da = (DistanceValue *) a; + DistanceValue *db = (DistanceValue *) b; + + if (da->value < db->value) + return 1; + else if (da->value > db->value) + return -1; + + return 0; +} + +/* + * Given an array of expanded ranges, compute size of the gaps between each + * range. For neranges there are (neranges-1) gaps. + * + * We simply call the "distance" function to compute the (max-min) for pairs + * of consecutive ranges. The function may be fairly expensive, so we do that + * just once (and then use it to pick as many ranges to merge as possible). + * + * See reduce_expanded_ranges for details. 
+ */ +static DistanceValue * +build_distances(FmgrInfo *distanceFn, Oid colloid, + ExpandedRange *eranges, int neranges) +{ + int i; + int ndistances; + DistanceValue *distances; + + Assert(neranges >= 2); + + ndistances = (neranges - 1); + distances = (DistanceValue *) palloc0(sizeof(DistanceValue) * ndistances); + + /* + * Walk through the ranges once and compute the distance between the + * ranges so that we can sort them once. + */ + for (i = 0; i < ndistances; i++) + { + Datum a1, + a2, + r; + + a1 = eranges[i].maxval; + a2 = eranges[i + 1].minval; + + /* compute length of the gap (between max/min) */ + r = FunctionCall2Coll(distanceFn, colloid, a1, a2); + + /* remember the index of the gap the distance is for */ + distances[i].index = i; + distances[i].value = DatumGetFloat8(r); + } + + /* + * Sort the distances in descending order, so that the longest gaps are at + * the front. + */ + pg_qsort(distances, ndistances, sizeof(DistanceValue), compare_distances); + + return distances; +} + +/* + * Builds expanded ranges for the existing ranges (and single-point ranges), + * and also the new value (which did not fit into the array). This expanded + * representation makes the processing a bit easier, as it allows handling + * ranges and points the same way. + * + * We sort and deduplicate the expanded ranges - this is necessary, because + * the points may be unsorted. And moreover the two parts (ranges and + * points) are sorted on their own. 
+ */ +static ExpandedRange * +build_expanded_ranges(FmgrInfo *cmp, Oid colloid, Ranges *ranges, + int *nranges) +{ + int neranges; + ExpandedRange *eranges; + + /* both ranges and points are expanded into a separate element */ + neranges = ranges->nranges + ranges->nvalues; + + eranges = (ExpandedRange *) palloc0(neranges * sizeof(ExpandedRange)); + + /* fill the expanded ranges */ + fill_expanded_ranges(eranges, neranges, ranges); + + /* sort and deduplicate the expanded ranges */ + neranges = sort_expanded_ranges(cmp, colloid, eranges, neranges); + + /* remember how many ranges we built */ + *nranges = neranges; + + return eranges; +} + +#ifdef USE_ASSERT_CHECKING +/* + * Counts boundary values needed to store the ranges. Each single-point + * range is stored using a single value, each regular range needs two. + */ +static int +count_values(ExpandedRange *cranges, int ncranges) +{ + int i; + int count; + + count = 0; + for (i = 0; i < ncranges; i++) + { + if (cranges[i].collapsed) + count += 1; + else + count += 2; + } + + return count; +} +#endif + +/* + * reduce_expanded_ranges + * reduce the ranges until the number of values is low enough + * + * Combines ranges until the number of boundary values drops below the + * threshold specified by max_values. This happens by merging enough + * ranges by the distance between them. + * + * Returns the number of result ranges. + * + * We simply use the global min/max and then add boundaries for enough + * largest gaps. Each gap adds 2 values, so we simply use (target/2-1) + * distances. Then we simply sort all the values - each two values are + * a boundary of a range (possibly collapsed). + * + * XXX Some of the ranges may be collapsed (i.e. the min/max values are + * equal), but we ignore that for now. We could repeat the process, + * adding a couple more gaps recursively. + * + * XXX The ranges to merge are selected solely using the distance. 
But + * that may not be the best strategy, for example when multiple gaps + * are of equal (or very similar) length. + * + * Consider for example points 1, 2, 3, .., 64, which have gaps of the + * same length 1 of course. In that case, we tend to pick the first + * gap of that length, which leads to this: + * + * step 1: [1, 2], 3, 4, 5, .., 64 + * step 2: [1, 3], 4, 5, .., 64 + * step 3: [1, 4], 5, .., 64 + * ... + * + * So in the end we'll have one "large" range and multiple small points. + * That may be fine, but it seems a bit strange and non-optimal. Maybe + * we should consider other things when picking ranges to merge - e.g. + * length of the ranges? Or perhaps randomize the choice of ranges, with + * probability inversely proportional to the distance (the gap lengths + * may be very close, but not exactly the same). + * + * XXX Or maybe we could just handle this by using random value as a + * tie-break, or by adding random noise to the actual distance. + */ +static int +reduce_expanded_ranges(ExpandedRange *eranges, int neranges, + DistanceValue *distances, int max_values, + FmgrInfo *cmp, Oid colloid) +{ + int i; + int nvalues; + Datum *values; + + compare_context cxt; + + /* total number of gaps between ranges */ + int ndistances = (neranges - 1); + + /* number of gaps to keep */ + int keep = (max_values / 2 - 1); + + /* + * Maybe we have a sufficiently low number of ranges already? + * + * XXX This should happen before we actually do the expensive stuff like + * sorting, so maybe this should be just an assert. 
+ */ + if (keep >= ndistances) + return neranges; + + /* sort the values */ + cxt.colloid = colloid; + cxt.cmpFn = cmp; + + /* allocate space for the boundary values */ + nvalues = 0; + values = (Datum *) palloc(sizeof(Datum) * max_values); + + /* add the global min/max values, from the first/last range */ + values[nvalues++] = eranges[0].minval; + values[nvalues++] = eranges[neranges - 1].maxval; + + /* add boundary values for enough gaps */ + for (i = 0; i < keep; i++) + { + /* index of the gap between (index) and (index+1) ranges */ + int index = distances[i].index; + + Assert((index >= 0) && ((index + 1) < neranges)); + + /* add max from the preceding range, minval from the next one */ + values[nvalues++] = eranges[index].maxval; + values[nvalues++] = eranges[index + 1].minval; + + Assert(nvalues <= max_values); + } + + /* We should have an even number of range values. */ + Assert(nvalues % 2 == 0); + + /* + * Sort the values using the comparator function, and form ranges from the + * sorted result. + */ + qsort_arg(values, nvalues, sizeof(Datum), + compare_values, (void *) &cxt); + + /* We have nvalues boundary values, which means nvalues/2 ranges. */ + for (i = 0; i < (nvalues / 2); i++) + { + eranges[i].minval = values[2 * i]; + eranges[i].maxval = values[2 * i + 1]; + + /* if the boundary values are the same, it's a collapsed range */ + eranges[i].collapsed = (compare_values(&values[2 * i], + &values[2 * i + 1], + &cxt) == 0); + } + + return (nvalues / 2); +} + +/* + * Store the boundary values from ExpandedRanges back into 'ranges' (using + * only the minimal number of values needed). 
+ */ +static void +store_expanded_ranges(Ranges *ranges, ExpandedRange *eranges, int neranges) +{ + int i; + int idx = 0; + + /* first copy in the regular ranges */ + ranges->nranges = 0; + for (i = 0; i < neranges; i++) + { + if (!eranges[i].collapsed) + { + ranges->values[idx++] = eranges[i].minval; + ranges->values[idx++] = eranges[i].maxval; + ranges->nranges++; + } + } + + /* now copy in the collapsed ones */ + ranges->nvalues = 0; + for (i = 0; i < neranges; i++) + { + if (eranges[i].collapsed) + { + ranges->values[idx++] = eranges[i].minval; + ranges->nvalues++; + } + } + + /* all the values are sorted */ + ranges->nsorted = ranges->nvalues; + + Assert(count_values(eranges, neranges) == 2 * ranges->nranges + ranges->nvalues); + Assert(2 * ranges->nranges + ranges->nvalues <= ranges->maxvalues); +} + + +/* + * Consider freeing space in the ranges. Checks if there's space for at least + * one new value, and performs compaction if needed. + * + * Returns true if the value was actually modified. + */ +static bool +ensure_free_space_in_buffer(BrinDesc *bdesc, Oid colloid, + AttrNumber attno, Form_pg_attribute attr, + Ranges *range) +{ + MemoryContext ctx; + MemoryContext oldctx; + + FmgrInfo *cmpFn, + *distanceFn; + + /* expanded ranges */ + ExpandedRange *eranges; + int neranges; + DistanceValue *distances; + + /* + * If there is free space in the buffer, we're done without having to + * modify anything. + */ + if (2 * range->nranges + range->nvalues < range->maxvalues) + return false; + + /* we'll certainly need the comparator, so just look it up now */ + cmpFn = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTLessStrategyNumber); + + /* deduplicate values, if there's an unsorted part */ + range_deduplicate_values(range); + + /* + * Did we reduce enough free space by just the deduplication? + * + * We don't simply check against range->maxvalues again. The deduplication + * might have freed very little space (e.g. 
just one value), forcing us to + * do deduplication very often. In that case, it's better to do the + * compaction and reduce more space. + */ + if (2 * range->nranges + range->nvalues <= range->maxvalues * MINMAX_BUFFER_LOAD_FACTOR) + return true; + + /* + * We need to combine some of the existing ranges, to reduce the number of + * values we have to store. + * + * The distanceFn calls (which may internally call e.g. numeric_le) may + * allocate quite a bit of memory, and we must not leak it (we might have + * to do this repeatedly, even for a single BRIN page range). Otherwise + * we'd have problems e.g. when building new indexes. So we use a memory + * context and make sure we free the memory at the end (so if we call the + * distance function many times, it might be an issue, but meh). + */ + ctx = AllocSetContextCreate(CurrentMemoryContext, + "minmax-multi context", + ALLOCSET_DEFAULT_SIZES); + + oldctx = MemoryContextSwitchTo(ctx); + + /* build the expanded ranges */ + eranges = build_expanded_ranges(cmpFn, colloid, range, &neranges); + + /* and we'll also need the 'distance' procedure */ + distanceFn = minmax_multi_get_procinfo(bdesc, attno, PROCNUM_DISTANCE); + + /* build array of gap distances and sort them in ascending order */ + distances = build_distances(distanceFn, colloid, eranges, neranges); + + /* + * Combine ranges until we release at least 50% of the space. This + * threshold is somewhat arbitrary, perhaps needs tuning. We must not use + * too low or high value. + */ + neranges = reduce_expanded_ranges(eranges, neranges, distances, + range->maxvalues * MINMAX_BUFFER_LOAD_FACTOR, + cmpFn, colloid); + + /* Make sure we've sufficiently reduced the number of ranges. 
*/ + Assert(count_values(eranges, neranges) <= range->maxvalues * MINMAX_BUFFER_LOAD_FACTOR); + + /* decompose the expanded ranges into regular ranges and single values */ + store_expanded_ranges(range, eranges, neranges); + + MemoryContextSwitchTo(oldctx); + MemoryContextDelete(ctx); + + /* Did we break the ranges somehow? */ + AssertCheckRanges(range, cmpFn, colloid); + + return true; +} + +/* + * range_add_value + * Add the new value to the minmax-multi range. + */ +static bool +range_add_value(BrinDesc *bdesc, Oid colloid, + AttrNumber attno, Form_pg_attribute attr, + Ranges *ranges, Datum newval) +{ + FmgrInfo *cmpFn; + bool modified = false; + + /* we'll certainly need the comparator, so just look it up now */ + cmpFn = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTLessStrategyNumber); + + /* comprehensive checks of the input ranges */ + AssertCheckRanges(ranges, cmpFn, colloid); + + /* + * Make sure there's enough free space in the buffer. We only trigger this + * when the buffer is full, which means it had to be modified as we size + * it to be larger than what is stored on disk. + * + * This needs to happen before we check if the value is contained in the + * range, because the value might be in the unsorted part, and we don't + * check that in range_contains_value. The deduplication would then move + * it to the sorted part, and we'd add the value too, which violates the + * rule that we never have duplicates with the ranges or sorted values. + * + * We might also deduplicate and recheck if the value is contained, but + * that seems like overkill. We'd need to deduplicate anyway, so why not + * do it now. + */ + modified = ensure_free_space_in_buffer(bdesc, colloid, + attno, attr, ranges); + + /* + * Bail out if the value already is covered by the range. + * + * We could also add values until we hit values_per_range, and then do the + * deduplication in a batch, hoping for better efficiency. 
But that would + * mean we actually modify the range every time, which means having to + * serialize the value, which does palloc, walks the values, copies them, + * etc. Not exactly cheap. + * + * So instead we do the check, which should be fairly cheap - assuming the + * comparator function is not very expensive. + * + * This also implies the values array can't contain duplicate values. + */ + if (range_contains_value(bdesc, colloid, attno, attr, ranges, newval, false)) + return modified; + + /* Make a copy of the value, if needed. */ + newval = datumCopy(newval, attr->attbyval, attr->attlen); + + /* + * If there's space in the values array, copy it in and we're done. + * + * We do want to keep the values sorted (to speed up searches), so we do a + * simple insertion sort. We could do something more elaborate, e.g. by + * sorting the values only now and then, but for small counts (e.g. when + * maxvalues is 64) this should be fine. + */ + ranges->values[2 * ranges->nranges + ranges->nvalues] = newval; + ranges->nvalues++; + + /* If we added the first value, we can consider it as sorted. */ + if (ranges->nvalues == 1) + ranges->nsorted = 1; + + /* + * Check we haven't broken the ordering of boundary values (checks both + * parts, but that doesn't hurt). + */ + AssertCheckRanges(ranges, cmpFn, colloid); + + /* Check the range contains the value we just added. */ + Assert(range_contains_value(bdesc, colloid, attno, attr, ranges, newval, true)); + + /* yep, we've modified the range */ + return true; +} + +/* + * Generate range representation of data collected during "batch mode". + * This is similar to reduce_expanded_ranges, except that we can't assume + * the values are sorted and there may be duplicate values. 
+ */ +static void +compactify_ranges(BrinDesc *bdesc, Ranges *ranges, int max_values) +{ + FmgrInfo *cmpFn, + *distanceFn; + + /* expanded ranges */ + ExpandedRange *eranges; + int neranges; + DistanceValue *distances; + + MemoryContext ctx; + MemoryContext oldctx; + + /* + * Do we need to actually compactify anything? + * + * There are two reasons why compaction may be needed - firstly, there may + * be too many values, or some of the values may be unsorted. + */ + if ((ranges->nranges * 2 + ranges->nvalues <= max_values) && + (ranges->nsorted == ranges->nvalues)) + return; + + /* we'll certainly need the comparator, so just look it up now */ + cmpFn = minmax_multi_get_strategy_procinfo(bdesc, ranges->attno, ranges->typid, + BTLessStrategyNumber); + + /* and we'll also need the 'distance' procedure */ + distanceFn = minmax_multi_get_procinfo(bdesc, ranges->attno, PROCNUM_DISTANCE); + + /* + * The distanceFn calls (which may internally call e.g. numeric_le) may + * allocate quite a bit of memory, and we must not leak it. Otherwise, + * we'd have problems e.g. when building indexes. So we create a local + * memory context and make sure we free the memory before leaving this + * function (not after every call). + */ + ctx = AllocSetContextCreate(CurrentMemoryContext, + "minmax-multi context", + ALLOCSET_DEFAULT_SIZES); + + oldctx = MemoryContextSwitchTo(ctx); + + /* build the expanded ranges */ + eranges = build_expanded_ranges(cmpFn, ranges->colloid, ranges, &neranges); + + /* build array of gap distances and sort them in ascending order */ + distances = build_distances(distanceFn, ranges->colloid, + eranges, neranges); + + /* + * Combine ranges until we get below max_values. We don't use any scale + * factor, because this is used during serialization, and we don't expect + * more tuples to be inserted anytime soon. 
+ */ + neranges = reduce_expanded_ranges(eranges, neranges, distances, + max_values, cmpFn, ranges->colloid); + + Assert(count_values(eranges, neranges) <= max_values); + + /* transform back into regular ranges and single values */ + store_expanded_ranges(ranges, eranges, neranges); + + /* check all the range invariants */ + AssertCheckRanges(ranges, cmpFn, ranges->colloid); + + MemoryContextSwitchTo(oldctx); + MemoryContextDelete(ctx); +} + +Datum +brin_minmax_multi_opcinfo(PG_FUNCTION_ARGS) +{ + BrinOpcInfo *result; + + /* + * opaque->strategy_procinfos is initialized lazily; here it is set to + * all-uninitialized by palloc0 which sets fn_oid to InvalidOid. + */ + + result = palloc0(MAXALIGN(SizeofBrinOpcInfo(1)) + + sizeof(MinmaxMultiOpaque)); + result->oi_nstored = 1; + result->oi_regular_nulls = true; + result->oi_opaque = (MinmaxMultiOpaque *) + MAXALIGN((char *) result + SizeofBrinOpcInfo(1)); + result->oi_typcache[0] = lookup_type_cache(PG_BRIN_MINMAX_MULTI_SUMMARYOID, 0); + + PG_RETURN_POINTER(result); +} + +/* + * Compute the distance between two float4 values (plain subtraction). + */ +Datum +brin_minmax_multi_distance_float4(PG_FUNCTION_ARGS) +{ + float a1 = PG_GETARG_FLOAT4(0); + float a2 = PG_GETARG_FLOAT4(1); + + /* if both values are NaN, then we consider them the same */ + if (isnan(a1) && isnan(a2)) + PG_RETURN_FLOAT8(0.0); + + /* if one value is NaN, use infinite distance */ + if (isnan(a1) || isnan(a2)) + PG_RETURN_FLOAT8(get_float8_infinity()); + + /* + * We know the values are range boundaries, but the range may be collapsed + * (i.e. single points), with equal values. + */ + Assert(a1 <= a2); + + PG_RETURN_FLOAT8((double) a2 - (double) a1); +} + +/* + * Compute the distance between two float8 values (plain subtraction). 
+ */ +Datum +brin_minmax_multi_distance_float8(PG_FUNCTION_ARGS) +{ + double a1 = PG_GETARG_FLOAT8(0); + double a2 = PG_GETARG_FLOAT8(1); + + /* if both values are NaN, then we consider them the same */ + if (isnan(a1) && isnan(a2)) + PG_RETURN_FLOAT8(0.0); + + /* if one value is NaN, use infinite distance */ + if (isnan(a1) || isnan(a2)) + PG_RETURN_FLOAT8(get_float8_infinity()); + + /* + * We know the values are range boundaries, but the range may be collapsed + * (i.e. single points), with equal values. + */ + Assert(a1 <= a2); + + PG_RETURN_FLOAT8(a2 - a1); +} + +/* + * Compute the distance between two int2 values (plain subtraction). + */ +Datum +brin_minmax_multi_distance_int2(PG_FUNCTION_ARGS) +{ + int16 a1 = PG_GETARG_INT16(0); + int16 a2 = PG_GETARG_INT16(1); + + /* + * We know the values are range boundaries, but the range may be collapsed + * (i.e. single points), with equal values. + */ + Assert(a1 <= a2); + + PG_RETURN_FLOAT8((double) a2 - (double) a1); +} + +/* + * Compute the distance between two int4 values (plain subtraction). + */ +Datum +brin_minmax_multi_distance_int4(PG_FUNCTION_ARGS) +{ + int32 a1 = PG_GETARG_INT32(0); + int32 a2 = PG_GETARG_INT32(1); + + /* + * We know the values are range boundaries, but the range may be collapsed + * (i.e. single points), with equal values. + */ + Assert(a1 <= a2); + + PG_RETURN_FLOAT8((double) a2 - (double) a1); +} + +/* + * Compute the distance between two int8 values (plain subtraction). + */ +Datum +brin_minmax_multi_distance_int8(PG_FUNCTION_ARGS) +{ + int64 a1 = PG_GETARG_INT64(0); + int64 a2 = PG_GETARG_INT64(1); + + /* + * We know the values are range boundaries, but the range may be collapsed + * (i.e. single points), with equal values. + */ + Assert(a1 <= a2); + + PG_RETURN_FLOAT8((double) a2 - (double) a1); +} + +/* + * Compute the distance between two tid values (by mapping them to float8 and + * then subtracting them). 
+ */ +Datum +brin_minmax_multi_distance_tid(PG_FUNCTION_ARGS) +{ + double da1, + da2; + + ItemPointer pa1 = (ItemPointer) PG_GETARG_DATUM(0); + ItemPointer pa2 = (ItemPointer) PG_GETARG_DATUM(1); + + /* + * We know the values are range boundaries, but the range may be collapsed + * (i.e. single points), with equal values. + */ + Assert(ItemPointerCompare(pa1, pa2) <= 0); + + /* + * We use the no-check variants here, because user-supplied values may + * have (ip_posid == 0). See ItemPointerCompare. + */ + da1 = ItemPointerGetBlockNumberNoCheck(pa1) * MaxHeapTuplesPerPage + + ItemPointerGetOffsetNumberNoCheck(pa1); + + da2 = ItemPointerGetBlockNumberNoCheck(pa2) * MaxHeapTuplesPerPage + + ItemPointerGetOffsetNumberNoCheck(pa2); + + PG_RETURN_FLOAT8(da2 - da1); +} + +/* + * Compute the distance between two numeric values (plain subtraction). + */ +Datum +brin_minmax_multi_distance_numeric(PG_FUNCTION_ARGS) +{ + Datum d; + Datum a1 = PG_GETARG_DATUM(0); + Datum a2 = PG_GETARG_DATUM(1); + + /* + * We know the values are range boundaries, but the range may be collapsed + * (i.e. single points), with equal values. + */ + Assert(DatumGetBool(DirectFunctionCall2(numeric_le, a1, a2))); + + d = DirectFunctionCall2(numeric_sub, a2, a1); /* a2 - a1 */ + + PG_RETURN_FLOAT8(DirectFunctionCall1(numeric_float8, d)); +} + +/* + * Compute the approximate distance between two UUID values. + * + * XXX We do not need a perfectly accurate value, so we approximate the + * deltas (which would have to be 128-bit integers) with a 64-bit float. + * The small inaccuracies do not matter in practice, in the worst case + * we'll decide to merge ranges that are not the closest ones. 
+ */ +Datum +brin_minmax_multi_distance_uuid(PG_FUNCTION_ARGS) +{ + int i; + float8 delta = 0; + + Datum a1 = PG_GETARG_DATUM(0); + Datum a2 = PG_GETARG_DATUM(1); + + pg_uuid_t *u1 = DatumGetUUIDP(a1); + pg_uuid_t *u2 = DatumGetUUIDP(a2); + + /* + * We know the values are range boundaries, but the range may be collapsed + * (i.e. single points), with equal values. + */ + Assert(DatumGetBool(DirectFunctionCall2(uuid_le, a1, a2))); + + /* compute approximate delta as a double precision value */ + for (i = UUID_LEN - 1; i >= 0; i--) + { + delta += (int) u2->data[i] - (int) u1->data[i]; + delta /= 256; + } + + Assert(delta >= 0); + + PG_RETURN_FLOAT8(delta); +} + +/* + * Compute the approximate distance between two dates. + */ +Datum +brin_minmax_multi_distance_date(PG_FUNCTION_ARGS) +{ + DateADT dateVal1 = PG_GETARG_DATEADT(0); + DateADT dateVal2 = PG_GETARG_DATEADT(1); + + if (DATE_NOT_FINITE(dateVal1) || DATE_NOT_FINITE(dateVal2)) + PG_RETURN_FLOAT8(0); + + PG_RETURN_FLOAT8(dateVal1 - dateVal2); +} + +/* + * Compute the approximate distance between two time (without tz) values. + * + * TimeADT is just an int64, so we simply subtract the values directly. + */ +Datum +brin_minmax_multi_distance_time(PG_FUNCTION_ARGS) +{ + float8 delta = 0; + + TimeADT ta = PG_GETARG_TIMEADT(0); + TimeADT tb = PG_GETARG_TIMEADT(1); + + delta = (tb - ta); + + Assert(delta >= 0); + + PG_RETURN_FLOAT8(delta); +} + +/* + * Compute the approximate distance between two timetz values. + * + * Simply subtracts the TimeADT (int64) values embedded in TimeTzADT. + */ +Datum +brin_minmax_multi_distance_timetz(PG_FUNCTION_ARGS) +{ + float8 delta = 0; + + TimeTzADT *ta = PG_GETARG_TIMETZADT_P(0); + TimeTzADT *tb = PG_GETARG_TIMETZADT_P(1); + + delta = (tb->time - ta->time) + (tb->zone - ta->zone) * USECS_PER_SEC; + + Assert(delta >= 0); + + PG_RETURN_FLOAT8(delta); +} + +/* + * Compute the distance between two timestamp values. 
+ */ +Datum +brin_minmax_multi_distance_timestamp(PG_FUNCTION_ARGS) +{ + float8 delta = 0; + + Timestamp dt1 = PG_GETARG_TIMESTAMP(0); + Timestamp dt2 = PG_GETARG_TIMESTAMP(1); + + if (TIMESTAMP_NOT_FINITE(dt1) || TIMESTAMP_NOT_FINITE(dt2)) + PG_RETURN_FLOAT8(0); + + delta = dt2 - dt1; + + Assert(delta >= 0); + + PG_RETURN_FLOAT8(delta); +} + +/* + * Compute the distance between two interval values. + */ +Datum +brin_minmax_multi_distance_interval(PG_FUNCTION_ARGS) +{ + float8 delta = 0; + + Interval *ia = PG_GETARG_INTERVAL_P(0); + Interval *ib = PG_GETARG_INTERVAL_P(1); + Interval *result; + + int64 dayfraction; + int64 days; + + result = (Interval *) palloc(sizeof(Interval)); + + result->month = ib->month - ia->month; + /* overflow check copied from int4mi */ + if (!SAMESIGN(ib->month, ia->month) && + !SAMESIGN(result->month, ib->month)) + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("interval out of range"))); + + result->day = ib->day - ia->day; + if (!SAMESIGN(ib->day, ia->day) && + !SAMESIGN(result->day, ib->day)) + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("interval out of range"))); + + result->time = ib->time - ia->time; + if (!SAMESIGN(ib->time, ia->time) && + !SAMESIGN(result->time, ib->time)) + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("interval out of range"))); + + /* + * Delta is (fractional) number of days between the intervals. Assume + * months have 30 days for consistency with interval_cmp_internal. We + * don't need to be exact, in the worst case we'll build a bit less + * efficient ranges. But we should not contradict interval_cmp. 
+ */ + dayfraction = result->time % USECS_PER_DAY; + days = result->time / USECS_PER_DAY; + days += result->month * INT64CONST(30); + days += result->day; + + /* convert to double precision */ + delta = (double) days + dayfraction / (double) USECS_PER_DAY; + + Assert(delta >= 0); + + PG_RETURN_FLOAT8(delta); +} + +/* + * Compute the distance between two pg_lsn values. + * + * LSN is just an int64 encoding position in the stream, so just subtract + * those int64 values directly. + */ +Datum +brin_minmax_multi_distance_pg_lsn(PG_FUNCTION_ARGS) +{ + float8 delta = 0; + + XLogRecPtr lsna = PG_GETARG_LSN(0); + XLogRecPtr lsnb = PG_GETARG_LSN(1); + + delta = (lsnb - lsna); + + Assert(delta >= 0); + + PG_RETURN_FLOAT8(delta); +} + +/* + * Compute the distance between two macaddr values. + * + * mac addresses are treated as 6 unsigned chars, so do the same thing we + * already do for UUID values. + */ +Datum +brin_minmax_multi_distance_macaddr(PG_FUNCTION_ARGS) +{ + float8 delta; + + macaddr *a = PG_GETARG_MACADDR_P(0); + macaddr *b = PG_GETARG_MACADDR_P(1); + + delta = ((float8) b->f - (float8) a->f); + delta /= 256; + + delta += ((float8) b->e - (float8) a->e); + delta /= 256; + + delta += ((float8) b->d - (float8) a->d); + delta /= 256; + + delta += ((float8) b->c - (float8) a->c); + delta /= 256; + + delta += ((float8) b->b - (float8) a->b); + delta /= 256; + + delta += ((float8) b->a - (float8) a->a); + delta /= 256; + + Assert(delta >= 0); + + PG_RETURN_FLOAT8(delta); +} + +/* + * Compute the distance between two macaddr8 values. + * + * macaddr8 addresses are 8 unsigned chars, so do the same thing we + * already do for UUID values. 
+ */ +Datum +brin_minmax_multi_distance_macaddr8(PG_FUNCTION_ARGS) +{ + float8 delta; + + macaddr8 *a = PG_GETARG_MACADDR8_P(0); + macaddr8 *b = PG_GETARG_MACADDR8_P(1); + + delta = ((float8) b->h - (float8) a->h); + delta /= 256; + + delta += ((float8) b->g - (float8) a->g); + delta /= 256; + + delta += ((float8) b->f - (float8) a->f); + delta /= 256; + + delta += ((float8) b->e - (float8) a->e); + delta /= 256; + + delta += ((float8) b->d - (float8) a->d); + delta /= 256; + + delta += ((float8) b->c - (float8) a->c); + delta /= 256; + + delta += ((float8) b->b - (float8) a->b); + delta /= 256; + + delta += ((float8) b->a - (float8) a->a); + delta /= 256; + + Assert(delta >= 0); + + PG_RETURN_FLOAT8(delta); +} + +/* + * Compute the distance between two inet values. + * + * The distance is defined as the difference between 32-bit/128-bit values, + * depending on the IP version. The distance is computed by subtracting + * the bytes and normalizing it to [0,1] range for each IP family. + * Addresses from different families are considered to be in maximum + * distance, which is 1.0. + * + * XXX Does this need to consider the mask (bits)? For now, it's ignored. + */ +Datum +brin_minmax_multi_distance_inet(PG_FUNCTION_ARGS) +{ + float8 delta; + int i; + int len; + unsigned char *addra, + *addrb; + + inet *ipa = PG_GETARG_INET_PP(0); + inet *ipb = PG_GETARG_INET_PP(1); + + int lena, + lenb; + + /* + * If the addresses are from different families, consider them to be in + * maximal possible distance (which is 1.0). 
+ */ + if (ip_family(ipa) != ip_family(ipb)) + PG_RETURN_FLOAT8(1.0); + + addra = (unsigned char *) palloc(ip_addrsize(ipa)); + memcpy(addra, ip_addr(ipa), ip_addrsize(ipa)); + + addrb = (unsigned char *) palloc(ip_addrsize(ipb)); + memcpy(addrb, ip_addr(ipb), ip_addrsize(ipb)); + + /* + * The length is calculated from the mask length, because we sort the + * addresses by first address in the range, so A.B.C.D/24 < A.B.C.1 (the + * first range starts at A.B.C.0, which is before A.B.C.1). We don't want + * to produce a negative delta in this case, so we just cut the extra + * bytes. + * + * XXX Maybe this should be a bit more careful and cut the bits, not just + * whole bytes. + */ + lena = ip_bits(ipa); + lenb = ip_bits(ipb); + + len = ip_addrsize(ipa); + + /* apply the network mask to both addresses */ + for (i = 0; i < len; i++) + { + unsigned char mask; + int nbits; + + nbits = lena - (i * 8); + if (nbits < 8) + { + mask = (0xFF << (8 - nbits)); + addra[i] = (addra[i] & mask); + } + + nbits = lenb - (i * 8); + if (nbits < 8) + { + mask = (0xFF << (8 - nbits)); + addrb[i] = (addrb[i] & mask); + } + } + + /* Calculate the difference between the addresses. */ + delta = 0; + for (i = len - 1; i >= 0; i--) + { + unsigned char a = addra[i]; + unsigned char b = addrb[i]; + + delta += (float8) b - (float8) a; + delta /= 256; + } + + Assert((delta >= 0) && (delta <= 1)); + + pfree(addra); + pfree(addrb); + + PG_RETURN_FLOAT8(delta); +} + +static void +brin_minmax_multi_serialize(BrinDesc *bdesc, Datum src, Datum *dst) +{ + Ranges *ranges = (Ranges *) DatumGetPointer(src); + SerializedRanges *s; + + /* + * In batch mode, we need to compress the accumulated values to the + * actually requested number of values/ranges. + */ + compactify_ranges(bdesc, ranges, ranges->target_maxvalues); + + /* At this point everything has to be fully sorted. 
*/ + Assert(ranges->nsorted == ranges->nvalues); + + s = range_serialize(ranges); + dst[0] = PointerGetDatum(s); +} + +static int +brin_minmax_multi_get_values(BrinDesc *bdesc, MinMaxMultiOptions *opts) +{ + return MinMaxMultiGetValuesPerRange(opts); +} + +/* + * Examine the given index tuple (which contains the partial status of a + * certain page range) by comparing it to the given value that comes from + * another heap tuple. If the new value is outside the min/max range + * specified by the existing tuple values, update the index tuple and return + * true. Otherwise, return false and do not modify in this case. + */ +Datum +brin_minmax_multi_add_value(PG_FUNCTION_ARGS) +{ + BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); + BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); + Datum newval = PG_GETARG_DATUM(2); + bool isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_DATUM(3); + MinMaxMultiOptions *opts = (MinMaxMultiOptions *) PG_GET_OPCLASS_OPTIONS(); + Oid colloid = PG_GET_COLLATION(); + bool modified = false; + Form_pg_attribute attr; + AttrNumber attno; + Ranges *ranges; + SerializedRanges *serialized = NULL; + + Assert(!isnull); + + attno = column->bv_attno; + attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1); + + /* use the already deserialized value, if possible */ + ranges = (Ranges *) DatumGetPointer(column->bv_mem_value); + + /* + * If this is the first non-null value, we need to initialize the range + * list. Otherwise, just extract the existing range list from BrinValues. + * + * When starting with an empty range, we assume this is a batch mode and + * we use a larger buffer. The buffer size is derived from the BRIN range + * size, number of rows per page, with some sensible min/max values. A + * small buffer would be bad for performance, but a large buffer might + * require a lot of memory (because of keeping all the values). 
+ */ + if (column->bv_allnulls) + { + MemoryContext oldctx; + + int target_maxvalues; + int maxvalues; + BlockNumber pagesPerRange = BrinGetPagesPerRange(bdesc->bd_index); + + /* what was specified as a reloption? */ + target_maxvalues = brin_minmax_multi_get_values(bdesc, opts); + + /* + * Determine the insert buffer size - we use 10x the target, capped to + * the maximum number of values in the heap range. This is more than + * enough, considering the actual number of rows per page is likely + * much lower, but meh. + */ + maxvalues = Min(target_maxvalues * MINMAX_BUFFER_FACTOR, + MaxHeapTuplesPerPage * pagesPerRange); + + /* but always at least the original value */ + maxvalues = Max(maxvalues, target_maxvalues); + + /* always cap by MIN/MAX */ + maxvalues = Max(maxvalues, MINMAX_BUFFER_MIN); + maxvalues = Min(maxvalues, MINMAX_BUFFER_MAX); + + oldctx = MemoryContextSwitchTo(column->bv_context); + ranges = minmax_multi_init(maxvalues); + ranges->attno = attno; + ranges->colloid = colloid; + ranges->typid = attr->atttypid; + ranges->target_maxvalues = target_maxvalues; + + /* we'll certainly need the comparator, so just look it up now */ + ranges->cmp = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTLessStrategyNumber); + + MemoryContextSwitchTo(oldctx); + + column->bv_allnulls = false; + modified = true; + + column->bv_mem_value = PointerGetDatum(ranges); + column->bv_serialize = brin_minmax_multi_serialize; + } + else if (!ranges) + { + MemoryContext oldctx; + + int maxvalues; + BlockNumber pagesPerRange = BrinGetPagesPerRange(bdesc->bd_index); + + oldctx = MemoryContextSwitchTo(column->bv_context); + + serialized = (SerializedRanges *) PG_DETOAST_DATUM(column->bv_values[0]); + + /* + * Determine the insert buffer size - we use 10x the target, capped to + * the maximum number of values in the heap range. This is more than + * enough, considering the actual number of rows per page is likely + * much lower, but meh. 
+ */ + maxvalues = Min(serialized->maxvalues * MINMAX_BUFFER_FACTOR, + MaxHeapTuplesPerPage * pagesPerRange); + + /* but always at least the original value */ + maxvalues = Max(maxvalues, serialized->maxvalues); + + /* always cap by MIN/MAX */ + maxvalues = Max(maxvalues, MINMAX_BUFFER_MIN); + maxvalues = Min(maxvalues, MINMAX_BUFFER_MAX); + + ranges = range_deserialize(maxvalues, serialized); + + ranges->attno = attno; + ranges->colloid = colloid; + ranges->typid = attr->atttypid; + + /* we'll certainly need the comparator, so just look it up now */ + ranges->cmp = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid, + BTLessStrategyNumber); + + column->bv_mem_value = PointerGetDatum(ranges); + column->bv_serialize = brin_minmax_multi_serialize; + + MemoryContextSwitchTo(oldctx); + } + + /* + * Try to add the new value to the range. We need to update the modified + * flag, so that we serialize the updated summary later. + */ + modified |= range_add_value(bdesc, colloid, attno, attr, ranges, newval); + + + PG_RETURN_BOOL(modified); +} + +/* + * Given an index tuple corresponding to a certain page range and a scan key, + * return whether the scan key is consistent with the index tuple's min/max + * values. Return true if so, false otherwise. 
+ */
+Datum
+brin_minmax_multi_consistent(PG_FUNCTION_ARGS)
+{
+	BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+	BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
+	ScanKey    *keys = (ScanKey *) PG_GETARG_POINTER(2);
+	int			nkeys = PG_GETARG_INT32(3);
+
+	Oid			colloid = PG_GET_COLLATION(),
+				subtype;
+	AttrNumber	attno;
+	Datum		value;
+	FmgrInfo   *finfo;
+	SerializedRanges *serialized;
+	Ranges	   *ranges;
+	int			keyno;
+	int			rangeno;
+	int			i;
+
+	attno = column->bv_attno;
+
+	serialized = (SerializedRanges *) PG_DETOAST_DATUM(column->bv_values[0]);
+	ranges = range_deserialize(serialized->maxvalues, serialized);
+
+	/*
+	 * First pass: the [min,max] ranges, stored as consecutive pairs at the
+	 * start of ranges->values.  The summary is consistent as soon as a
+	 * single range satisfies every scan key.
+	 */
+	for (rangeno = 0; rangeno < ranges->nranges; rangeno++)
+	{
+		Datum		minval = ranges->values[2 * rangeno];
+		Datum		maxval = ranges->values[2 * rangeno + 1];
+
+		/* assume the range is matching, and we'll try to prove otherwise */
+		bool		matching = true;
+
+		for (keyno = 0; keyno < nkeys; keyno++)
+		{
+			Datum		matches;
+			ScanKey		key = keys[keyno];
+
+			/* NULL keys are handled and filtered-out in bringetbitmap */
+			Assert(!(key->sk_flags & SK_ISNULL));
+
+			attno = key->sk_attno;
+			subtype = key->sk_subtype;
+			value = key->sk_argument;
+			switch (key->sk_strategy)
+			{
+				case BTLessStrategyNumber:
+				case BTLessEqualStrategyNumber:
+					finfo = minmax_multi_get_strategy_procinfo(bdesc, attno, subtype,
+															   key->sk_strategy);
+					/* "<" and "<=" only need the lower bound of the range */
+					matches = FunctionCall2Coll(finfo, colloid, minval, value);
+					break;
+
+				case BTEqualStrategyNumber:
+					{
+						FmgrInfo   *cmpFn;
+
+						/* by default this range does not match */
+						matches = BoolGetDatum(false);
+
+						/*
+						 * The key can only be in this range if it lies
+						 * between the range's boundaries.  First check
+						 * whether the lower bound is greater than the key.
+						 */
+						cmpFn = minmax_multi_get_strategy_procinfo(bdesc, attno, subtype,
+																   BTGreaterStrategyNumber);
+
+						/* key is smaller than the smallest value in this range */
+						if (DatumGetBool(FunctionCall2Coll(cmpFn, colloid, minval, value)))
+							break;
+
+						cmpFn = minmax_multi_get_strategy_procinfo(bdesc, attno, subtype,
+																   BTLessStrategyNumber);
+
+						/* key is larger than the largest value in this range */
+						if (DatumGetBool(FunctionCall2Coll(cmpFn, colloid, maxval, value)))
+							break;
+
+						/*
+						 * We haven't managed to eliminate this range, so
+						 * consider it matching.
+						 */
+						matches = BoolGetDatum(true);
+
+						break;
+					}
+				case BTGreaterEqualStrategyNumber:
+				case BTGreaterStrategyNumber:
+					finfo = minmax_multi_get_strategy_procinfo(bdesc, attno, subtype,
+															   key->sk_strategy);
+					/* ">" and ">=" only need the upper bound of the range */
+					matches = FunctionCall2Coll(finfo, colloid, maxval, value);
+					break;
+
+				default:
+					/* shouldn't happen */
+					elog(ERROR, "invalid strategy number %d", key->sk_strategy);
+					matches = BoolGetDatum(false);	/* keep compiler quiet */
+					break;
+			}
+
+			/* the range has to match all the scan keys */
+			matching &= DatumGetBool(matches);
+
+			/* once we find a non-matching key, we're done */
+			if (!matching)
+				break;
+		}
+
+		/*
+		 * have we found a range matching all scan keys? if yes, we're done
+		 */
+		if (matching)
+			PG_RETURN_BOOL(true);
+	}
+
+	/*
+	 * And now inspect the values.  We don't bother with doing a binary search
+	 * here, because we're dealing with serialized / fully compacted ranges,
+	 * so there should be only very few values.  The individual values are
+	 * stored right after the 2 * nranges range boundaries.
+	 */
+	for (i = 0; i < ranges->nvalues; i++)
+	{
+		Datum		val = ranges->values[2 * ranges->nranges + i];
+
+		/* assume the value is matching, and we'll try to prove otherwise */
+		bool		matching = true;
+
+		for (keyno = 0; keyno < nkeys; keyno++)
+		{
+			Datum		matches;
+			ScanKey		key = keys[keyno];
+
+			/*
+			 * NULL keys are not expected here (the loop over ranges above
+			 * asserts that); skipping one is purely defensive.
+			 */
+			if (key->sk_flags & SK_ISNULL)
+				continue;
+
+			attno = key->sk_attno;
+			subtype = key->sk_subtype;
+			value = key->sk_argument;
+			switch (key->sk_strategy)
+			{
+				case BTLessStrategyNumber:
+				case BTLessEqualStrategyNumber:
+				case BTEqualStrategyNumber:
+				case BTGreaterEqualStrategyNumber:
+				case BTGreaterStrategyNumber:
+
+					/* a single value is compared directly with the operator */
+					finfo = minmax_multi_get_strategy_procinfo(bdesc, attno, subtype,
+															   key->sk_strategy);
+					matches = FunctionCall2Coll(finfo, colloid, val, value);
+					break;
+
+				default:
+					/* shouldn't happen */
+					elog(ERROR, "invalid strategy number %d", key->sk_strategy);
+					matches = BoolGetDatum(false);	/* keep compiler quiet */
+					break;
+			}
+
+			/* the value has to match all the scan keys */
+			matching &= DatumGetBool(matches);
+
+			/* once we find a non-matching key, we're done */
+			if (!matching)
+				break;
+		}
+
+		/* have we found a value matching all scan keys? if yes, we're done */
+		if (matching)
+			PG_RETURN_BOOL(true);
+	}
+
+	/* neither a range nor a single value satisfied all the scan keys */
+	PG_RETURN_BOOL(false);
+}
+
+/*
+ * Given two BrinValues, update the first of them as a union of the summary
+ * values contained in both.  The second one is untouched.
+ */
+Datum
+brin_minmax_multi_union(PG_FUNCTION_ARGS)
+{
+	BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+	BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1);
+	BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2);
+
+	Oid			colloid = PG_GET_COLLATION();
+	AttrNumber	attno;
+	Form_pg_attribute attr;
+	SerializedRanges *serialized_a;
+	SerializedRanges *serialized_b;
+	Ranges	   *ranges_a;
+	Ranges	   *ranges_b;
+	int			nitems_a;
+	int			nitems_b;
+	int			neranges;
+	ExpandedRange *eranges;
+	DistanceValue *distances;
+	FmgrInfo   *cmpFn;
+	FmgrInfo   *distanceFn;
+	MemoryContext workctx;
+	MemoryContext callerctx;
+
+	Assert(col_a->bv_attno == col_b->bv_attno);
+	Assert(!col_a->bv_allnulls && !col_b->bv_allnulls);
+
+	attno = col_a->bv_attno;
+	attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1);
+
+	/* turn both summaries into their in-memory representation */
+	serialized_a = (SerializedRanges *) PG_DETOAST_DATUM(col_a->bv_values[0]);
+	serialized_b = (SerializedRanges *) PG_DETOAST_DATUM(col_b->bv_values[0]);
+
+	ranges_a = range_deserialize(serialized_a->maxvalues, serialized_a);
+	ranges_b = range_deserialize(serialized_b->maxvalues, serialized_b);
+
+	/* make sure neither of the ranges is NULL */
+	Assert(ranges_a && ranges_b);
+
+	/* every range and every single value becomes one expanded range */
+	nitems_a = ranges_a->nranges + ranges_a->nvalues;
+	nitems_b = ranges_b->nranges + ranges_b->nvalues;
+	neranges = nitems_a + nitems_b;
+
+	/*
+	 * The distanceFn calls (which may internally call e.g. numeric_le) may
+	 * allocate quite a bit of memory, and we must not leak it.  Otherwise,
+	 * we'd have problems e.g. when building indexes.  So all the work below
+	 * happens in a private memory context that is deleted in one go before
+	 * leaving this function (not after every call).
+	 */
+	workctx = AllocSetContextCreate(CurrentMemoryContext,
+									"minmax-multi context",
+									ALLOCSET_DEFAULT_SIZES);
+
+	callerctx = MemoryContextSwitchTo(workctx);
+
+	/* one zeroed array holding the expanded ranges of both inputs */
+	eranges = (ExpandedRange *) palloc0(neranges * sizeof(ExpandedRange));
+
+	/* entries of the first summary go to the front of the array ... */
+	fill_expanded_ranges(eranges, nitems_a, ranges_a);
+
+	/* ... and entries of the second summary right behind them */
+	fill_expanded_ranges(&eranges[nitems_a], nitems_b, ranges_b);
+
+	cmpFn = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid,
+											   BTLessStrategyNumber);
+
+	/* sort the expanded ranges */
+	neranges = sort_expanded_ranges(cmpFn, colloid, eranges, neranges);
+
+	/*
+	 * The two lists were built independently, so some of the entries may be
+	 * overlapping.  Walk through them and merge them.
+	 */
+	neranges = merge_overlapping_ranges(cmpFn, colloid, eranges, neranges);
+
+	/* check that the combined ranges are correct (no overlaps, ordering) */
+	AssertCheckExpandedRanges(bdesc, colloid, attno, attr, eranges, neranges);
+
+	/*
+	 * If needed, reduce some of the ranges.
+	 *
+	 * XXX This may be fairly expensive, so maybe we should do it only when
+	 * it's actually needed (when we have too many ranges).
+	 */
+
+	/* build array of gap distances and sort them in ascending order */
+	distanceFn = minmax_multi_get_procinfo(bdesc, attno, PROCNUM_DISTANCE);
+	distances = build_distances(distanceFn, colloid, eranges, neranges);
+
+	/*
+	 * See how many values would be needed to store the current ranges, and if
+	 * needed combine as many of them to get below the threshold.  The
+	 * collapsed ranges will be stored as a single value.
+	 *
+	 * XXX This does not apply the load factor, as we don't expect to add more
+	 * values to the range, so we prefer to keep as many ranges as possible.
+	 *
+	 * XXX Can the maxvalues be different in the two ranges?  Perhaps we should
+	 * use maximum of those?
+	 */
+	neranges = reduce_expanded_ranges(eranges, neranges, distances,
+									  ranges_a->maxvalues,
+									  cmpFn, colloid);
+
+	/* update the first range summary */
+	store_expanded_ranges(ranges_a, eranges, neranges);
+
+	/* leave the private context and throw away everything allocated in it */
+	MemoryContextSwitchTo(callerctx);
+	MemoryContextDelete(workctx);
+
+	/* cleanup and update the serialized value */
+	pfree(serialized_a);
+	col_a->bv_values[0] = PointerGetDatum(range_serialize(ranges_a));
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * Cache and return minmax multi opclass support procedure
+ *
+ * Return the procedure corresponding to the given function support number
+ * or null if it does not exist.
+ */
+static FmgrInfo *
+minmax_multi_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum)
+{
+	uint16		basenum = procnum - PROCNUM_BASE;
+	MinmaxMultiOpaque *opaque;
+	FmgrInfo   *cached;
+
+	/*
+	 * The lookups are cached in the opaque struct, to avoid repetitive
+	 * syscache lookups.
+	 */
+	opaque = (MinmaxMultiOpaque *) bdesc->bd_info[attno - 1]->oi_opaque;
+
+	/* an earlier search came up empty, don't bother searching again */
+	if (opaque->extra_proc_missing[basenum])
+		return NULL;
+
+	/* already looked up?  then just hand out the cached entry */
+	cached = &opaque->extra_procinfos[basenum];
+	if (cached->fn_oid != InvalidOid)
+		return cached;
+
+	/* the opclass does not define this procedure; remember that */
+	if (!RegProcedureIsValid(index_getprocid(bdesc->bd_index, attno,
+											 procnum)))
+	{
+		opaque->extra_proc_missing[basenum] = true;
+		return NULL;
+	}
+
+	/* first use: copy the procedure info into the cache */
+	fmgr_info_copy(cached,
+				   index_getprocinfo(bdesc->bd_index, attno, procnum),
+				   bdesc->bd_context);
+
+	return cached;
+}
+
+/*
+ * Cache and return the procedure for the given strategy.
+ *
+ * Note: the other BRIN opclass files carry their own variant of this lookup
+ * (NOTE(review): presumably minmax_get_strategy_procinfo and
+ * inclusion_get_strategy_procinfo -- confirm); see notes there.  If changes
+ * are made here, see those functions too.
+ */
+static FmgrInfo *
+minmax_multi_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno, Oid subtype,
+								   uint16 strategynum)
+{
+	MinmaxMultiOpaque *opaque;
+	FmgrInfo   *slot;
+	Form_pg_attribute attr;
+	HeapTuple	tuple;
+	Oid			opfamily;
+	Oid			oprid;
+	bool		isNull;
+
+	Assert(strategynum >= 1 &&
+		   strategynum <= BTMaxStrategyNumber);
+
+	opaque = (MinmaxMultiOpaque *) bdesc->bd_info[attno - 1]->oi_opaque;
+
+	/*
+	 * The opaque struct caches the procedures for one subtype at a time, to
+	 * avoid repetitive syscache lookups.  A different subtype than last time
+	 * invalidates all the cached entries.
+	 */
+	if (opaque->cached_subtype != subtype)
+	{
+		int			idx;
+
+		for (idx = 0; idx < BTMaxStrategyNumber; idx++)
+			opaque->strategy_procinfos[idx].fn_oid = InvalidOid;
+		opaque->cached_subtype = subtype;
+	}
+
+	/* strategy numbers are 1-based, the cache array is 0-based */
+	slot = &opaque->strategy_procinfos[strategynum - 1];
+	if (slot->fn_oid != InvalidOid)
+		return slot;
+
+	/* not cached yet: find the operator for this strategy in the opfamily */
+	opfamily = bdesc->bd_index->rd_opfamily[attno - 1];
+	attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1);
+	tuple = SearchSysCache4(AMOPSTRATEGY, ObjectIdGetDatum(opfamily),
+							ObjectIdGetDatum(attr->atttypid),
+							ObjectIdGetDatum(subtype),
+							Int16GetDatum(strategynum));
+	if (!HeapTupleIsValid(tuple))
+		elog(ERROR, "missing operator %d(%u,%u) in opfamily %u",
+			 strategynum, attr->atttypid, subtype, opfamily);
+
+	oprid = DatumGetObjectId(SysCacheGetAttr(AMOPSTRATEGY, tuple,
+											 Anum_pg_amop_amopopr, &isNull));
+	ReleaseSysCache(tuple);
+	Assert(!isNull && RegProcedureIsValid(oprid));
+
+	/* remember the operator's underlying function in the cache slot */
+	fmgr_info_cxt(get_opcode(oprid), slot, bdesc->bd_context);
+
+	return slot;
+}
+
+/*
+ * Register the opclass-specific reloptions: a single integer option,
+ * "values_per_range", accepted in the range 8..256 and stored in
+ * MinMaxMultiOptions.valuesPerRange.
+ *
+ * NOTE(review): the option description passed below ("desc") looks like a
+ * placeholder -- confirm whether a real description is wanted.
+ */
+Datum
+brin_minmax_multi_options(PG_FUNCTION_ARGS)
+{
+	local_relopts *relopts;
+
+	relopts = (local_relopts *) PG_GETARG_POINTER(0);
+	init_local_reloptions(relopts, sizeof(MinMaxMultiOptions));
+
+	add_local_int_reloption(relopts, "values_per_range", "desc",
+							MINMAX_MULTI_DEFAULT_VALUES_PER_PAGE, 8, 256,
+							offsetof(MinMaxMultiOptions, valuesPerRange));
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * brin_minmax_multi_summary_in
+ *		- input routine for type brin_minmax_multi_summary.
+ *
+ * brin_minmax_multi_summary is only used internally to represent summaries
+ * in BRIN minmax-multi indexes, so it has no operations of its own, and we
+ * disallow input too.
+ */
+Datum
+brin_minmax_multi_summary_in(PG_FUNCTION_ARGS)
+{
+	/*
+	 * brin_minmax_multi_summary stores the data in binary form and parsing
+	 * text input is not needed, so disallow this.
+	 */
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("cannot accept a value of type %s", "brin_minmax_multi_summary")));
+
+	PG_RETURN_VOID();			/* keep compiler quiet */
+}
+
+
+/*
+ * brin_minmax_multi_summary_out
+ *		- output routine for type brin_minmax_multi_summary.
+ *
+ * BRIN minmax-multi summaries are serialized into a bytea value, but we
+ * want to output something nicer that humans can understand.
+ *
+ * The result has the form
+ *	{nranges: N nvalues: N maxvalues: N ranges: <array> values: <array>}
+ * where the "ranges" and "values" parts are only present when the summary
+ * contains at least one range / value.  Each range is printed as
+ * "min ... max", using the output function of the summarized data type.
+ */
+Datum
+brin_minmax_multi_summary_out(PG_FUNCTION_ARGS)
+{
+	int			i;
+	int			idx;
+	SerializedRanges *ranges;
+	Ranges	   *ranges_deserialized;
+	StringInfoData str;
+	bool		isvarlena;
+	Oid			outfunc;
+	FmgrInfo	fmgrinfo;
+	ArrayBuildState *astate_values = NULL;
+
+	initStringInfo(&str);
+	appendStringInfoChar(&str, '{');
+
+	/*
+	 * Detoast to get value with full 4B header (can't be stored in a toast
+	 * table, but can use 1B header).  The argument is fetched as a plain
+	 * Datum, because that is what PG_DETOAST_DATUM operates on.
+	 */
+	ranges = (SerializedRanges *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
+
+	/* lookup output func for the type */
+	getTypeOutputInfo(ranges->typid, &outfunc, &isvarlena);
+	fmgr_info(outfunc, &fmgrinfo);
+
+	/* deserialize the range info into easy-to-process pieces */
+	ranges_deserialized = range_deserialize(ranges->maxvalues, ranges);
+
+	appendStringInfo(&str, "nranges: %u  nvalues: %u  maxvalues: %u",
+					 ranges_deserialized->nranges,
+					 ranges_deserialized->nvalues,
+					 ranges_deserialized->maxvalues);
+
+	/* serialize ranges: values[] starts with nranges (min, max) pairs */
+	idx = 0;
+	for (i = 0; i < ranges_deserialized->nranges; i++)
+	{
+		char	   *a,
+				   *b;
+		text	   *c;
+		StringInfoData buf;		/* scratch buffer for one "min ... max" item */
+
+		initStringInfo(&buf);
+
+		a = OutputFunctionCall(&fmgrinfo, ranges_deserialized->values[idx++]);
+		b = OutputFunctionCall(&fmgrinfo, ranges_deserialized->values[idx++]);
+
+		appendStringInfo(&buf, "%s ... %s", a, b);
+
+		c = cstring_to_text(buf.data);
+
+		astate_values = accumArrayResult(astate_values,
+										 PointerGetDatum(c),
+										 false,
+										 TEXTOID,
+										 CurrentMemoryContext);
+	}
+
+	if (ranges_deserialized->nranges > 0)
+	{
+		Oid			typoutput;
+		bool		typIsVarlena;
+		Datum		val;
+		char	   *extval;
+
+		getTypeOutputInfo(ANYARRAYOID, &typoutput, &typIsVarlena);
+
+		val = PointerGetDatum(makeArrayResult(astate_values, CurrentMemoryContext));
+
+		extval = OidOutputFunctionCall(typoutput, val);
+
+		appendStringInfo(&str, " ranges: %s", extval);
+	}
+
+	/* serialize individual values, which follow the range boundaries */
+	astate_values = NULL;
+
+	for (i = 0; i < ranges_deserialized->nvalues; i++)
+	{
+		char	   *a;
+		text	   *b;
+
+		/* same output-function call as for the range boundaries above */
+		a = OutputFunctionCall(&fmgrinfo, ranges_deserialized->values[idx++]);
+
+		b = cstring_to_text(a);
+
+		astate_values = accumArrayResult(astate_values,
+										 PointerGetDatum(b),
+										 false,
+										 TEXTOID,
+										 CurrentMemoryContext);
+	}
+
+	if (ranges_deserialized->nvalues > 0)
+	{
+		Oid			typoutput;
+		bool		typIsVarlena;
+		Datum		val;
+		char	   *extval;
+
+		getTypeOutputInfo(ANYARRAYOID, &typoutput, &typIsVarlena);
+
+		val = PointerGetDatum(makeArrayResult(astate_values, CurrentMemoryContext));
+
+		extval = OidOutputFunctionCall(typoutput, val);
+
+		appendStringInfo(&str, " values: %s", extval);
+	}
+
+
+	appendStringInfoChar(&str, '}');
+
+	PG_RETURN_CSTRING(str.data);
+}
+
+/*
+ * brin_minmax_multi_summary_recv
+ *		- binary input routine for type brin_minmax_multi_summary.
+ *
+ * Always raises an error, same as the text input routine.
+ */
+Datum
+brin_minmax_multi_summary_recv(PG_FUNCTION_ARGS)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("cannot accept a value of type %s", "brin_minmax_multi_summary")));
+
+	PG_RETURN_VOID();			/* keep compiler quiet */
+}
+
+/*
+ * brin_minmax_multi_summary_send
+ *		- binary output routine for type brin_minmax_multi_summary.
+ *
+ * BRIN minmax-multi summaries are serialized in a bytea value (although
+ * the type is named differently), so let's just send that.
+ */
+Datum
+brin_minmax_multi_summary_send(PG_FUNCTION_ARGS)
+{
+	return byteasend(fcinfo);
+}
diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c
new file mode 100644
index 0000000..992b33a
--- /dev/null
+++ b/src/backend/access/brin/brin_pageops.c
@@ -0,0 +1,920 @@
+/*
+ * brin_pageops.c
+ *		Page-handling routines for BRIN indexes
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/access/brin/brin_pageops.c
+ */
+#include "postgres.h"
+
+#include "access/brin_page.h"
+#include "access/brin_pageops.h"
+#include "access/brin_revmap.h"
+#include "access/brin_xlog.h"
+#include "access/xloginsert.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/freespace.h"
+#include "storage/lmgr.h"
+#include "storage/smgr.h"
+#include "utils/rel.h"
+
+/*
+ * Maximum size of an entry in a BRIN_PAGETYPE_REGULAR page.
We can tolerate
+ * a single item per page, unlike other index AMs.
+ */
+#define BrinMaxItemSize \
+	MAXALIGN_DOWN(BLCKSZ - \
+				  (MAXALIGN(SizeOfPageHeaderData + \
+							sizeof(ItemIdData)) + \
+				   MAXALIGN(sizeof(BrinSpecialSpace))))
+
+static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
+								   bool *extended);
+static Size br_page_get_freespace(Page page);
+static void brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer);
+static void brin_release_unused_newbuf(Relation idxrel, Buffer newbuf,
+									   BlockNumber newblk, bool extended);
+
+
+/*
+ * Dispose of the new buffer brin_doupdate obtained but ended up not using.
+ *
+ * Does nothing if newbuf is not a valid buffer.  Otherwise the buffer is
+ * unlocked and released.  If it came from extending the relation (extended
+ * is true), the page is initialized first and the FSM is vacuumed for that
+ * one block afterwards, so that the new page is neither left uninitialized
+ * nor forgotten about.
+ */
+static void
+brin_release_unused_newbuf(Relation idxrel, Buffer newbuf,
+						   BlockNumber newblk, bool extended)
+{
+	if (!BufferIsValid(newbuf))
+		return;
+
+	if (extended)
+		brin_initialize_empty_new_buffer(idxrel, newbuf);
+	UnlockReleaseBuffer(newbuf);
+	if (extended)
+		FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
+}
+
+/*
+ * Update tuple origtup (size origsz), located in offset oldoff of buffer
+ * oldbuf, to newtup (size newsz) as summary tuple for the page range starting
+ * at heapBlk.  oldbuf must not be locked on entry, and is not locked at exit.
+ *
+ * If samepage is true, no other page is considered: the new tuple either
+ * replaces the old one in place, or, if the page turns out not to have enough
+ * room, false is returned.  If samepage is false, a buffer with enough free
+ * space is obtained up front; the tuple is still replaced in place when the
+ * old page has room after all, and is moved to the other page otherwise.
+ *
+ * If the update is successful, return true; the revmap is updated to point to
+ * the new tuple.  If the update is not done for whatever reason, return false.
+ * Caller may retry the update if this happens.
+ */
+bool
+brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
+			  BrinRevmap *revmap, BlockNumber heapBlk,
+			  Buffer oldbuf, OffsetNumber oldoff,
+			  const BrinTuple *origtup, Size origsz,
+			  const BrinTuple *newtup, Size newsz,
+			  bool samepage)
+{
+	Page		oldpage;
+	ItemId		oldlp;
+	BrinTuple  *oldtup;
+	Size		oldsz;
+	Buffer		newbuf;
+	BlockNumber newblk = InvalidBlockNumber;
+	bool		extended;
+
+	Assert(newsz == MAXALIGN(newsz));
+
+	/* If the item is oversized, don't bother. */
+	if (newsz > BrinMaxItemSize)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+						newsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
+		return false;			/* keep compiler quiet */
+	}
+
+	/* make sure the revmap is long enough to contain the entry we need */
+	brinRevmapExtend(revmap, heapBlk);
+
+	if (!samepage)
+	{
+		/* need a page on which to put the item */
+		newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
+		if (!BufferIsValid(newbuf))
+		{
+			Assert(!extended);
+			return false;
+		}
+
+		/*
+		 * Note: it's possible (though unlikely) that the returned newbuf is
+		 * the same as oldbuf, if brin_getinsertbuffer determined that the old
+		 * buffer does in fact have enough space.
+		 */
+		if (newbuf == oldbuf)
+		{
+			Assert(!extended);
+			newbuf = InvalidBuffer;
+		}
+		else
+			newblk = BufferGetBlockNumber(newbuf);
+	}
+	else
+	{
+		LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+		newbuf = InvalidBuffer;
+		extended = false;
+	}
+	oldpage = BufferGetPage(oldbuf);
+	oldlp = PageGetItemId(oldpage, oldoff);
+
+	/*
+	 * Check that the old tuple wasn't updated concurrently: it might have
+	 * moved someplace else entirely, and for that matter the whole page
+	 * might've become a revmap page.  Note that in the first two cases
+	 * checked here, the "oldlp" we just calculated is garbage; but
+	 * PageGetItemId() is simple enough that it was safe to do that
+	 * calculation anyway.
+	 */
+	if (!BRIN_IS_REGULAR_PAGE(oldpage) ||
+		oldoff > PageGetMaxOffsetNumber(oldpage) ||
+		!ItemIdIsNormal(oldlp))
+	{
+		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+
+		/*
+		 * If this happens, and the new buffer was obtained by extending the
+		 * relation, then we need to ensure we don't leave it uninitialized or
+		 * forget about it.
+		 */
+		brin_release_unused_newbuf(idxrel, newbuf, newblk, extended);
+		return false;
+	}
+
+	oldsz = ItemIdGetLength(oldlp);
+	oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp);
+
+	/*
+	 * ... or it might have been updated in place to different contents.
+	 */
+	if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
+	{
+		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+
+		/* As above, initialize and record new page if we got one */
+		brin_release_unused_newbuf(idxrel, newbuf, newblk, extended);
+		return false;
+	}
+
+	/*
+	 * Great, the old tuple is intact.  We can proceed with the update.
+	 *
+	 * If there's enough room in the old page for the new tuple, replace it.
+	 *
+	 * Note that there might now be enough space on the page even though the
+	 * caller told us there isn't, if a concurrent update moved another tuple
+	 * elsewhere or replaced a tuple with a smaller one.
+	 */
+	if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) &&
+		brin_can_do_samepage_update(oldbuf, origsz, newsz))
+	{
+		START_CRIT_SECTION();
+		if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) unconstify(BrinTuple *, newtup), newsz))
+			elog(ERROR, "failed to replace BRIN tuple");
+		MarkBufferDirty(oldbuf);
+
+		/* XLOG stuff */
+		if (RelationNeedsWAL(idxrel))
+		{
+			xl_brin_samepage_update xlrec;
+			XLogRecPtr	recptr;
+			uint8		info = XLOG_BRIN_SAMEPAGE_UPDATE;
+
+			xlrec.offnum = oldoff;
+
+			XLogBeginInsert();
+			XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate);
+
+			XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD);
+			XLogRegisterBufData(0, (char *) unconstify(BrinTuple *, newtup), newsz);
+
+			recptr = XLogInsert(RM_BRIN_ID, info);
+
+			PageSetLSN(oldpage, recptr);
+		}
+
+		END_CRIT_SECTION();
+
+		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+
+		/* As above, initialize and record new page if we got one */
+		brin_release_unused_newbuf(idxrel, newbuf, newblk, extended);
+
+		return true;
+	}
+	else if (newbuf == InvalidBuffer)
+	{
+		/*
+		 * Not enough space, but caller said that there was.  Tell them to
+		 * start over.
+		 */
+		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+		return false;
+	}
+	else
+	{
+		/*
+		 * Not enough free space on the oldpage.  Put the new tuple on the new
+		 * page, and update the revmap.
+		 */
+		Page		newpage = BufferGetPage(newbuf);
+		Buffer		revmapbuf;
+		ItemPointerData newtid;
+		OffsetNumber newoff;
+		Size		freespace = 0;
+
+		revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
+
+		START_CRIT_SECTION();
+
+		/*
+		 * We need to initialize the page if it's newly obtained.  Note we
+		 * will WAL-log the initialization as part of the update, so we don't
+		 * need to do that here.
+		 */
+		if (extended)
+			brin_page_init(newpage, BRIN_PAGETYPE_REGULAR);
+
+		PageIndexTupleDeleteNoCompact(oldpage, oldoff);
+		newoff = PageAddItem(newpage, (Item) unconstify(BrinTuple *, newtup), newsz,
+							 InvalidOffsetNumber, false, false);
+		if (newoff == InvalidOffsetNumber)
+			elog(ERROR, "failed to add BRIN tuple to new page");
+		MarkBufferDirty(oldbuf);
+		MarkBufferDirty(newbuf);
+
+		/* needed to update FSM below */
+		if (extended)
+			freespace = br_page_get_freespace(newpage);
+
+		/* point the revmap entry for this page range at the new location */
+		ItemPointerSet(&newtid, newblk, newoff);
+		brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
+		MarkBufferDirty(revmapbuf);
+
+		/* XLOG stuff */
+		if (RelationNeedsWAL(idxrel))
+		{
+			xl_brin_update xlrec;
+			XLogRecPtr	recptr;
+			uint8		info;
+
+			info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0);
+
+			xlrec.insert.offnum = newoff;
+			xlrec.insert.heapBlk = heapBlk;
+			xlrec.insert.pagesPerRange = pagesPerRange;
+			xlrec.oldOffnum = oldoff;
+
+			XLogBeginInsert();
+
+			/* new page */
+			XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate);
+
+			XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
+			XLogRegisterBufData(0, (char *) unconstify(BrinTuple *, newtup), newsz);
+
+			/* revmap page */
+			XLogRegisterBuffer(1, revmapbuf, 0);
+
+			/* old page */
+			XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD);
+
+			recptr = XLogInsert(RM_BRIN_ID, info);
+
+			PageSetLSN(oldpage, recptr);
+			PageSetLSN(newpage, recptr);
+			PageSetLSN(BufferGetPage(revmapbuf), recptr);
+		}
+
+		END_CRIT_SECTION();
+
+		LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
+		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+		UnlockReleaseBuffer(newbuf);
+
+		/* a freshly created page must be made known to the FSM */
+		if (extended)
+		{
+			RecordPageWithFreeSpace(idxrel, newblk, freespace);
+			FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
+		}
+
+		return true;
+	}
+}
+
+/*
+ * Return whether brin_doupdate can do a samepage update.
+ *
+ * That is the case when the new tuple is not larger than the original one,
+ * or when the exact free space on the buffer's page covers the size
+ * difference.
+ */
+bool
+brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
+{
+	return
+		((newsz <= origsz) ||
+		 PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz));
+}
+
+/*
+ * Insert an index tuple into the index relation.  The revmap is updated to
+ * mark the range containing the given page as pointing to the inserted entry.
+ * A WAL record is written.
+ *
+ * The buffer, if valid, is first checked for free space to insert the new
+ * entry; if there isn't enough, a new buffer is obtained and pinned.  No
+ * buffer lock must be held on entry, no buffer lock is held on exit.
+ *
+ * On return, *buffer is the buffer the tuple was inserted into; it is
+ * unlocked here but not released.
+ *
+ * Return value is the offset number where the tuple was inserted.
+ */
+OffsetNumber
+brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
+			  BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
+			  BrinTuple *tup, Size itemsz)
+{
+	Page		page;
+	BlockNumber blk;
+	OffsetNumber off;
+	Size		freespace = 0;
+	Buffer		revmapbuf;
+	ItemPointerData tid;
+	bool		extended;
+
+	Assert(itemsz == MAXALIGN(itemsz));
+
+	/* If the item is oversized, don't even bother. */
+	if (itemsz > BrinMaxItemSize)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+						itemsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
+		return InvalidOffsetNumber; /* keep compiler quiet */
+	}
+
+	/* Make sure the revmap is long enough to contain the entry we need */
+	brinRevmapExtend(revmap, heapBlk);
+
+	/*
+	 * Acquire lock on buffer supplied by caller, if any.  If it doesn't have
+	 * enough space, unpin it to obtain a new one below.
+	 */
+	if (BufferIsValid(*buffer))
+	{
+		/*
+		 * It's possible that another backend (or ourselves!) extended the
+		 * revmap over the page we held a pin on, so we cannot assume that
+		 * it's still a regular page.
+		 */
+		LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+		if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
+		{
+			UnlockReleaseBuffer(*buffer);
+			*buffer = InvalidBuffer;
+		}
+	}
+
+	/*
+	 * If we still don't have a usable buffer, have brin_getinsertbuffer
+	 * obtain one for us.  Keep asking until it hands back a valid buffer.
+	 */
+	if (!BufferIsValid(*buffer))
+	{
+		do
+			*buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);
+		while (!BufferIsValid(*buffer));
+	}
+	else
+		extended = false;
+
+	/* Now obtain lock on revmap buffer */
+	revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
+
+	page = BufferGetPage(*buffer);
+	blk = BufferGetBlockNumber(*buffer);
+
+	/* Execute the actual insertion */
+	START_CRIT_SECTION();
+	/* a page obtained by extending the relation is initialized here */
+	if (extended)
+		brin_page_init(page, BRIN_PAGETYPE_REGULAR);
+	off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
+					  false, false);
+	if (off == InvalidOffsetNumber)
+		elog(ERROR, "failed to add BRIN tuple to new page");
+	MarkBufferDirty(*buffer);
+
+	/* needed to update FSM below */
+	if (extended)
+		freespace = br_page_get_freespace(page);
+
+	/* make the revmap entry for this page range point at the new tuple */
+	ItemPointerSet(&tid, blk, off);
+	brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
+	MarkBufferDirty(revmapbuf);
+
+	/* XLOG stuff */
+	if (RelationNeedsWAL(idxrel))
+	{
+		xl_brin_insert xlrec;
+		XLogRecPtr	recptr;
+		uint8		info;
+
+		/* the page initialization, if any, is logged with the insertion */
+		info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0);
+		xlrec.heapBlk = heapBlk;
+		xlrec.pagesPerRange = pagesPerRange;
+		xlrec.offnum = off;
+
+		XLogBeginInsert();
+		XLogRegisterData((char *) &xlrec, SizeOfBrinInsert);
+
+		/* block 0: the page receiving the tuple, with the tuple as data */
+		XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
+		XLogRegisterBufData(0, (char *) tup, itemsz);
+
+		/* block 1: the revmap page */
+		XLogRegisterBuffer(1, revmapbuf, 0);
+
+		recptr = XLogInsert(RM_BRIN_ID, info);
+
+		PageSetLSN(page, recptr);
+		PageSetLSN(BufferGetPage(revmapbuf), recptr);
+	}
+
+	END_CRIT_SECTION();
+
+	/* Tuple is firmly on buffer; we can release our locks */
+	LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
+	LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
+
+	BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
+			   blk, off, heapBlk));
+
+	/* a freshly created page is recorded in the FSM */
+	if (extended)
+	{
+		RecordPageWithFreeSpace(idxrel, blk, freespace);
+		FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
+	}
+
+	return off;
+}
+
+/*
+ * Initialize a page with the given type.
+ *
+ * The page is set up with a special space of sizeof(BrinSpecialSpace) bytes,
+ * in which the page type is stored.
+ *
+ * Caller is responsible for marking it dirty, as appropriate.
+ */
+void
+brin_page_init(Page page, uint16 type)
+{
+	PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace));
+
+	BrinPageType(page) = type;
+}
+
+/*
+ * Initialize a new BRIN index's metapage.
+ *
+ * The metadata stored in the page contents are the BRIN magic number, the
+ * given version and pagesPerRange, and the block number of the last revmap
+ * page (initially 0, see below).
+ */
+void
+brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
+{
+	BrinMetaPageData *metadata;
+
+	brin_page_init(page, BRIN_PAGETYPE_META);
+
+	metadata = (BrinMetaPageData *) PageGetContents(page);
+
+	metadata->brinMagic = BRIN_META_MAGIC;
+	metadata->brinVersion = version;
+	metadata->pagesPerRange = pagesPerRange;
+
+	/*
+	 * Note we cheat here a little.  0 is not a valid revmap block number
+	 * (because it's the metapage buffer), but doing this enables the first
+	 * revmap page to be created when the index is.
+	 */
+	metadata->lastRevmapPage = 0;
+
+	/*
+	 * Set pd_lower just past the end of the metadata.  This is essential,
+	 * because without doing so, metadata will be lost if xlog.c compresses
+	 * the page.
+	 */
+	((PageHeader) page)->pd_lower =
+		((char *) metadata + sizeof(BrinMetaPageData)) - (char *) page;
+}
+
+/*
+ * Initiate page evacuation protocol.
+ *
+ * The page must be locked in exclusive mode by the caller.
+ *
+ * If the page is not yet initialized or empty, return false without doing
+ * anything; it can be used for revmap without any further changes.  If it
+ * contains tuples, mark it for evacuation and return true.
+ */
+bool
+brin_start_evacuating_page(Relation idxRel, Buffer buf)
+{
+	Page		page = BufferGetPage(buf);
+	OffsetNumber lastoff;
+	OffsetNumber curoff;
+
+	/* a page that was never initialized has nothing to evacuate */
+	if (PageIsNew(page))
+		return false;
+
+	/* look for the first line pointer that is in use */
+	lastoff = PageGetMaxOffsetNumber(page);
+	for (curoff = FirstOffsetNumber; curoff <= lastoff; curoff++)
+	{
+		ItemId		itemid = PageGetItemId(page, curoff);
+
+		if (!ItemIdIsUsed(itemid))
+			continue;
+
+		/*
+		 * Prevent other backends from adding more stuff to this page:
+		 * BRIN_EVACUATE_PAGE informs br_page_get_freespace that this page
+		 * can no longer be used to add new tuples.  Note that this flag is
+		 * not WAL-logged, except accidentally.
+		 */
+		BrinPageFlags(page) |= BRIN_EVACUATE_PAGE;
+		MarkBufferDirtyHint(buf, true);
+
+		return true;
+	}
+
+	/* no used line pointer found: the page holds no tuples */
+	return false;
+}
+
+/*
+ * Move all tuples out of a page.
+ *
+ * The caller must hold lock on the page. The lock and pin are released.
+ *
+ * The page must already carry the BRIN_EVACUATE_PAGE flag.  Each tuple is
+ * moved by way of brin_doupdate(), "updating" it to an identical copy of
+ * itself with samepage = false.
+ */
+void
+brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
+				   BrinRevmap *revmap, Buffer buf)
+{
+	OffsetNumber off;
+	OffsetNumber maxoff;
+	Page		page;
+	BrinTuple  *btup = NULL;
+	Size		btupsz = 0;
+
+	page = BufferGetPage(buf);
+
+	Assert(BrinPageFlags(page) & BRIN_EVACUATE_PAGE);
+
+	maxoff = PageGetMaxOffsetNumber(page);
+	for (off = FirstOffsetNumber; off <= maxoff; off++)
+	{
+		BrinTuple  *tup;
+		Size		sz;
+		ItemId		lp;
+
+		CHECK_FOR_INTERRUPTS();
+
+		lp = PageGetItemId(page, off);
+		if (ItemIdIsUsed(lp))
+		{
+			/* work on a copy, as the buffer lock is given up below */
+			sz = ItemIdGetLength(lp);
+			tup = (BrinTuple *) PageGetItem(page, lp);
+			tup = brin_copy_tuple(tup, sz, btup, &btupsz);
+
+			/* brin_doupdate requires the old buffer to be unlocked on entry */
+			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+			/*
+			 * On failure, step back so that the loop's increment brings us
+			 * to this same offset again.
+			 */
+			if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno,
+							   buf, off, tup, sz, tup, sz, false))
+				off--;			/* retry */
+
+			LockBuffer(buf, BUFFER_LOCK_SHARE);
+
+			/* It's possible that someone extended the revmap over this page */
+			if (!BRIN_IS_REGULAR_PAGE(page))
+				break;
+		}
+	}
+
+	UnlockReleaseBuffer(buf);
+}
+
+/*
+ * Given a BRIN index page, initialize it if necessary, and record its
+ * current free space in the FSM.
+ *
+ * The main use for this is when, during vacuuming, an uninitialized page is
+ * found, which could be the result of relation extension followed by a crash
+ * before the page can be used.
+ *
+ * Here, we don't bother to update upper FSM pages, instead expecting that our
+ * caller (brin_vacuum_scan) will fix them at the end of the scan.  Elsewhere
+ * in this file, it's generally a good idea to propagate additions of free
+ * space into the upper FSM pages immediately.
+ */
+void
+brin_page_cleanup(Relation idxrel, Buffer buf)
+{
+	Page		page = BufferGetPage(buf);
+
+	/*
+	 * If a page was left uninitialized, initialize it now; also record it in
+	 * FSM.
+	 *
+	 * Somebody else might be extending the relation concurrently.  To avoid
+	 * re-initializing the page before they can grab the buffer lock, we
+	 * acquire the extension lock momentarily.  Since they hold the extension
+	 * lock from before getting the page and after it's been initialized, we're
+	 * sure to see their initialization.
+	 */
+	if (PageIsNew(page))
+	{
+		LockRelationForExtension(idxrel, ShareLock);
+		UnlockRelationForExtension(idxrel, ShareLock);
+
+		/* re-check under the buffer lock; the page may be initialized by now */
+		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+		if (PageIsNew(page))
+		{
+			brin_initialize_empty_new_buffer(idxrel, buf);
+			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+			return;
+		}
+		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+	}
+
+	/* Nothing to be done for non-regular index pages */
+	if (BRIN_IS_META_PAGE(BufferGetPage(buf)) ||
+		BRIN_IS_REVMAP_PAGE(BufferGetPage(buf)))
+		return;
+
+	/* Measure free space and record it */
+	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
+							br_page_get_freespace(page));
+}
+
+/*
+ * Return a pinned and exclusively locked buffer which can be used to insert an
+ * index item of size itemsz (caller must ensure not to request sizes
+ * impossible to fulfill).  If oldbuf is a valid buffer, it is also locked (in
+ * an order determined to avoid deadlocks).
+ *
+ * If we find that the old page is no longer a regular index page (because
+ * of a revmap extension), the old buffer is unlocked and we return
+ * InvalidBuffer.
+ *
+ * If there's no existing page with enough free space to accommodate the new
+ * item, the relation is extended.  If this happens, *extended is set to true,
+ * and it is the caller's responsibility to initialize the page (and WAL-log
+ * that fact) prior to use.  The caller should also update the FSM with the
+ * page's remaining free space after the insertion.
+ *
+ * Note that the caller is not expected to update FSM unless *extended is set
+ * true.  This policy means that we'll update FSM when a page is created, and
+ * when it's found to have too little space for a desired tuple insertion,
+ * but not every single time we add a tuple to the page.
+ *
+ * Note that in some corner cases it is possible for this routine to extend
+ * the relation and then not return the new page.  It is this routine's
+ * responsibility to WAL-log the page initialization and to record the page in
+ * FSM if that happens, since the caller certainly can't do it.
+ *
+ * Arguments: irel is the index relation; oldbuf is either InvalidBuffer or a
+ * buffer that must end up locked together with the returned one; itemsz is
+ * the size of the item to be inserted (asserted to be at most
+ * BrinMaxItemSize); *extended is an output flag, reset on every attempt.
+ */
+static Buffer
+brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
+					 bool *extended)
+{
+	BlockNumber oldblk;
+	BlockNumber newblk;
+	Page		page;
+	Size		freespace;
+
+	/* callers must have checked */
+	Assert(itemsz <= BrinMaxItemSize);
+
+	/* Remember the old page's block number; it drives the locking order */
+	if (BufferIsValid(oldbuf))
+		oldblk = BufferGetBlockNumber(oldbuf);
+	else
+		oldblk = InvalidBlockNumber;
+
+	/* Choose initial target page, re-using existing target if known */
+	newblk = RelationGetTargetBlock(irel);
+	if (newblk == InvalidBlockNumber)
+		newblk = GetPageWithFreeSpace(irel, itemsz);
+
+	/*
+	 * Loop until we find a page with sufficient free space.  By the time we
+	 * return to caller out of this loop, both buffers are valid and locked;
+	 * if we have to restart here, neither page is locked and newblk isn't
+	 * pinned (if it's even valid).
+	 */
+	for (;;)
+	{
+		Buffer		buf;
+		bool		extensionLockHeld = false;
+
+		CHECK_FOR_INTERRUPTS();
+
+		*extended = false;
+
+		if (newblk == InvalidBlockNumber)
+		{
+			/*
+			 * There's not enough free space in any existing index page,
+			 * according to the FSM: extend the relation to obtain a shiny new
+			 * page.
+			 */
+			if (!RELATION_IS_LOCAL(irel))
+			{
+				LockRelationForExtension(irel, ExclusiveLock);
+				extensionLockHeld = true;
+			}
+			buf = ReadBuffer(irel, P_NEW);
+			newblk = BufferGetBlockNumber(buf);
+			*extended = true;
+
+			BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u",
+					   BufferGetBlockNumber(buf)));
+		}
+		else if (newblk == oldblk)
+		{
+			/*
+			 * There's an odd corner-case here where the FSM is out-of-date,
+			 * and gave us the old page.  Reuse the caller's buffer rather
+			 * than reading the block a second time.
+			 */
+			buf = oldbuf;
+		}
+		else
+		{
+			buf = ReadBuffer(irel, newblk);
+		}
+
+		/*
+		 * We lock the old buffer first, if it's earlier than the new one; but
+		 * then we need to check that it hasn't been turned into a revmap page
+		 * concurrently.  If we detect that that happened, give up and tell
+		 * caller to start over.
+		 */
+		if (BufferIsValid(oldbuf) && oldblk < newblk)
+		{
+			LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+			if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
+			{
+				LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+
+				/*
+				 * It is possible that the new page was obtained from
+				 * extending the relation.  In that case, we must be sure to
+				 * record it in the FSM before leaving, because otherwise the
+				 * space would be lost forever.  However, we cannot let an
+				 * uninitialized page get in the FSM, so we need to initialize
+				 * it first.
+				 */
+				if (*extended)
+					brin_initialize_empty_new_buffer(irel, buf);
+
+				if (extensionLockHeld)
+					UnlockRelationForExtension(irel, ExclusiveLock);
+
+				/* buf differs from oldbuf here, since oldblk < newblk */
+				ReleaseBuffer(buf);
+
+				if (*extended)
+				{
+					FreeSpaceMapVacuumRange(irel, newblk, newblk + 1);
+					/* shouldn't matter, but don't confuse caller */
+					*extended = false;
+				}
+
+				return InvalidBuffer;
+			}
+		}
+
+		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+		if (extensionLockHeld)
+			UnlockRelationForExtension(irel, ExclusiveLock);
+
+		page = BufferGetPage(buf);
+
+		/*
+		 * We have a new buffer to insert into.  Check that the new page has
+		 * enough free space, and return it if it does; otherwise start over.
+		 * (br_page_get_freespace also checks that the FSM didn't hand us a
+		 * page that has since been repurposed for the revmap.)
+		 *
+		 * A page just obtained by extension is not measured; it is credited
+		 * with BrinMaxItemSize instead, since initializing it is left to the
+		 * caller.
+		 */
+		freespace = *extended ?
+			BrinMaxItemSize : br_page_get_freespace(page);
+		if (freespace >= itemsz)
+		{
+			RelationSetTargetBlock(irel, newblk);
+
+			/*
+			 * Lock the old buffer if not locked already.  Note that in this
+			 * case we know for sure it's a regular page: it's later than the
+			 * new page we just got, which is not a revmap page, and revmap
+			 * pages are always consecutive.
+			 */
+			if (BufferIsValid(oldbuf) && oldblk > newblk)
+			{
+				LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+				Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
+			}
+
+			return buf;
+		}
+
+		/* This page is no good. */
+
+		/*
+		 * If an entirely new page does not contain enough free space for the
+		 * new item, then surely that item is oversized.  Complain loudly; but
+		 * first make sure we initialize the page and record it as free, for
+		 * next time.
+		 */
+		if (*extended)
+		{
+			brin_initialize_empty_new_buffer(irel, buf);
+			/* since this should not happen, skip FreeSpaceMapVacuum */
+
+			ereport(ERROR,
+					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+					 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+							itemsz, freespace, RelationGetRelationName(irel))));
+			return InvalidBuffer;	/* keep compiler quiet */
+		}
+
+		/*
+		 * Drop the candidate page.  When it is the caller's own oldbuf
+		 * (newblk == oldblk) the pin is kept and only the lock is released,
+		 * which the second test below takes care of.
+		 */
+		if (newblk != oldblk)
+			UnlockReleaseBuffer(buf);
+		if (BufferIsValid(oldbuf) && oldblk <= newblk)
+			LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+
+		/*
+		 * Update the FSM with the new, presumably smaller, freespace value
+		 * for this page, then search for a new target page.
+		 */
+		newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
+	}
+}
+
+/*
+ * Initialize a page as an empty regular BRIN page, WAL-log this, and record
+ * the page in FSM.
+ *
+ * There are several corner situations in which we extend the relation to
+ * obtain a new page and later find that we cannot use it immediately.  When
+ * that happens, we don't want to let the page go unrecorded in FSM, because
+ * there is no mechanism to get the space back and the index would bloat.
+ * Also, because we would not WAL-log the action that would initialize the
+ * page, the page would go uninitialized in a standby (or after recovery).
+ * + * While we record the page in FSM here, caller is responsible for doing FSM + * upper-page update if that seems appropriate. + */ +static void +brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer) +{ + Page page; + + BRIN_elog((DEBUG2, + "brin_initialize_empty_new_buffer: initializing blank page %u", + BufferGetBlockNumber(buffer))); + + START_CRIT_SECTION(); + page = BufferGetPage(buffer); + brin_page_init(page, BRIN_PAGETYPE_REGULAR); + MarkBufferDirty(buffer); + log_newpage_buffer(buffer, true); + END_CRIT_SECTION(); + + /* + * We update the FSM for this page, but this is not WAL-logged. This is + * acceptable because VACUUM will scan the index and update the FSM with + * pages whose FSM records were forgotten in a crash. + */ + RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer), + br_page_get_freespace(page)); +} + + +/* + * Return the amount of free space on a regular BRIN index page. + * + * If the page is not a regular page, or has been marked with the + * BRIN_EVACUATE_PAGE flag, returns 0. + */ +static Size +br_page_get_freespace(Page page) +{ + if (!BRIN_IS_REGULAR_PAGE(page) || + (BrinPageFlags(page) & BRIN_EVACUATE_PAGE) != 0) + return 0; + else + return PageGetFreeSpace(page); +} diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c new file mode 100644 index 0000000..c574c8a --- /dev/null +++ b/src/backend/access/brin/brin_revmap.c @@ -0,0 +1,664 @@ +/* + * brin_revmap.c + * Range map for BRIN indexes + * + * The range map (revmap) is a translation structure for BRIN indexes: for each + * page range there is one summary tuple, and its location is tracked by the + * revmap. Whenever a new tuple is inserted into a table that violates the + * previously recorded summary values, a new tuple is inserted into the index + * and the revmap is updated to point to it. + * + * The revmap is stored in the first pages of the index, immediately following + * the metapage. 
When the revmap needs to be expanded, all tuples on the + * regular BRIN page at that block (if any) are moved out of the way. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin_revmap.c + */ +#include "postgres.h" + +#include "access/brin_page.h" +#include "access/brin_pageops.h" +#include "access/brin_revmap.h" +#include "access/brin_tuple.h" +#include "access/brin_xlog.h" +#include "access/rmgr.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "utils/rel.h" + + +/* + * In revmap pages, each item stores an ItemPointerData. These defines let one + * find the logical revmap page number and index number of the revmap item for + * the given heap block number. + */ +#define HEAPBLK_TO_REVMAP_BLK(pagesPerRange, heapBlk) \ + ((heapBlk / pagesPerRange) / REVMAP_PAGE_MAXITEMS) +#define HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk) \ + ((heapBlk / pagesPerRange) % REVMAP_PAGE_MAXITEMS) + + +struct BrinRevmap +{ + Relation rm_irel; + BlockNumber rm_pagesPerRange; + BlockNumber rm_lastRevmapPage; /* cached from the metapage */ + Buffer rm_metaBuf; + Buffer rm_currBuf; +}; + +/* typedef appears in brin_revmap.h */ + + +static BlockNumber revmap_get_blkno(BrinRevmap *revmap, + BlockNumber heapBlk); +static Buffer revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk); +static BlockNumber revmap_extend_and_get_blkno(BrinRevmap *revmap, + BlockNumber heapBlk); +static void revmap_physical_extend(BrinRevmap *revmap); + +/* + * Initialize an access object for a range map. This must be freed by + * brinRevmapTerminate when caller is done with it. 
+ */ +BrinRevmap * +brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange, + Snapshot snapshot) +{ + BrinRevmap *revmap; + Buffer meta; + BrinMetaPageData *metadata; + Page page; + + meta = ReadBuffer(idxrel, BRIN_METAPAGE_BLKNO); + LockBuffer(meta, BUFFER_LOCK_SHARE); + page = BufferGetPage(meta); + TestForOldSnapshot(snapshot, idxrel, page); + metadata = (BrinMetaPageData *) PageGetContents(page); + + revmap = palloc(sizeof(BrinRevmap)); + revmap->rm_irel = idxrel; + revmap->rm_pagesPerRange = metadata->pagesPerRange; + revmap->rm_lastRevmapPage = metadata->lastRevmapPage; + revmap->rm_metaBuf = meta; + revmap->rm_currBuf = InvalidBuffer; + + *pagesPerRange = metadata->pagesPerRange; + + LockBuffer(meta, BUFFER_LOCK_UNLOCK); + + return revmap; +} + +/* + * Release resources associated with a revmap access object. + */ +void +brinRevmapTerminate(BrinRevmap *revmap) +{ + ReleaseBuffer(revmap->rm_metaBuf); + if (revmap->rm_currBuf != InvalidBuffer) + ReleaseBuffer(revmap->rm_currBuf); + pfree(revmap); +} + +/* + * Extend the revmap to cover the given heap block number. + */ +void +brinRevmapExtend(BrinRevmap *revmap, BlockNumber heapBlk) +{ + BlockNumber mapBlk PG_USED_FOR_ASSERTS_ONLY; + + mapBlk = revmap_extend_and_get_blkno(revmap, heapBlk); + + /* Ensure the buffer we got is in the expected range */ + Assert(mapBlk != InvalidBlockNumber && + mapBlk != BRIN_METAPAGE_BLKNO && + mapBlk <= revmap->rm_lastRevmapPage); +} + +/* + * Prepare to insert an entry into the revmap; the revmap buffer in which the + * entry is to reside is locked and returned. Most callers should call + * brinRevmapExtend beforehand, as this routine does not extend the revmap if + * it's not long enough. + * + * The returned buffer is also recorded in the revmap struct; finishing that + * releases the buffer, therefore the caller needn't do it explicitly. 
+ */
+Buffer
+brinLockRevmapPageForUpdate(BrinRevmap *revmap, BlockNumber heapBlk)
+{
+	Buffer		locked = revmap_get_buffer(revmap, heapBlk);
+
+	LockBuffer(locked, BUFFER_LOCK_EXCLUSIVE);
+
+	return locked;
+}
+
+/*
+ * Store the given TID in the revmap element that corresponds to heap block
+ * number heapBlk, in a BRIN index of pagesPerRange pages per range.  buf is
+ * the revmap buffer holding that element, locked appropriately by caller.
+ * An invalid TID marks the element invalid.
+ *
+ * Once the operation is complete, the caller must update the LSN on that
+ * buffer.
+ *
+ * This is used both in regular operation and during WAL replay.
+ */
+void
+brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange,
+						BlockNumber heapBlk, ItemPointerData tid)
+{
+	/* The correct page should already be pinned and locked */
+	Page		rmpage = BufferGetPage(buf);
+	RevmapContents *rmdata = (RevmapContents *) PageGetContents(rmpage);
+	ItemPointerData *slot;
+
+	/* Locate the array element belonging to heapBlk's page range */
+	slot = (ItemPointerData *) rmdata->rm_tids;
+	slot += HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk);
+
+	if (!ItemPointerIsValid(&tid))
+	{
+		ItemPointerSetInvalid(slot);
+		return;
+	}
+
+	ItemPointerSet(slot,
+				   ItemPointerGetBlockNumber(&tid),
+				   ItemPointerGetOffsetNumber(&tid));
+}
+
+/*
+ * Fetch the BrinTuple for a given heap block.
+ *
+ * The buffer containing the tuple is locked, and returned in *buf.  The
+ * returned tuple points to the shared buffer and must not be freed; if caller
+ * wants to use it after releasing the buffer lock, it must create its own
+ * palloc'ed copy.  As an optimization, the caller can pass a pinned buffer
+ * *buf on entry, which will avoid a pin-unpin cycle when the next tuple is on
+ * the same page as a previous one.
+ *
+ * If no tuple is found for the given heap range, returns NULL.  In that case,
+ * *buf might still be updated (and pin must be released by caller), but it's
+ * not locked.
+ *
+ * The output tuple offset within the buffer is returned in *off, and its size
+ * is returned in *size.
+ *
+ * size may be passed as NULL, in which case the tuple size is not reported.
+ * mode is the lock mode handed to LockBuffer for the tuple's buffer.  When
+ * the revmap has no page covering heapBlk, *off is set to
+ * InvalidOffsetNumber and NULL is returned.
+ */
+BrinTuple *
+brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
+						 Buffer *buf, OffsetNumber *off, Size *size, int mode,
+						 Snapshot snapshot)
+{
+	Relation	idxRel = revmap->rm_irel;
+	BlockNumber mapBlk;
+	RevmapContents *contents;
+	ItemPointerData *iptr;
+	BlockNumber blk;
+	Page		page;
+	ItemId		lp;
+	BrinTuple  *tup;
+	ItemPointerData previptr;
+
+	/* normalize the heap block number to be the first page in the range */
+	heapBlk = (heapBlk / revmap->rm_pagesPerRange) * revmap->rm_pagesPerRange;
+
+	/*
+	 * Compute the revmap page number we need.  If Invalid is returned (i.e.,
+	 * the revmap page hasn't been created yet), the requested page range is
+	 * not summarized.
+	 */
+	mapBlk = revmap_get_blkno(revmap, heapBlk);
+	if (mapBlk == InvalidBlockNumber)
+	{
+		*off = InvalidOffsetNumber;
+		return NULL;
+	}
+
+	/* No TID seen yet; see the sanity check inside the loop */
+	ItemPointerSetInvalid(&previptr);
+	for (;;)
+	{
+		CHECK_FOR_INTERRUPTS();
+
+		/* Switch the cached revmap buffer to the page we need, if necessary */
+		if (revmap->rm_currBuf == InvalidBuffer ||
+			BufferGetBlockNumber(revmap->rm_currBuf) != mapBlk)
+		{
+			if (revmap->rm_currBuf != InvalidBuffer)
+				ReleaseBuffer(revmap->rm_currBuf);
+
+			Assert(mapBlk != InvalidBlockNumber);
+			revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk);
+		}
+
+		LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_SHARE);
+
+		/* Find the revmap element for this page range */
+		contents = (RevmapContents *)
+			PageGetContents(BufferGetPage(revmap->rm_currBuf));
+		iptr = contents->rm_tids;
+		iptr += HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk);
+
+		/* An invalid TID means there is no tuple for this range */
+		if (!ItemPointerIsValid(iptr))
+		{
+			LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK);
+			return NULL;
+		}
+
+		/*
+		 * Check the TID we got in a previous iteration, if any, and save the
+		 * current TID we got from the revmap; if we loop, we can sanity-check
+		 * that the next one we get is different.  Otherwise we might be stuck
+		 * looping forever if the revmap is somehow badly broken.
+		 */
+		if (ItemPointerIsValid(&previptr) && ItemPointerEquals(&previptr, iptr))
+			ereport(ERROR,
+					(errcode(ERRCODE_INDEX_CORRUPTED),
+					 errmsg_internal("corrupted BRIN index: inconsistent range map")));
+		previptr = *iptr;
+
+		/* Copy out the TID's parts before the revmap lock is released */
+		blk = ItemPointerGetBlockNumber(iptr);
+		*off = ItemPointerGetOffsetNumber(iptr);
+
+		LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK);
+
+		/* Ok, got a pointer to where the BrinTuple should be. Fetch it. */
+		if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != blk)
+		{
+			if (BufferIsValid(*buf))
+				ReleaseBuffer(*buf);
+			*buf = ReadBuffer(idxRel, blk);
+		}
+		LockBuffer(*buf, mode);
+		page = BufferGetPage(*buf);
+		TestForOldSnapshot(snapshot, idxRel, page);
+
+		/*
+		 * Only a regular page can hold the tuple.  If we landed on anything
+		 * else (e.g. a revmap page), fall through and start over.
+		 */
+		if (BRIN_IS_REGULAR_PAGE(page))
+		{
+			/*
+			 * If the offset number is greater than what's in the page, it's
+			 * possible that the range was desummarized concurrently.  Just
+			 * return NULL to handle that case.
+			 */
+			if (*off > PageGetMaxOffsetNumber(page))
+			{
+				LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
+				return NULL;
+			}
+
+			lp = PageGetItemId(page, *off);
+			if (ItemIdIsUsed(lp))
+			{
+				tup = (BrinTuple *) PageGetItem(page, lp);
+
+				/* Accept the tuple only if it describes our page range */
+				if (tup->bt_blkno == heapBlk)
+				{
+					if (size)
+						*size = ItemIdGetLength(lp);
+					/* found it!  *buf is left locked for the caller */
+					return tup;
+				}
+			}
+		}
+
+		/*
+		 * No luck.  Assume that the revmap was updated concurrently.
+		 */
+		LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
+	}
+	/* not reached, but keep compiler quiet */
+	return NULL;
+}
+
+/*
+ * Delete an index tuple, marking a page range as unsummarized.
+ *
+ * Index must be locked in ShareUpdateExclusiveLock mode.
+ *
+ * Return false if caller should retry.
+ */ +bool +brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk) +{ + BrinRevmap *revmap; + BlockNumber pagesPerRange; + RevmapContents *contents; + ItemPointerData *iptr; + ItemPointerData invalidIptr; + BlockNumber revmapBlk; + Buffer revmapBuf; + Buffer regBuf; + Page revmapPg; + Page regPg; + OffsetNumber revmapOffset; + OffsetNumber regOffset; + ItemId lp; + + revmap = brinRevmapInitialize(idxrel, &pagesPerRange, NULL); + + revmapBlk = revmap_get_blkno(revmap, heapBlk); + if (!BlockNumberIsValid(revmapBlk)) + { + /* revmap page doesn't exist: range not summarized, we're done */ + brinRevmapTerminate(revmap); + return true; + } + + /* Lock the revmap page, obtain the index tuple pointer from it */ + revmapBuf = brinLockRevmapPageForUpdate(revmap, heapBlk); + revmapPg = BufferGetPage(revmapBuf); + revmapOffset = HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk); + + contents = (RevmapContents *) PageGetContents(revmapPg); + iptr = contents->rm_tids; + iptr += revmapOffset; + + if (!ItemPointerIsValid(iptr)) + { + /* no index tuple: range not summarized, we're done */ + LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK); + brinRevmapTerminate(revmap); + return true; + } + + regBuf = ReadBuffer(idxrel, ItemPointerGetBlockNumber(iptr)); + LockBuffer(regBuf, BUFFER_LOCK_EXCLUSIVE); + regPg = BufferGetPage(regBuf); + + /* + * We're only removing data, not reading it, so there's no need to + * TestForOldSnapshot here. 
+ */ + + /* if this is no longer a regular page, tell caller to start over */ + if (!BRIN_IS_REGULAR_PAGE(regPg)) + { + LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK); + LockBuffer(regBuf, BUFFER_LOCK_UNLOCK); + brinRevmapTerminate(revmap); + return false; + } + + regOffset = ItemPointerGetOffsetNumber(iptr); + if (regOffset > PageGetMaxOffsetNumber(regPg)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("corrupted BRIN index: inconsistent range map"))); + + lp = PageGetItemId(regPg, regOffset); + if (!ItemIdIsUsed(lp)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("corrupted BRIN index: inconsistent range map"))); + + /* + * Placeholder tuples only appear during unfinished summarization, and we + * hold ShareUpdateExclusiveLock, so this function cannot run concurrently + * with that. So any placeholder tuples that exist are leftovers from a + * crashed or aborted summarization; remove them silently. + */ + + START_CRIT_SECTION(); + + ItemPointerSetInvalid(&invalidIptr); + brinSetHeapBlockItemptr(revmapBuf, revmap->rm_pagesPerRange, heapBlk, + invalidIptr); + PageIndexTupleDeleteNoCompact(regPg, regOffset); + /* XXX record free space in FSM? 
*/ + + MarkBufferDirty(regBuf); + MarkBufferDirty(revmapBuf); + + if (RelationNeedsWAL(idxrel)) + { + xl_brin_desummarize xlrec; + XLogRecPtr recptr; + + xlrec.pagesPerRange = revmap->rm_pagesPerRange; + xlrec.heapBlk = heapBlk; + xlrec.regOffset = regOffset; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBrinDesummarize); + XLogRegisterBuffer(0, revmapBuf, 0); + XLogRegisterBuffer(1, regBuf, REGBUF_STANDARD); + recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_DESUMMARIZE); + PageSetLSN(revmapPg, recptr); + PageSetLSN(regPg, recptr); + } + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(regBuf); + LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK); + brinRevmapTerminate(revmap); + + return true; +} + +/* + * Given a heap block number, find the corresponding physical revmap block + * number and return it. If the revmap page hasn't been allocated yet, return + * InvalidBlockNumber. + */ +static BlockNumber +revmap_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk) +{ + BlockNumber targetblk; + + /* obtain revmap block number, skip 1 for metapage block */ + targetblk = HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1; + + /* Normal case: the revmap page is already allocated */ + if (targetblk <= revmap->rm_lastRevmapPage) + return targetblk; + + return InvalidBlockNumber; +} + +/* + * Obtain and return a buffer containing the revmap page for the given heap + * page. The revmap must have been previously extended to cover that page. + * The returned buffer is also recorded in the revmap struct; finishing that + * releases the buffer, therefore the caller needn't do it explicitly. + */ +static Buffer +revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk) +{ + BlockNumber mapBlk; + + /* Translate the heap block number to physical index location. 
*/ + mapBlk = revmap_get_blkno(revmap, heapBlk); + + if (mapBlk == InvalidBlockNumber) + elog(ERROR, "revmap does not cover heap block %u", heapBlk); + + /* Ensure the buffer we got is in the expected range */ + Assert(mapBlk != BRIN_METAPAGE_BLKNO && + mapBlk <= revmap->rm_lastRevmapPage); + + /* + * Obtain the buffer from which we need to read. If we already have the + * correct buffer in our access struct, use that; otherwise, release that, + * (if valid) and read the one we need. + */ + if (revmap->rm_currBuf == InvalidBuffer || + mapBlk != BufferGetBlockNumber(revmap->rm_currBuf)) + { + if (revmap->rm_currBuf != InvalidBuffer) + ReleaseBuffer(revmap->rm_currBuf); + + revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk); + } + + return revmap->rm_currBuf; +} + +/* + * Given a heap block number, find the corresponding physical revmap block + * number and return it. If the revmap page hasn't been allocated yet, extend + * the revmap until it is. + */ +static BlockNumber +revmap_extend_and_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk) +{ + BlockNumber targetblk; + + /* obtain revmap block number, skip 1 for metapage block */ + targetblk = HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1; + + /* Extend the revmap, if necessary */ + while (targetblk > revmap->rm_lastRevmapPage) + { + CHECK_FOR_INTERRUPTS(); + revmap_physical_extend(revmap); + } + + return targetblk; +} + +/* + * Try to extend the revmap by one page. This might not happen for a number of + * reasons; caller is expected to retry until the expected outcome is obtained. + */ +static void +revmap_physical_extend(BrinRevmap *revmap) +{ + Buffer buf; + Page page; + Page metapage; + BrinMetaPageData *metadata; + BlockNumber mapBlk; + BlockNumber nblocks; + Relation irel = revmap->rm_irel; + bool needLock = !RELATION_IS_LOCAL(irel); + + /* + * Lock the metapage. 
This locks out concurrent extensions of the revmap, + * but note that we still need to grab the relation extension lock because + * another backend can extend the index with regular BRIN pages. + */ + LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_EXCLUSIVE); + metapage = BufferGetPage(revmap->rm_metaBuf); + metadata = (BrinMetaPageData *) PageGetContents(metapage); + + /* + * Check that our cached lastRevmapPage value was up-to-date; if it + * wasn't, update the cached copy and have caller start over. + */ + if (metadata->lastRevmapPage != revmap->rm_lastRevmapPage) + { + revmap->rm_lastRevmapPage = metadata->lastRevmapPage; + LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); + return; + } + mapBlk = metadata->lastRevmapPage + 1; + + nblocks = RelationGetNumberOfBlocks(irel); + if (mapBlk < nblocks) + { + buf = ReadBuffer(irel, mapBlk); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buf); + } + else + { + if (needLock) + LockRelationForExtension(irel, ExclusiveLock); + + buf = ReadBuffer(irel, P_NEW); + if (BufferGetBlockNumber(buf) != mapBlk) + { + /* + * Very rare corner case: somebody extended the relation + * concurrently after we read its length. If this happens, give + * up and have caller start over. We will have to evacuate that + * page from under whoever is using it. 
+ */ + if (needLock) + UnlockRelationForExtension(irel, ExclusiveLock); + LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buf); + return; + } + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buf); + + if (needLock) + UnlockRelationForExtension(irel, ExclusiveLock); + } + + /* Check that it's a regular block (or an empty page) */ + if (!PageIsNew(page) && !BRIN_IS_REGULAR_PAGE(page)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("unexpected page type 0x%04X in BRIN index \"%s\" block %u", + BrinPageType(page), + RelationGetRelationName(irel), + BufferGetBlockNumber(buf)))); + + /* If the page is in use, evacuate it and restart */ + if (brin_start_evacuating_page(irel, buf)) + { + LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); + brin_evacuate_page(irel, revmap->rm_pagesPerRange, revmap, buf); + + /* have caller start over */ + return; + } + + /* + * Ok, we have now locked the metapage and the target block. Re-initialize + * the target block as a revmap page, and update the metapage. + */ + START_CRIT_SECTION(); + + /* the rm_tids array is initialized to all invalid by PageInit */ + brin_page_init(page, BRIN_PAGETYPE_REVMAP); + MarkBufferDirty(buf); + + metadata->lastRevmapPage = mapBlk; + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c compresses + * the page. (We must do this here because pre-v11 versions of PG did not + * set the metapage's pd_lower correctly, so a pg_upgraded index might + * contain the wrong value.) 
+ */ + ((PageHeader) metapage)->pd_lower = + ((char *) metadata + sizeof(BrinMetaPageData)) - (char *) metapage; + + MarkBufferDirty(revmap->rm_metaBuf); + + if (RelationNeedsWAL(revmap->rm_irel)) + { + xl_brin_revmap_extend xlrec; + XLogRecPtr recptr; + + xlrec.targetBlk = mapBlk; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfBrinRevmapExtend); + XLogRegisterBuffer(0, revmap->rm_metaBuf, REGBUF_STANDARD); + + XLogRegisterBuffer(1, buf, REGBUF_WILL_INIT); + + recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_REVMAP_EXTEND); + PageSetLSN(metapage, recptr); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); + + UnlockReleaseBuffer(buf); +} diff --git a/src/backend/access/brin/brin_tuple.c b/src/backend/access/brin/brin_tuple.c new file mode 100644 index 0000000..09e563b --- /dev/null +++ b/src/backend/access/brin/brin_tuple.c @@ -0,0 +1,708 @@ +/* + * brin_tuple.c + * Method implementations for tuples in BRIN indexes. + * + * Intended usage is that code outside this file only deals with + * BrinMemTuples, and convert to and from the on-disk representation through + * functions in this file. + * + * NOTES + * + * A BRIN tuple is similar to a heap tuple, with a few key differences. The + * first interesting difference is that the tuple header is much simpler, only + * containing its total length and a small area for flags. Also, the stored + * data does not match the relation tuple descriptor exactly: for each + * attribute in the descriptor, the index tuple carries an arbitrary number + * of values, depending on the opclass. + * + * Also, for each column of the index relation there are two null bits: one + * (hasnulls) stores whether any tuple within the page range has that column + * set to null; the other one (allnulls) stores whether the column values are + * all null. 
If allnulls is true, then the tuple data area does not contain + * values for that column at all; whereas it does if the hasnulls is set. + * Note the size of the null bitmask may not be the same as that of the + * datum array. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin_tuple.c + */ +#include "postgres.h" + +#include "access/brin_tuple.h" +#include "access/detoast.h" +#include "access/heaptoast.h" +#include "access/htup_details.h" +#include "access/toast_internals.h" +#include "access/tupdesc.h" +#include "access/tupmacs.h" +#include "utils/datum.h" +#include "utils/memutils.h" + + +/* + * This enables de-toasting of index entries. Needed until VACUUM is + * smart enough to rebuild indexes from scratch. + */ +#define TOAST_INDEX_HACK + + +static inline void brin_deconstruct_tuple(BrinDesc *brdesc, + char *tp, bits8 *nullbits, bool nulls, + Datum *values, bool *allnulls, bool *hasnulls); + + +/* + * Return a tuple descriptor used for on-disk storage of BRIN tuples. + */ +static TupleDesc +brtuple_disk_tupdesc(BrinDesc *brdesc) +{ + /* We cache these in the BrinDesc */ + if (brdesc->bd_disktdesc == NULL) + { + int i; + int j; + AttrNumber attno = 1; + TupleDesc tupdesc; + MemoryContext oldcxt; + + /* make sure it's in the bdesc's context */ + oldcxt = MemoryContextSwitchTo(brdesc->bd_context); + + tupdesc = CreateTemplateTupleDesc(brdesc->bd_totalstored); + + for (i = 0; i < brdesc->bd_tupdesc->natts; i++) + { + for (j = 0; j < brdesc->bd_info[i]->oi_nstored; j++) + TupleDescInitEntry(tupdesc, attno++, NULL, + brdesc->bd_info[i]->oi_typcache[j]->type_id, + -1, 0); + } + + MemoryContextSwitchTo(oldcxt); + + brdesc->bd_disktdesc = tupdesc; + } + + return brdesc->bd_disktdesc; +} + +/* + * Generate a new on-disk tuple to be inserted in a BRIN index. + * + * See brin_form_placeholder_tuple if you touch this. 
+ */ +BrinTuple * +brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple, + Size *size) +{ + Datum *values; + bool *nulls; + bool anynulls = false; + BrinTuple *rettuple; + int keyno; + int idxattno; + uint16 phony_infomask = 0; + bits8 *phony_nullbitmap; + Size len, + hoff, + data_len; + int i; + +#ifdef TOAST_INDEX_HACK + Datum *untoasted_values; + int nuntoasted = 0; +#endif + + Assert(brdesc->bd_totalstored > 0); + + values = (Datum *) palloc(sizeof(Datum) * brdesc->bd_totalstored); + nulls = (bool *) palloc0(sizeof(bool) * brdesc->bd_totalstored); + phony_nullbitmap = (bits8 *) + palloc(sizeof(bits8) * BITMAPLEN(brdesc->bd_totalstored)); + +#ifdef TOAST_INDEX_HACK + untoasted_values = (Datum *) palloc(sizeof(Datum) * brdesc->bd_totalstored); +#endif + + /* + * Set up the values/nulls arrays for heap_fill_tuple + */ + idxattno = 0; + for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + int datumno; + + /* + * "allnulls" is set when there's no nonnull value in any row in the + * column; when this happens, there is no data to store. Thus set the + * nullable bits for all data elements of this column and we're done. + */ + if (tuple->bt_columns[keyno].bv_allnulls) + { + for (datumno = 0; + datumno < brdesc->bd_info[keyno]->oi_nstored; + datumno++) + nulls[idxattno++] = true; + anynulls = true; + continue; + } + + /* + * The "hasnulls" bit is set when there are some null values in the + * data. We still need to store a real value, but the presence of + * this means we need a null bitmap. + */ + if (tuple->bt_columns[keyno].bv_hasnulls) + anynulls = true; + + /* If needed, serialize the values before forming the on-disk tuple. */ + if (tuple->bt_columns[keyno].bv_serialize) + { + tuple->bt_columns[keyno].bv_serialize(brdesc, + tuple->bt_columns[keyno].bv_mem_value, + tuple->bt_columns[keyno].bv_values); + } + + /* + * Now obtain the values of each stored datum. 
Note that some values + * might be toasted, and we cannot rely on the original heap values + * sticking around forever, so we must detoast them. Also try to + * compress them. + */ + for (datumno = 0; + datumno < brdesc->bd_info[keyno]->oi_nstored; + datumno++) + { + Datum value = tuple->bt_columns[keyno].bv_values[datumno]; + +#ifdef TOAST_INDEX_HACK + + /* We must look at the stored type, not at the index descriptor. */ + TypeCacheEntry *atttype = brdesc->bd_info[keyno]->oi_typcache[datumno]; + + /* Do we need to free the value at the end? */ + bool free_value = false; + + /* For non-varlena types we don't need to do anything special */ + if (atttype->typlen != -1) + { + values[idxattno++] = value; + continue; + } + + /* + * Do nothing if value is not of varlena type. We don't need to + * care about NULL values here, thanks to bv_allnulls above. + * + * If value is stored EXTERNAL, must fetch it so we are not + * depending on outside storage. + * + * XXX Is this actually true? Could it be that the summary is NULL + * even for range with non-NULL data? E.g. degenerate bloom filter + * may be thrown away, etc. + */ + if (VARATT_IS_EXTERNAL(DatumGetPointer(value))) + { + value = PointerGetDatum(detoast_external_attr((struct varlena *) + DatumGetPointer(value))); + free_value = true; + } + + /* + * If value is above size target, and is of a compressible + * datatype, try to compress it in-line. + */ + if (!VARATT_IS_EXTENDED(DatumGetPointer(value)) && + VARSIZE(DatumGetPointer(value)) > TOAST_INDEX_TARGET && + (atttype->typstorage == TYPSTORAGE_EXTENDED || + atttype->typstorage == TYPSTORAGE_MAIN)) + { + Datum cvalue; + char compression; + Form_pg_attribute att = TupleDescAttr(brdesc->bd_tupdesc, + keyno); + + /* + * If the BRIN summary and indexed attribute use the same data + * type and it has a valid compression method, we can use the + * same compression method. Otherwise we have to use the + * default method. 
+ */ + if (att->atttypid == atttype->type_id) + compression = att->attcompression; + else + compression = InvalidCompressionMethod; + + cvalue = toast_compress_datum(value, compression); + + if (DatumGetPointer(cvalue) != NULL) + { + /* successful compression */ + if (free_value) + pfree(DatumGetPointer(value)); + + value = cvalue; + free_value = true; + } + } + + /* + * If we untoasted / compressed the value, we need to free it + * after forming the index tuple. + */ + if (free_value) + untoasted_values[nuntoasted++] = value; + +#endif + + values[idxattno++] = value; + } + } + + /* Assert we did not overrun temp arrays */ + Assert(idxattno <= brdesc->bd_totalstored); + + /* compute total space needed */ + len = SizeOfBrinTuple; + if (anynulls) + { + /* + * We need a double-length bitmap on an on-disk BRIN index tuple; the + * first half stores the "allnulls" bits, the second stores + * "hasnulls". + */ + len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2); + } + + len = hoff = MAXALIGN(len); + + data_len = heap_compute_data_size(brtuple_disk_tupdesc(brdesc), + values, nulls); + len += data_len; + + len = MAXALIGN(len); + + rettuple = palloc0(len); + rettuple->bt_blkno = blkno; + rettuple->bt_info = hoff; + + /* Assert that hoff fits in the space available */ + Assert((rettuple->bt_info & BRIN_OFFSET_MASK) == hoff); + + /* + * The infomask and null bitmap as computed by heap_fill_tuple are useless + * to us. However, that function will not accept a null infomask; and we + * need to pass a valid null bitmap so that it will correctly skip + * outputting null attributes in the data area. 
+ */ + heap_fill_tuple(brtuple_disk_tupdesc(brdesc), + values, + nulls, + (char *) rettuple + hoff, + data_len, + &phony_infomask, + phony_nullbitmap); + + /* done with these */ + pfree(values); + pfree(nulls); + pfree(phony_nullbitmap); + +#ifdef TOAST_INDEX_HACK + for (i = 0; i < nuntoasted; i++) + pfree(DatumGetPointer(untoasted_values[i])); +#endif + + /* + * Now fill in the real null bitmasks. allnulls first. + */ + if (anynulls) + { + bits8 *bitP; + int bitmask; + + rettuple->bt_info |= BRIN_NULLS_MASK; + + /* + * Note that we reverse the sense of null bits in this module: we + * store a 1 for a null attribute rather than a 0. So we must reverse + * the sense of the att_isnull test in brin_deconstruct_tuple as well. + */ + bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1; + bitmask = HIGHBIT; + for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + if (bitmask != HIGHBIT) + bitmask <<= 1; + else + { + bitP += 1; + *bitP = 0x0; + bitmask = 1; + } + + if (!tuple->bt_columns[keyno].bv_allnulls) + continue; + + *bitP |= bitmask; + } + /* hasnulls bits follow */ + for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + if (bitmask != HIGHBIT) + bitmask <<= 1; + else + { + bitP += 1; + *bitP = 0x0; + bitmask = 1; + } + + if (!tuple->bt_columns[keyno].bv_hasnulls) + continue; + + *bitP |= bitmask; + } + } + + if (tuple->bt_placeholder) + rettuple->bt_info |= BRIN_PLACEHOLDER_MASK; + + *size = len; + return rettuple; +} + +/* + * Generate a new on-disk tuple with no data values, marked as placeholder. + * + * This is a cut-down version of brin_form_tuple. 
+ */ +BrinTuple * +brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size) +{ + Size len; + Size hoff; + BrinTuple *rettuple; + int keyno; + bits8 *bitP; + int bitmask; + + /* compute total space needed: always add nulls */ + len = SizeOfBrinTuple; + len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2); + len = hoff = MAXALIGN(len); + + rettuple = palloc0(len); + rettuple->bt_blkno = blkno; + rettuple->bt_info = hoff; + rettuple->bt_info |= BRIN_NULLS_MASK | BRIN_PLACEHOLDER_MASK; + + bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1; + bitmask = HIGHBIT; + /* set allnulls true for all attributes */ + for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + if (bitmask != HIGHBIT) + bitmask <<= 1; + else + { + bitP += 1; + *bitP = 0x0; + bitmask = 1; + } + + *bitP |= bitmask; + } + /* no need to set hasnulls */ + + *size = len; + return rettuple; +} + +/* + * Free a tuple created by brin_form_tuple + */ +void +brin_free_tuple(BrinTuple *tuple) +{ + pfree(tuple); +} + +/* + * Given a brin tuple of size len, create a copy of it. If 'dest' is not + * NULL, its size is destsz, and can be used as output buffer; if the tuple + * to be copied does not fit, it is enlarged by repalloc, and the size is + * updated to match. This avoids palloc/free cycles when many brin tuples + * are being processed in loops. + */ +BrinTuple * +brin_copy_tuple(BrinTuple *tuple, Size len, BrinTuple *dest, Size *destsz) +{ + if (!destsz || *destsz == 0) + dest = palloc(len); + else if (len > *destsz) + { + dest = repalloc(dest, len); + *destsz = len; + } + + memcpy(dest, tuple, len); + + return dest; +} + +/* + * Return whether two BrinTuples are bitwise identical. + */ +bool +brin_tuples_equal(const BrinTuple *a, Size alen, const BrinTuple *b, Size blen) +{ + if (alen != blen) + return false; + if (memcmp(a, b, alen) != 0) + return false; + return true; +} + +/* + * Create a new BrinMemTuple from scratch, and initialize it to an empty + * state. 
+ * + * Note: we don't provide any means to free a deformed tuple, so make sure to + * use a temporary memory context. + */ +BrinMemTuple * +brin_new_memtuple(BrinDesc *brdesc) +{ + BrinMemTuple *dtup; + long basesize; + + basesize = MAXALIGN(sizeof(BrinMemTuple) + + sizeof(BrinValues) * brdesc->bd_tupdesc->natts); + dtup = palloc0(basesize + sizeof(Datum) * brdesc->bd_totalstored); + + dtup->bt_values = palloc(sizeof(Datum) * brdesc->bd_totalstored); + dtup->bt_allnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts); + dtup->bt_hasnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts); + + dtup->bt_context = AllocSetContextCreate(CurrentMemoryContext, + "brin dtuple", + ALLOCSET_DEFAULT_SIZES); + + brin_memtuple_initialize(dtup, brdesc); + + return dtup; +} + +/* + * Reset a BrinMemTuple to initial state. We return the same tuple, for + * notational convenience. + */ +BrinMemTuple * +brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc) +{ + int i; + char *currdatum; + + MemoryContextReset(dtuple->bt_context); + + currdatum = (char *) dtuple + + MAXALIGN(sizeof(BrinMemTuple) + + sizeof(BrinValues) * brdesc->bd_tupdesc->natts); + for (i = 0; i < brdesc->bd_tupdesc->natts; i++) + { + dtuple->bt_columns[i].bv_attno = i + 1; + dtuple->bt_columns[i].bv_allnulls = true; + dtuple->bt_columns[i].bv_hasnulls = false; + dtuple->bt_columns[i].bv_values = (Datum *) currdatum; + + dtuple->bt_columns[i].bv_mem_value = PointerGetDatum(NULL); + dtuple->bt_columns[i].bv_serialize = NULL; + dtuple->bt_columns[i].bv_context = dtuple->bt_context; + + currdatum += sizeof(Datum) * brdesc->bd_info[i]->oi_nstored; + } + + return dtuple; +} + +/* + * Convert a BrinTuple back to a BrinMemTuple. This is the reverse of + * brin_form_tuple. + * + * As an optimization, the caller can pass a previously allocated 'dMemtuple'. + * This avoids having to allocate it here, which can be useful when this + * function is called many times in a loop. 
It is caller's responsibility + * that the given BrinMemTuple matches what we need here. + * + * Note we don't need the "on disk tupdesc" here; we rely on our own routine to + * deconstruct the tuple from the on-disk format. + */ +BrinMemTuple * +brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple, BrinMemTuple *dMemtuple) +{ + BrinMemTuple *dtup; + Datum *values; + bool *allnulls; + bool *hasnulls; + char *tp; + bits8 *nullbits; + int keyno; + int valueno; + MemoryContext oldcxt; + + dtup = dMemtuple ? brin_memtuple_initialize(dMemtuple, brdesc) : + brin_new_memtuple(brdesc); + + if (BrinTupleIsPlaceholder(tuple)) + dtup->bt_placeholder = true; + dtup->bt_blkno = tuple->bt_blkno; + + values = dtup->bt_values; + allnulls = dtup->bt_allnulls; + hasnulls = dtup->bt_hasnulls; + + tp = (char *) tuple + BrinTupleDataOffset(tuple); + + if (BrinTupleHasNulls(tuple)) + nullbits = (bits8 *) ((char *) tuple + SizeOfBrinTuple); + else + nullbits = NULL; + brin_deconstruct_tuple(brdesc, + tp, nullbits, BrinTupleHasNulls(tuple), + values, allnulls, hasnulls); + + /* + * Iterate to assign each of the values to the corresponding item in the + * values array of each column. The copies occur in the tuple's context. + */ + oldcxt = MemoryContextSwitchTo(dtup->bt_context); + for (valueno = 0, keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) + { + int i; + + if (allnulls[keyno]) + { + valueno += brdesc->bd_info[keyno]->oi_nstored; + continue; + } + + /* + * We would like to skip datumCopy'ing the values datum in some cases, + * caller permitting ... 
+ */ + for (i = 0; i < brdesc->bd_info[keyno]->oi_nstored; i++) + dtup->bt_columns[keyno].bv_values[i] = + datumCopy(values[valueno++], + brdesc->bd_info[keyno]->oi_typcache[i]->typbyval, + brdesc->bd_info[keyno]->oi_typcache[i]->typlen); + + dtup->bt_columns[keyno].bv_hasnulls = hasnulls[keyno]; + dtup->bt_columns[keyno].bv_allnulls = false; + + dtup->bt_columns[keyno].bv_mem_value = PointerGetDatum(NULL); + dtup->bt_columns[keyno].bv_serialize = NULL; + dtup->bt_columns[keyno].bv_context = dtup->bt_context; + } + + MemoryContextSwitchTo(oldcxt); + + return dtup; +} + +/* + * brin_deconstruct_tuple + * Guts of attribute extraction from an on-disk BRIN tuple. + * + * Its arguments are: + * brdesc BRIN descriptor for the stored tuple + * tp pointer to the tuple data area + * nullbits pointer to the tuple nulls bitmask + * nulls "has nulls" bit in tuple infomask + * values output values, array of size brdesc->bd_totalstored + * allnulls output "allnulls", size brdesc->bd_tupdesc->natts + * hasnulls output "hasnulls", size brdesc->bd_tupdesc->natts + * + * Output arrays must have been allocated by caller. + */ +static inline void +brin_deconstruct_tuple(BrinDesc *brdesc, + char *tp, bits8 *nullbits, bool nulls, + Datum *values, bool *allnulls, bool *hasnulls) +{ + int attnum; + int stored; + TupleDesc diskdsc; + long off; + + /* + * First iterate to natts to obtain both null flags for each attribute. + * Note that we reverse the sense of the att_isnull test, because we store + * 1 for a null value (rather than a 1 for a not null value as is the + * att_isnull convention used elsewhere.) See brin_form_tuple. + */ + for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++) + { + /* + * the "all nulls" bit means that all values in the page range for + * this column are nulls. Therefore there are no values in the tuple + * data area. 
+ */ + allnulls[attnum] = nulls && !att_isnull(attnum, nullbits); + + /* + * the "has nulls" bit means that some tuples have nulls, but others + * have not-null values. Therefore we know the tuple contains data + * for this column. + * + * The hasnulls bits follow the allnulls bits in the same bitmask. + */ + hasnulls[attnum] = + nulls && !att_isnull(brdesc->bd_tupdesc->natts + attnum, nullbits); + } + + /* + * Iterate to obtain each attribute's stored values. Note that since we + * may reuse attribute entries for more than one column, we cannot cache + * offsets here. + */ + diskdsc = brtuple_disk_tupdesc(brdesc); + stored = 0; + off = 0; + for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++) + { + int datumno; + + if (allnulls[attnum]) + { + stored += brdesc->bd_info[attnum]->oi_nstored; + continue; + } + + for (datumno = 0; + datumno < brdesc->bd_info[attnum]->oi_nstored; + datumno++) + { + Form_pg_attribute thisatt = TupleDescAttr(diskdsc, stored); + + if (thisatt->attlen == -1) + { + off = att_align_pointer(off, thisatt->attalign, -1, + tp + off); + } + else + { + /* not varlena, so safe to use att_align_nominal */ + off = att_align_nominal(off, thisatt->attalign); + } + + values[stored++] = fetchatt(thisatt, tp + off); + + off = att_addlength_pointer(off, thisatt->attlen, tp + off); + } + } +} diff --git a/src/backend/access/brin/brin_validate.c b/src/backend/access/brin/brin_validate.c new file mode 100644 index 0000000..11835d8 --- /dev/null +++ b/src/backend/access/brin/brin_validate.c @@ -0,0 +1,281 @@ +/*------------------------------------------------------------------------- + * + * brin_validate.c + * Opclass validator for BRIN. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin_validate.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/amvalidate.h" +#include "access/brin_internal.h" +#include "access/htup_details.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_amproc.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_opfamily.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/regproc.h" +#include "utils/syscache.h" + +/* + * Validator for a BRIN opclass. + * + * Some of the checks done here cover the whole opfamily, and therefore are + * redundant when checking each opclass in a family. But they don't run long + * enough to be much of a problem, so we accept the duplication rather than + * complicate the amvalidate API. + */ +bool +brinvalidate(Oid opclassoid) +{ + bool result = true; + HeapTuple classtup; + Form_pg_opclass classform; + Oid opfamilyoid; + Oid opcintype; + char *opclassname; + HeapTuple familytup; + Form_pg_opfamily familyform; + char *opfamilyname; + CatCList *proclist, + *oprlist; + uint64 allfuncs = 0; + uint64 allops = 0; + List *grouplist; + OpFamilyOpFuncGroup *opclassgroup; + int i; + ListCell *lc; + + /* Fetch opclass information */ + classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid)); + if (!HeapTupleIsValid(classtup)) + elog(ERROR, "cache lookup failed for operator class %u", opclassoid); + classform = (Form_pg_opclass) GETSTRUCT(classtup); + + opfamilyoid = classform->opcfamily; + opcintype = classform->opcintype; + opclassname = NameStr(classform->opcname); + + /* Fetch opfamily information */ + familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid)); + if (!HeapTupleIsValid(familytup)) + elog(ERROR, "cache lookup failed for operator family %u", 
opfamilyoid); + familyform = (Form_pg_opfamily) GETSTRUCT(familytup); + + opfamilyname = NameStr(familyform->opfname); + + /* Fetch all operators and support functions of the opfamily */ + oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid)); + proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid)); + + /* Check individual support functions */ + for (i = 0; i < proclist->n_members; i++) + { + HeapTuple proctup = &proclist->members[i]->tuple; + Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup); + bool ok; + + /* Check procedure numbers and function signatures */ + switch (procform->amprocnum) + { + case BRIN_PROCNUM_OPCINFO: + ok = check_amproc_signature(procform->amproc, INTERNALOID, true, + 1, 1, INTERNALOID); + break; + case BRIN_PROCNUM_ADDVALUE: + ok = check_amproc_signature(procform->amproc, BOOLOID, true, + 4, 4, INTERNALOID, INTERNALOID, + INTERNALOID, INTERNALOID); + break; + case BRIN_PROCNUM_CONSISTENT: + ok = check_amproc_signature(procform->amproc, BOOLOID, true, + 3, 4, INTERNALOID, INTERNALOID, + INTERNALOID, INT4OID); + break; + case BRIN_PROCNUM_UNION: + ok = check_amproc_signature(procform->amproc, BOOLOID, true, + 3, 3, INTERNALOID, INTERNALOID, + INTERNALOID); + break; + case BRIN_PROCNUM_OPTIONS: + ok = check_amoptsproc_signature(procform->amproc); + break; + default: + /* Complain if it's not a valid optional proc number */ + if (procform->amprocnum < BRIN_FIRST_OPTIONAL_PROCNUM || + procform->amprocnum > BRIN_LAST_OPTIONAL_PROCNUM) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d", + opfamilyname, "brin", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + continue; /* omit bad proc numbers from allfuncs */ + } + /* Can't check signatures of optional procs, so assume OK */ + ok = true; + break; + } + + if (!ok) + { + ereport(INFO, + 
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d", + opfamilyname, "brin", + format_procedure(procform->amproc), + procform->amprocnum))); + result = false; + } + + /* Track all valid procedure numbers seen in opfamily */ + allfuncs |= ((uint64) 1) << procform->amprocnum; + } + + /* Check individual operators */ + for (i = 0; i < oprlist->n_members; i++) + { + HeapTuple oprtup = &oprlist->members[i]->tuple; + Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup); + + /* Check that only allowed strategy numbers exist */ + if (oprform->amopstrategy < 1 || oprform->amopstrategy > 63) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d", + opfamilyname, "brin", + format_operator(oprform->amopopr), + oprform->amopstrategy))); + result = false; + } + else + { + /* + * The set of operators supplied varies across BRIN opfamilies. + * Our plan is to identify all operator strategy numbers used in + * the opfamily and then complain about datatype combinations that + * are missing any operator(s). However, consider only numbers + * that appear in some non-cross-type case, since cross-type + * operators may have unique strategies. (This is not a great + * heuristic, in particular an erroneous number used in a + * cross-type operator will not get noticed; but the core BRIN + * opfamilies are messy enough to make it necessary.) 
+ */ + if (oprform->amoplefttype == oprform->amoprighttype) + allops |= ((uint64) 1) << oprform->amopstrategy; + } + + /* brin doesn't support ORDER BY operators */ + if (oprform->amoppurpose != AMOP_SEARCH || + OidIsValid(oprform->amopsortfamily)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s", + opfamilyname, "brin", + format_operator(oprform->amopopr)))); + result = false; + } + + /* Check operator signature --- same for all brin strategies */ + if (!check_amop_signature(oprform->amopopr, BOOLOID, + oprform->amoplefttype, + oprform->amoprighttype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature", + opfamilyname, "brin", + format_operator(oprform->amopopr)))); + result = false; + } + } + + /* Now check for inconsistent groups of operators/functions */ + grouplist = identify_opfamily_groups(oprlist, proclist); + opclassgroup = NULL; + foreach(lc, grouplist) + { + OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc); + + /* Remember the group exactly matching the test opclass */ + if (thisgroup->lefttype == opcintype && + thisgroup->righttype == opcintype) + opclassgroup = thisgroup; + + /* + * Some BRIN opfamilies expect cross-type support functions to exist, + * and some don't. We don't know exactly which are which, so if we + * find a cross-type operator for which there are no support functions + * at all, let it pass. (Don't expect that all operators exist for + * such cross-type cases, either.) + */ + if (thisgroup->functionset == 0 && + thisgroup->lefttype != thisgroup->righttype) + continue; + + /* + * Else complain if there seems to be an incomplete set of either + * operators or support functions for this datatype pair. 
+ */ + if (thisgroup->operatorset != allops) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s is missing operator(s) for types %s and %s", + opfamilyname, "brin", + format_type_be(thisgroup->lefttype), + format_type_be(thisgroup->righttype)))); + result = false; + } + if (thisgroup->functionset != allfuncs) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator family \"%s\" of access method %s is missing support function(s) for types %s and %s", + opfamilyname, "brin", + format_type_be(thisgroup->lefttype), + format_type_be(thisgroup->righttype)))); + result = false; + } + } + + /* Check that the originally-named opclass is complete */ + if (!opclassgroup || opclassgroup->operatorset != allops) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator class \"%s\" of access method %s is missing operator(s)", + opclassname, "brin"))); + result = false; + } + for (i = 1; i <= BRIN_MANDATORY_NPROCS; i++) + { + if (opclassgroup && + (opclassgroup->functionset & (((int64) 1) << i)) != 0) + continue; /* got it */ + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("operator class \"%s\" of access method %s is missing support function %d", + opclassname, "brin", i))); + result = false; + } + + ReleaseCatCacheList(proclist); + ReleaseCatCacheList(oprlist); + ReleaseSysCache(familytup); + ReleaseSysCache(classtup); + + return result; +} diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c new file mode 100644 index 0000000..3519038 --- /dev/null +++ b/src/backend/access/brin/brin_xlog.c @@ -0,0 +1,367 @@ +/* + * brin_xlog.c + * XLog replay routines for BRIN indexes + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/brin/brin_xlog.c + */ +#include 
"postgres.h" + +#include "access/brin_page.h" +#include "access/brin_pageops.h" +#include "access/brin_xlog.h" +#include "access/bufmask.h" +#include "access/xlogutils.h" + + +/* + * xlog replay routines + */ +static void +brin_xlog_createidx(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_brin_createidx *xlrec = (xl_brin_createidx *) XLogRecGetData(record); + Buffer buf; + Page page; + + /* create the index' metapage */ + buf = XLogInitBufferForRedo(record, 0); + Assert(BufferIsValid(buf)); + page = (Page) BufferGetPage(buf); + brin_metapage_init(page, xlrec->pagesPerRange, xlrec->version); + PageSetLSN(page, lsn); + MarkBufferDirty(buf); + UnlockReleaseBuffer(buf); +} + +/* + * Common part of an insert or update. Inserts the new tuple and updates the + * revmap. + */ +static void +brin_xlog_insert_update(XLogReaderState *record, + xl_brin_insert *xlrec) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buffer; + BlockNumber regpgno; + Page page; + XLogRedoAction action; + + /* + * If we inserted the first and only tuple on the page, re-initialize the + * page from scratch. 
+ */ + if (XLogRecGetInfo(record) & XLOG_BRIN_INIT_PAGE) + { + buffer = XLogInitBufferForRedo(record, 0); + page = BufferGetPage(buffer); + brin_page_init(page, BRIN_PAGETYPE_REGULAR); + action = BLK_NEEDS_REDO; + } + else + { + action = XLogReadBufferForRedo(record, 0, &buffer); + } + + /* need this page's blkno to store in revmap */ + regpgno = BufferGetBlockNumber(buffer); + + /* insert the index item into the page */ + if (action == BLK_NEEDS_REDO) + { + OffsetNumber offnum; + BrinTuple *tuple; + Size tuplen; + + tuple = (BrinTuple *) XLogRecGetBlockData(record, 0, &tuplen); + + Assert(tuple->bt_blkno == xlrec->heapBlk); + + page = (Page) BufferGetPage(buffer); + offnum = xlrec->offnum; + if (PageGetMaxOffsetNumber(page) + 1 < offnum) + elog(PANIC, "brin_xlog_insert_update: invalid max offset number"); + + offnum = PageAddItem(page, (Item) tuple, tuplen, offnum, true, false); + if (offnum == InvalidOffsetNumber) + elog(PANIC, "brin_xlog_insert_update: failed to add tuple"); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* update the revmap */ + action = XLogReadBufferForRedo(record, 1, &buffer); + if (action == BLK_NEEDS_REDO) + { + ItemPointerData tid; + + ItemPointerSet(&tid, regpgno, xlrec->offnum); + page = (Page) BufferGetPage(buffer); + + brinSetHeapBlockItemptr(buffer, xlrec->pagesPerRange, xlrec->heapBlk, + tid); + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* XXX no FSM updates here ... 
*/ +} + +/* + * replay a BRIN index insertion + */ +static void +brin_xlog_insert(XLogReaderState *record) +{ + xl_brin_insert *xlrec = (xl_brin_insert *) XLogRecGetData(record); + + brin_xlog_insert_update(record, xlrec); +} + +/* + * replay a BRIN index update + */ +static void +brin_xlog_update(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_brin_update *xlrec = (xl_brin_update *) XLogRecGetData(record); + Buffer buffer; + XLogRedoAction action; + + /* First remove the old tuple */ + action = XLogReadBufferForRedo(record, 2, &buffer); + if (action == BLK_NEEDS_REDO) + { + Page page; + OffsetNumber offnum; + + page = (Page) BufferGetPage(buffer); + + offnum = xlrec->oldOffnum; + + PageIndexTupleDeleteNoCompact(page, offnum); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + + /* Then insert the new tuple and update revmap, like in an insertion. */ + brin_xlog_insert_update(record, &xlrec->insert); + + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +/* + * Update a tuple on a single page. + */ +static void +brin_xlog_samepage_update(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_brin_samepage_update *xlrec; + Buffer buffer; + XLogRedoAction action; + + xlrec = (xl_brin_samepage_update *) XLogRecGetData(record); + action = XLogReadBufferForRedo(record, 0, &buffer); + if (action == BLK_NEEDS_REDO) + { + Size tuplen; + BrinTuple *brintuple; + Page page; + OffsetNumber offnum; + + brintuple = (BrinTuple *) XLogRecGetBlockData(record, 0, &tuplen); + + page = (Page) BufferGetPage(buffer); + + offnum = xlrec->offnum; + + if (!PageIndexTupleOverwrite(page, offnum, (Item) brintuple, tuplen)) + elog(PANIC, "brin_xlog_samepage_update: failed to replace tuple"); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* XXX no FSM updates here ... 
*/ +} + +/* + * Replay a revmap page extension + */ +static void +brin_xlog_revmap_extend(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_brin_revmap_extend *xlrec; + Buffer metabuf; + Buffer buf; + Page page; + BlockNumber targetBlk; + XLogRedoAction action; + + xlrec = (xl_brin_revmap_extend *) XLogRecGetData(record); + + XLogRecGetBlockTag(record, 1, NULL, NULL, &targetBlk); + Assert(xlrec->targetBlk == targetBlk); + + /* Update the metapage */ + action = XLogReadBufferForRedo(record, 0, &metabuf); + if (action == BLK_NEEDS_REDO) + { + Page metapg; + BrinMetaPageData *metadata; + + metapg = BufferGetPage(metabuf); + metadata = (BrinMetaPageData *) PageGetContents(metapg); + + Assert(metadata->lastRevmapPage == xlrec->targetBlk - 1); + metadata->lastRevmapPage = xlrec->targetBlk; + + PageSetLSN(metapg, lsn); + + /* + * Set pd_lower just past the end of the metadata. This is essential, + * because without doing so, metadata will be lost if xlog.c + * compresses the page. (We must do this here because pre-v11 + * versions of PG did not set the metapage's pd_lower correctly, so a + * pg_upgraded index might contain the wrong value.) + */ + ((PageHeader) metapg)->pd_lower = + ((char *) metadata + sizeof(BrinMetaPageData)) - (char *) metapg; + + MarkBufferDirty(metabuf); + } + + /* + * Re-init the target block as a revmap page. There's never a full- page + * image here. 
+ */ + + buf = XLogInitBufferForRedo(record, 1); + page = (Page) BufferGetPage(buf); + brin_page_init(page, BRIN_PAGETYPE_REVMAP); + + PageSetLSN(page, lsn); + MarkBufferDirty(buf); + + UnlockReleaseBuffer(buf); + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +static void +brin_xlog_desummarize_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_brin_desummarize *xlrec; + Buffer buffer; + XLogRedoAction action; + + xlrec = (xl_brin_desummarize *) XLogRecGetData(record); + + /* Update the revmap */ + action = XLogReadBufferForRedo(record, 0, &buffer); + if (action == BLK_NEEDS_REDO) + { + ItemPointerData iptr; + + ItemPointerSetInvalid(&iptr); + brinSetHeapBlockItemptr(buffer, xlrec->pagesPerRange, xlrec->heapBlk, iptr); + + PageSetLSN(BufferGetPage(buffer), lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + /* remove the leftover entry from the regular page */ + action = XLogReadBufferForRedo(record, 1, &buffer); + if (action == BLK_NEEDS_REDO) + { + Page regPg = BufferGetPage(buffer); + + PageIndexTupleDeleteNoCompact(regPg, xlrec->regOffset); + + PageSetLSN(regPg, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +void +brin_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info & XLOG_BRIN_OPMASK) + { + case XLOG_BRIN_CREATE_INDEX: + brin_xlog_createidx(record); + break; + case XLOG_BRIN_INSERT: + brin_xlog_insert(record); + break; + case XLOG_BRIN_UPDATE: + brin_xlog_update(record); + break; + case XLOG_BRIN_SAMEPAGE_UPDATE: + brin_xlog_samepage_update(record); + break; + case XLOG_BRIN_REVMAP_EXTEND: + brin_xlog_revmap_extend(record); + break; + case XLOG_BRIN_DESUMMARIZE: + brin_xlog_desummarize_page(record); + break; + default: + elog(PANIC, "brin_redo: unknown op code %u", info); + } +} + +/* + * Mask a BRIN page before doing consistency checks. 
+ */ +void +brin_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + PageHeader pagehdr = (PageHeader) page; + + mask_page_lsn_and_checksum(page); + + mask_page_hint_bits(page); + + /* + * Regular brin pages contain unused space which needs to be masked. + * Similarly for meta pages, but mask it only if pd_lower appears to have + * been set correctly. + */ + if (BRIN_IS_REGULAR_PAGE(page) || + (BRIN_IS_META_PAGE(page) && pagehdr->pd_lower > SizeOfPageHeaderData)) + { + mask_unused_space(page); + } + + /* + * BRIN_EVACUATE_PAGE is not WAL-logged, since it's of no use in recovery. + * Mask it. See brin_start_evacuating_page() for details. + */ + BrinPageFlags(page) &= ~BRIN_EVACUATE_PAGE; +} |