summaryrefslogtreecommitdiffstats
path: root/src/backend/access/brin
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
commit46651ce6fe013220ed397add242004d764fc0153 (patch)
tree6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/access/brin
parentInitial commit. (diff)
downloadpostgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz
postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/access/brin')
-rw-r--r--src/backend/access/brin/Makefile27
-rw-r--r--src/backend/access/brin/README189
-rw-r--r--src/backend/access/brin/brin.c1800
-rw-r--r--src/backend/access/brin/brin_bloom.c809
-rw-r--r--src/backend/access/brin/brin_inclusion.c657
-rw-r--r--src/backend/access/brin/brin_minmax.c317
-rw-r--r--src/backend/access/brin/brin_minmax_multi.c3163
-rw-r--r--src/backend/access/brin/brin_pageops.c920
-rw-r--r--src/backend/access/brin/brin_revmap.c664
-rw-r--r--src/backend/access/brin/brin_tuple.c708
-rw-r--r--src/backend/access/brin/brin_validate.c281
-rw-r--r--src/backend/access/brin/brin_xlog.c367
12 files changed, 9902 insertions, 0 deletions
diff --git a/src/backend/access/brin/Makefile b/src/backend/access/brin/Makefile
new file mode 100644
index 0000000..a386cb7
--- /dev/null
+++ b/src/backend/access/brin/Makefile
@@ -0,0 +1,27 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for access/brin
+#
+# IDENTIFICATION
+# src/backend/access/brin/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/brin
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ brin.o \
+ brin_bloom.o \
+ brin_inclusion.o \
+ brin_minmax.o \
+ brin_minmax_multi.o \
+ brin_pageops.o \
+ brin_revmap.o \
+ brin_tuple.o \
+ brin_validate.o \
+ brin_xlog.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/brin/README b/src/backend/access/brin/README
new file mode 100644
index 0000000..636d965
--- /dev/null
+++ b/src/backend/access/brin/README
@@ -0,0 +1,189 @@
+Block Range Indexes (BRIN)
+==========================
+
+BRIN indexes intend to enable very fast scanning of extremely large tables.
+
+The essential idea of a BRIN index is to keep track of summarizing values in
+consecutive groups of heap pages (page ranges); for example, the minimum and
+maximum values for datatypes with a btree opclass, or the bounding box for
+geometric types. These values can be used to avoid scanning such pages
+during a table scan, depending on query quals.
+
+The cost of this is having to update the stored summary values of each page
+range as tuples are inserted into them.
+
+
+Access Method Design
+--------------------
+
+Since item pointers are not stored inside indexes of this type, it is not
+possible to support the amgettuple interface. Instead, we only provide
+amgetbitmap support. The amgetbitmap routine returns a lossy TIDBitmap
+comprising all pages in those page ranges that match the query
+qualifications. The recheck step in the BitmapHeapScan node prunes tuples
+that do not satisfy the query qualifications.
+
+An operator class must have the following entries:
+
+- generic support procedures (pg_amproc), common to all opclasses:
+ * "opcinfo" (BRIN_PROCNUM_OPCINFO) initializes a structure for index
+ creation or scanning
+ * "addValue" (BRIN_PROCNUM_ADDVALUE) takes an index tuple and a heap item,
+ and possibly changes the index tuple so that it includes the heap item
+ values
+ * "consistent" (BRIN_PROCNUM_CONSISTENT) takes an index tuple and query
+ quals, and returns whether the index tuple values match the query quals.
+ * "union" (BRIN_PROCNUM_UNION) takes two index tuples and modifies the first
+ one so that it represents the union of the two.
+Procedure numbers up to 10 are reserved for future expansion.
+
+Additionally, each opclass needs additional support functions:
+- Minmax-style operator classes:
+ * Proc numbers 11-14 are used for the functions implementing inequality
+ operators for the type, in this order: less than, less or equal,
+ greater or equal, greater than.
+
+Opclasses using a different design will require different additional procedure
+numbers.
+
+Operator classes also need to have operator (pg_amop) entries so that the
+optimizer can choose the index to execute queries.
+- Minmax-style operator classes:
+ * The same operators as btree (<=, <, =, >=, >)
+
+Each index tuple stores some NULL bits and some opclass-specified values. The
+NULL bits are stored in a single null bitmask of length twice the number of
+columns. The generic NULL bits indicate, for each column:
+ * bt_hasnulls: Whether there's any NULL value at all in the page range
+ * bt_allnulls: Whether all values are NULLs in the page range
+
+The opclass-specified values are:
+- Minmax-style operator classes
+ * minimum value across all tuples in the range
+ * maximum value across all tuples in the range
+
+Note that the addValue and Union support procedures must be careful to
+datumCopy() the values they want to store in the in-memory BRIN tuple, and
+must pfree() the old copies when replacing older ones. Since some values
+referenced from the tuple persist and others go away, there is no
+well-defined lifetime for a memory context that would make this automatic.
+
+
+The Range Map
+-------------
+
+To find the index tuple for a particular page range, we have an internal
+structure we call the range map, or "revmap" for short. This stores one TID
+per page range, which is the address of the index tuple summarizing that
+range. Since the map entries are fixed size, it is possible to compute the
+address of the range map entry for any given heap page by simple arithmetic.
+
+When a new heap tuple is inserted in a summarized page range, we compare the
+existing index tuple with the new heap tuple. If the heap tuple is outside
+the summarization data given by the index tuple for any indexed column (or
+if the new heap tuple contains null values but the index tuple indicates
+there are no nulls), the index is updated with the new values. In many
+cases it is possible to update the index tuple in-place, but if the new
+index tuple is larger than the old one and there's not enough space in the
+page, it is necessary to create a new index tuple with the new values. The
+range map can be updated quickly to point to it; the old index tuple is
+removed.
+
+If the range map points to an invalid TID, the corresponding page range is
+considered to be not summarized. When tuples are added to unsummarized
+pages, nothing needs to happen.
+
+To scan a table following a BRIN index, we scan the range map sequentially.
+This yields index tuples in ascending page range order. Query quals are
+matched to each index tuple; if they match, each page within the page range
+is returned as part of the output TID bitmap. If there's no match, they are
+skipped. Range map entries returning invalid index TIDs, that is
+unsummarized page ranges, are also returned in the TID bitmap.
+
+The revmap is stored in the first few blocks of the index main fork,
+immediately following the metapage. Whenever the revmap needs to be
+extended by another page, existing tuples in that page are moved to some
+other page.
+
+Heap tuples can be removed from anywhere without restriction. It might be
+useful to mark the corresponding index tuple somehow, if the heap tuple is
+one of the constraining values of the summary data (i.e. either min or max
+in the case of a btree-opclass-bearing datatype), so that in the future we
+are aware of the need to re-execute summarization on that range, leading to
+a possible tightening of the summary values.
+
+Summarization
+-------------
+
+At index creation time, the whole table is scanned; for each page range the
+summarizing values of each indexed column and nulls bitmap are collected and
+stored in the index. The partially-filled page range at the end of the
+table is also summarized.
+
+As new tuples get inserted at the end of the table, they may update the
+index tuple that summarizes the partial page range at the end. Eventually
+that page range is complete and new tuples belong in a new page range that
+hasn't yet been summarized. Those insertions do not create a new index
+entry; instead, the page range remains unsummarized until later.
+
+Whenever VACUUM is run on the table, all unsummarized page ranges are
+summarized. This action can also be invoked by the user via
+brin_summarize_new_values(). Both of these procedures scan all the
+unsummarized ranges and create a summary tuple for each. Again, this includes the
+partially-filled page range at the end of the table.
+
+Vacuuming
+---------
+
+Since no heap TIDs are stored in a BRIN index, it's not necessary to scan the
+index when heap tuples are removed. It might be that some summary values can
+be tightened if heap tuples have been deleted; but this would represent an
+optimization opportunity only, not a correctness issue. It's simpler to
+represent this as the need to re-run summarization on the affected page range
+rather than "subtracting" values from the existing one. This is not
+currently implemented.
+
+Note that if there are no indexes on the table other than the BRIN index,
+usage of maintenance_work_mem by vacuum can be decreased significantly, because
+no detailed index scan needs to take place (and thus it's not necessary for
+vacuum to save TIDs to remove). It's unlikely that BRIN indexes would be the
+only indexes on a table, though, because primary keys can be btrees only, and so
+we don't implement this optimization.
+
+
+Optimizer
+---------
+
+The optimizer selects the index based on the operator class' pg_amop
+entries for the column.
+
+
+Future improvements
+-------------------
+
+* Different-size page ranges?
+ In the current design, each "index entry" in a BRIN index covers the same
+ number of pages. There's no hard reason for this; it might make sense to
+ allow the index to self-tune so that some index entries cover smaller page
+ ranges, if this allows the summary values to be more compact. This would incur
+ larger BRIN overhead for the index itself, but might allow better pruning of
+ page ranges during scan. In the limit of one index tuple per page, the index
+ itself would occupy too much space, even though we would be able to skip
+ reading the most heap pages, because the summary values are tight; in the
+ opposite limit of a single tuple that summarizes the whole table, we wouldn't
+ be able to prune anything even though the index is very small. This can
+ probably be made to work by using the range map as an index in itself.
+
+* More compact representation for TIDBitmap?
+ TIDBitmap is the structure used to represent bitmap scans. The
+ representation of lossy page ranges is not optimal for our purposes, because
+ it uses a Bitmapset to represent pages in the range; since we're going to return
+ all pages in a large range, it might be more convenient to allow for a
+ struct that uses start and end page numbers to represent the range, instead.
+
+* Better vacuuming?
+ It might be useful to enable passing more useful info to BRIN indexes during
+ vacuuming about tuples that are deleted, i.e. do not require the callback to
+ pass each tuple's TID. For instance we might need a callback that passes a
+ block number instead of a TID. That would help determine when to re-run
+ summarization on blocks that have seen lots of tuple deletions.
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
new file mode 100644
index 0000000..21a2384
--- /dev/null
+++ b/src/backend/access/brin/brin.c
@@ -0,0 +1,1800 @@
+/*
+ * brin.c
+ * Implementation of BRIN indexes for Postgres
+ *
+ * See src/backend/access/brin/README for details.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/brin/brin.c
+ *
+ * TODO
+ * * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
+ */
+#include "postgres.h"
+
+#include "access/brin.h"
+#include "access/brin_page.h"
+#include "access/brin_pageops.h"
+#include "access/brin_xlog.h"
+#include "access/relation.h"
+#include "access/reloptions.h"
+#include "access/relscan.h"
+#include "access/table.h"
+#include "access/tableam.h"
+#include "access/xloginsert.h"
+#include "catalog/index.h"
+#include "catalog/pg_am.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/autovacuum.h"
+#include "storage/bufmgr.h"
+#include "storage/freespace.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+#include "utils/datum.h"
+#include "utils/index_selfuncs.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+
+/*
+ * We use a BrinBuildState during initial construction of a BRIN index.
+ * The running state is kept in a BrinMemTuple.
+ *
+ * NOTE(review): the per-field remarks below are inferred from field names and
+ * types only; confirm them against initialize_brin_buildstate() and the build
+ * callbacks before relying on them.
+ */
+typedef struct BrinBuildState
+{
+ Relation bs_irel; /* presumably the index relation being built */
+ int bs_numtuples; /* presumably a count of index tuples formed so far */
+ Buffer bs_currentInsertBuf; /* presumably the buffer currently targeted for inserts */
+ BlockNumber bs_pagesPerRange; /* presumably heap pages per summarized range */
+ BlockNumber bs_currRangeStart; /* presumably first heap block of the range in progress */
+ BrinRevmap *bs_rmAccess; /* revmap access handle (type: BrinRevmap) */
+ BrinDesc *bs_bdesc; /* BRIN descriptor (type: BrinDesc) */
+ BrinMemTuple *bs_dtuple; /* the in-memory tuple holding the running state */
+} BrinBuildState;
+
+/*
+ * Struct used as "opaque" during index scans
+ *
+ * An instance is allocated and filled in by brinbeginscan(), stored in
+ * scan->opaque, read by bringetbitmap(), and torn down by brinendscan().
+ */
+typedef struct BrinOpaque
+{
+ BlockNumber bo_pagesPerRange; /* output of brinRevmapInitialize(); loop stride in bringetbitmap() */
+ BrinRevmap *bo_rmAccess; /* handle returned by brinRevmapInitialize(); released with brinRevmapTerminate() */
+ BrinDesc *bo_bdesc; /* built by brin_build_desc(); released with brin_free_desc() */
+} BrinOpaque;
+
+#define BRIN_ALL_BLOCKRANGES InvalidBlockNumber
+
+static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
+ BrinRevmap *revmap, BlockNumber pagesPerRange);
+static void terminate_brin_buildstate(BrinBuildState *state);
+static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
+ bool include_partial, double *numSummarized, double *numExisting);
+static void form_and_insert_tuple(BrinBuildState *state);
+static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
+ BrinTuple *b);
+static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy);
+static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc,
+ BrinMemTuple *dtup, Datum *values, bool *nulls);
+static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys);
+
+/*
+ * BRIN handler function.
+ *
+ * Builds a fresh IndexAmRoutine node, fills in every access method parameter
+ * and callback pointer for BRIN, and returns it as a pointer Datum.
+ */
+Datum
+brinhandler(PG_FUNCTION_ARGS)
+{
+    IndexAmRoutine *routine = makeNode(IndexAmRoutine);
+
+    /* strategy / support procedure numbering */
+    routine->amstrategies = 0;
+    routine->amsupport = BRIN_LAST_OPTIONAL_PROCNUM;
+    routine->amoptsprocnum = BRIN_PROCNUM_OPTIONS;
+    routine->amkeytype = InvalidOid;
+
+    /* properties that are switched on */
+    routine->amcanmulticol = true;
+    routine->amoptionalkey = true;
+    routine->amsearchnulls = true;
+    routine->amstorage = true;
+
+    /* properties that are switched off */
+    routine->amcanorder = false;
+    routine->amcanorderbyop = false;
+    routine->amcanbackward = false;
+    routine->amcanunique = false;
+    routine->amsearcharray = false;
+    routine->amclusterable = false;
+    routine->ampredlocks = false;
+    routine->amcanparallel = false;
+    routine->amcaninclude = false;
+    routine->amusemaintenanceworkmem = false;
+
+    /* parallel vacuum behaviour */
+    routine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_CLEANUP;
+
+    /* build, insert and maintenance callbacks */
+    routine->ambuild = brinbuild;
+    routine->ambuildempty = brinbuildempty;
+    routine->aminsert = brininsert;
+    routine->ambulkdelete = brinbulkdelete;
+    routine->amvacuumcleanup = brinvacuumcleanup;
+
+    /* planner, options and validation callbacks */
+    routine->amcostestimate = brincostestimate;
+    routine->amoptions = brinoptions;
+    routine->amvalidate = brinvalidate;
+
+    /* scan callbacks: only the bitmap interface is wired up */
+    routine->ambeginscan = brinbeginscan;
+    routine->amrescan = brinrescan;
+    routine->amgetbitmap = bringetbitmap;
+    routine->amendscan = brinendscan;
+
+    /* callbacks deliberately left unset */
+    routine->amcanreturn = NULL;
+    routine->amproperty = NULL;
+    routine->ambuildphasename = NULL;
+    routine->amadjustmembers = NULL;
+    routine->amgettuple = NULL;
+    routine->ammarkpos = NULL;
+    routine->amrestrpos = NULL;
+    routine->amestimateparallelscan = NULL;
+    routine->aminitparallelscan = NULL;
+    routine->amparallelrescan = NULL;
+
+    PG_RETURN_POINTER(routine);
+}
+
+/*
+ * A tuple in the heap is being inserted. To keep a brin index up to date,
+ * we need to obtain the relevant index tuple and compare its stored values
+ * with those of the new tuple. If the tuple values are not consistent with
+ * the summary tuple, we need to update the index tuple.
+ *
+ * If autosummarization is enabled, check if we need to summarize the previous
+ * page range.
+ *
+ * If the range is not currently summarized (i.e. the revmap returns NULL for
+ * it), there's nothing to do for this tuple.
+ *
+ * Parameters actually consulted here are idxRel, values, nulls, heaptid and
+ * indexInfo; heapRel, checkUnique and indexUnchanged are accepted but not
+ * referenced in the body.
+ *
+ * The function always returns false.
+ *
+ * The body is a retry loop: whenever brin_doupdate() reports failure, the
+ * per-call memory context is reset and the whole lookup/compare/update
+ * sequence is run again from the top.
+ */
+bool
+brininsert(Relation idxRel, Datum *values, bool *nulls,
+ ItemPointer heaptid, Relation heapRel,
+ IndexUniqueCheck checkUnique,
+ bool indexUnchanged,
+ IndexInfo *indexInfo)
+{
+ BlockNumber pagesPerRange;
+ BlockNumber origHeapBlk;
+ BlockNumber heapBlk;
+ BrinDesc *bdesc = (BrinDesc *) indexInfo->ii_AmCache;
+ BrinRevmap *revmap;
+ Buffer buf = InvalidBuffer;
+ MemoryContext tupcxt = NULL;
+ MemoryContext oldcxt = CurrentMemoryContext;
+ bool autosummarize = BrinGetAutoSummarize(idxRel);
+
+ revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL);
+
+ /*
+ * origHeapBlk is the block number where the insertion occurred. heapBlk
+ * is the first block in the corresponding page range.
+ */
+ origHeapBlk = ItemPointerGetBlockNumber(heaptid);
+ heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange;
+
+ for (;;)
+ {
+ bool need_insert = false;
+ OffsetNumber off;
+ BrinTuple *brtup;
+ BrinMemTuple *dtup;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * If auto-summarization is enabled and we just inserted the first
+ * tuple into the first block of a new non-first page range, request a
+ * summarization run of the previous range.
+ *
+ * The previous range is addressed through its last block, heapBlk - 1.
+ * A request is filed only when no summary tuple is found for it; if
+ * the request cannot be recorded we merely emit a LOG message.
+ */
+ if (autosummarize &&
+ heapBlk > 0 &&
+ heapBlk == origHeapBlk &&
+ ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber)
+ {
+ BlockNumber lastPageRange = heapBlk - 1;
+ BrinTuple *lastPageTuple;
+
+ lastPageTuple =
+ brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off,
+ NULL, BUFFER_LOCK_SHARE, NULL);
+ if (!lastPageTuple)
+ {
+ bool recorded;
+
+ recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange,
+ RelationGetRelid(idxRel),
+ lastPageRange);
+ if (!recorded)
+ ereport(LOG,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded",
+ RelationGetRelationName(idxRel),
+ lastPageRange)));
+ }
+ else
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ }
+
+ brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off,
+ NULL, BUFFER_LOCK_SHARE, NULL);
+
+ /* if range is unsummarized, there's nothing to do */
+ if (!brtup)
+ break;
+
+ /*
+ * First time through in this statement?
+ *
+ * The descriptor is cached in indexInfo->ii_AmCache and built while
+ * indexInfo->ii_Context is the current memory context.
+ */
+ if (bdesc == NULL)
+ {
+ MemoryContextSwitchTo(indexInfo->ii_Context);
+ bdesc = brin_build_desc(idxRel);
+ indexInfo->ii_AmCache = (void *) bdesc;
+ MemoryContextSwitchTo(oldcxt);
+ }
+ /*
+ * First time through in this brininsert call?
+ *
+ * tupcxt is created under the current context and made current. It is
+ * reset before every retry and deleted before we return.
+ */
+ if (tupcxt == NULL)
+ {
+ tupcxt = AllocSetContextCreate(CurrentMemoryContext,
+ "brininsert cxt",
+ ALLOCSET_DEFAULT_SIZES);
+ MemoryContextSwitchTo(tupcxt);
+ }
+
+ dtup = brin_deform_tuple(bdesc, brtup, NULL);
+
+ need_insert = add_values_to_range(idxRel, bdesc, dtup, values, nulls);
+
+ if (!need_insert)
+ {
+ /*
+ * The tuple is consistent with the new values, so there's nothing
+ * to do.
+ */
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ }
+ else
+ {
+ Page page = BufferGetPage(buf);
+ ItemId lp = PageGetItemId(page, off);
+ Size origsz;
+ BrinTuple *origtup;
+ Size newsz;
+ BrinTuple *newtup;
+ bool samepage;
+
+ /*
+ * Make a copy of the old tuple, so that we can compare it after
+ * re-acquiring the lock.
+ */
+ origsz = ItemIdGetLength(lp);
+ origtup = brin_copy_tuple(brtup, origsz, NULL, NULL);
+
+ /*
+ * Before releasing the lock, check if we can attempt a same-page
+ * update. Another process could insert a tuple concurrently in
+ * the same page though, so downstream we must be prepared to cope
+ * if this turns out to not be possible after all.
+ */
+ newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);
+ samepage = brin_can_do_samepage_update(buf, origsz, newsz);
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+ /*
+ * Try to update the tuple. If this doesn't work for whatever
+ * reason, we need to restart from the top; the revmap might be
+ * pointing at a different tuple for this block now, so we need to
+ * recompute to ensure both our new heap tuple and the other
+ * inserter's are covered by the combined tuple. It might be that
+ * we don't need to update at all.
+ */
+ if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,
+ buf, off, origtup, origsz, newtup, newsz,
+ samepage))
+ {
+ /* no luck; start over */
+ MemoryContextResetAndDeleteChildren(tupcxt);
+ continue;
+ }
+ }
+
+ /* success! (either the update went through or none was needed) */
+ break;
+ }
+
+ brinRevmapTerminate(revmap);
+ if (BufferIsValid(buf))
+ ReleaseBuffer(buf);
+ MemoryContextSwitchTo(oldcxt);
+ if (tupcxt != NULL)
+ MemoryContextDelete(tupcxt);
+
+ return false;
+}
+
+/*
+ * Set up a BRIN index scan.
+ *
+ * The revmap is opened here, which also yields the pages-per-range value the
+ * index was built with. That value cannot change while we hold a lock on the
+ * index, so brinrescan has no need to fetch it again.
+ *
+ * Returns the generic scan descriptor with a freshly palloc'd BrinOpaque
+ * attached as scan->opaque.
+ */
+IndexScanDesc
+brinbeginscan(Relation r, int nkeys, int norderbys)
+{
+    IndexScanDesc desc = RelationGetIndexScan(r, nkeys, norderbys);
+    BrinOpaque *state = (BrinOpaque *) palloc(sizeof(BrinOpaque));
+
+    /* open the revmap; this also fills in the pages-per-range value */
+    state->bo_rmAccess = brinRevmapInitialize(r,
+                                              &state->bo_pagesPerRange,
+                                              desc->xs_snapshot);
+    state->bo_bdesc = brin_build_desc(r);
+
+    desc->opaque = state;
+    return desc;
+}
+
+/*
+ * Execute the index scan.
+ *
+ * This works by reading index TIDs from the revmap, and obtaining the index
+ * tuples pointed to by them; the summary values in the index tuples are
+ * compared to the scan keys. We return into the TID bitmap all the pages in
+ * ranges corresponding to index tuples that match the scan keys.
+ *
+ * If a TID from the revmap is read as InvalidTID, we know that range is
+ * unsummarized. Pages in those ranges need to be returned regardless of scan
+ * keys.
+ *
+ * The return value is ten times the number of heap pages added to the bitmap;
+ * it is only a rough stand-in for a tuple count. The page counter is kept in
+ * an int64 so that neither the count nor the multiplication can overflow on
+ * very large tables.
+ */
+int64
+bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
+{
+    Relation idxRel = scan->indexRelation;
+    Buffer buf = InvalidBuffer;
+    BrinDesc *bdesc;
+    Oid heapOid;
+    Relation heapRel;
+    BrinOpaque *opaque;
+    BlockNumber nblocks;
+    BlockNumber heapBlk;
+    int64 totalpages = 0;
+    FmgrInfo *consistentFn;
+    MemoryContext oldcxt;
+    MemoryContext perRangeCxt;
+    BrinMemTuple *dtup;
+    BrinTuple *btup = NULL;
+    Size btupsz = 0;
+    ScanKey **keys,
+            **nullkeys;
+    int *nkeys,
+        *nnullkeys;
+    int keyno;
+    char *ptr;
+    Size len;
+    char *tmp PG_USED_FOR_ASSERTS_ONLY;
+
+    opaque = (BrinOpaque *) scan->opaque;
+    bdesc = opaque->bo_bdesc;
+    pgstat_count_index_scan(idxRel);
+
+    /*
+     * We need to know the size of the table so that we know how long to
+     * iterate on the revmap.
+     */
+    heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);
+    heapRel = table_open(heapOid, AccessShareLock);
+    nblocks = RelationGetNumberOfBlocks(heapRel);
+    table_close(heapRel, AccessShareLock);
+
+    /*
+     * Make room for the consistent support procedures of indexed columns. We
+     * don't look them up here; we do that lazily the first time we see a scan
+     * key reference each of them. We rely on zeroing fn_oid to InvalidOid.
+     */
+    consistentFn = palloc0(sizeof(FmgrInfo) * bdesc->bd_tupdesc->natts);
+
+    /*
+     * Make room for per-attribute lists of scan keys that we'll pass to the
+     * consistent support procedure. We don't know which attributes have scan
+     * keys, so we allocate space for all attributes. That may use more memory
+     * but it's probably cheaper than determining which attributes are used.
+     *
+     * We keep null and regular keys separate, so that we can pass just the
+     * regular keys to the consistent function easily.
+     *
+     * To reduce the allocation overhead, we allocate one big chunk and then
+     * carve it into smaller arrays ourselves. All the pieces have exactly the
+     * same lifetime, so that's OK.
+     *
+     * XXX The widest index can have 32 attributes, so the amount of wasted
+     * memory is negligible. We could invent a more compact approach (with
+     * just space for used attributes) but that would make the matching more
+     * complex so it's not a good trade-off.
+     */
+    len =
+        MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* regular keys */
+        MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
+        MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts) +
+        MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* NULL keys */
+        MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
+        MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
+
+    ptr = palloc(len);
+    tmp = ptr;
+
+    keys = (ScanKey **) ptr;
+    ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
+
+    nullkeys = (ScanKey **) ptr;
+    ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
+
+    nkeys = (int *) ptr;
+    ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
+
+    nnullkeys = (int *) ptr;
+    ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
+
+    for (int i = 0; i < bdesc->bd_tupdesc->natts; i++)
+    {
+        keys[i] = (ScanKey *) ptr;
+        ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
+
+        nullkeys[i] = (ScanKey *) ptr;
+        ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
+    }
+
+    /* the carving above must have consumed exactly the chunk we allocated */
+    Assert(tmp + len == ptr);
+
+    /* zero the number of keys */
+    memset(nkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
+    memset(nnullkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
+
+    /* Preprocess the scan keys - split them into per-attribute arrays. */
+    for (keyno = 0; keyno < scan->numberOfKeys; keyno++)
+    {
+        ScanKey key = &scan->keyData[keyno];
+        AttrNumber keyattno = key->sk_attno;
+
+        /*
+         * The collation of the scan key must match the collation used in the
+         * index column (but only if the search is not IS NULL/ IS NOT NULL).
+         * Otherwise we shouldn't be using this index ...
+         */
+        Assert((key->sk_flags & SK_ISNULL) ||
+               (key->sk_collation ==
+                TupleDescAttr(bdesc->bd_tupdesc,
+                              keyattno - 1)->attcollation));
+
+        /*
+         * First time we see this index attribute, so init as needed.
+         *
+         * This is a bit of an overkill - we don't know how many scan keys are
+         * there for this attribute, so we simply allocate the largest number
+         * possible (as if all keys were for this attribute). This may waste a
+         * bit of memory, but we only expect small number of scan keys in
+         * general, so this should be negligible, and repeated repalloc calls
+         * are not free either.
+         */
+        if (consistentFn[keyattno - 1].fn_oid == InvalidOid)
+        {
+            FmgrInfo *procinfo;
+
+            /* First time we see this attribute, so no key/null keys. */
+            Assert(nkeys[keyattno - 1] == 0);
+            Assert(nnullkeys[keyattno - 1] == 0);
+
+            procinfo = index_getprocinfo(idxRel, keyattno,
+                                         BRIN_PROCNUM_CONSISTENT);
+            fmgr_info_copy(&consistentFn[keyattno - 1], procinfo,
+                           CurrentMemoryContext);
+        }
+
+        /* Add key to the proper per-attribute array. */
+        if (key->sk_flags & SK_ISNULL)
+        {
+            nullkeys[keyattno - 1][nnullkeys[keyattno - 1]] = key;
+            nnullkeys[keyattno - 1]++;
+        }
+        else
+        {
+            keys[keyattno - 1][nkeys[keyattno - 1]] = key;
+            nkeys[keyattno - 1]++;
+        }
+    }
+
+    /*
+     * Allocate an initial in-memory tuple outside of the per-range memcxt
+     * (which is created just below), so it is not wiped by the per-range
+     * resets.
+     */
+    dtup = brin_new_memtuple(bdesc);
+
+    /*
+     * Setup and use a per-range memory context, which is reset every time we
+     * loop below. This avoids having to free the tuples within the loop.
+     */
+    perRangeCxt = AllocSetContextCreate(CurrentMemoryContext,
+                                        "bringetbitmap cxt",
+                                        ALLOCSET_DEFAULT_SIZES);
+    oldcxt = MemoryContextSwitchTo(perRangeCxt);
+
+    /*
+     * Now scan the revmap. We start by querying for heap page 0,
+     * incrementing by the number of pages per range; this gives us a full
+     * view of the table.
+     */
+    for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)
+    {
+        bool addrange;
+        bool gottuple = false;
+        BrinTuple *tup;
+        OffsetNumber off;
+        Size size;
+
+        CHECK_FOR_INTERRUPTS();
+
+        MemoryContextResetAndDeleteChildren(perRangeCxt);
+
+        tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf,
+                                       &off, &size, BUFFER_LOCK_SHARE,
+                                       scan->xs_snapshot);
+        if (tup)
+        {
+            /* work on a private copy so the buffer lock can be dropped */
+            gottuple = true;
+            btup = brin_copy_tuple(tup, size, btup, &btupsz);
+            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+        }
+
+        /*
+         * For page ranges with no indexed tuple, we must return the whole
+         * range; otherwise, compare it to the scan keys.
+         */
+        if (!gottuple)
+        {
+            addrange = true;
+        }
+        else
+        {
+            dtup = brin_deform_tuple(bdesc, btup, dtup);
+            if (dtup->bt_placeholder)
+            {
+                /*
+                 * Placeholder tuples are always returned, regardless of the
+                 * values stored in them.
+                 */
+                addrange = true;
+            }
+            else
+            {
+                int attno;
+
+                /*
+                 * Compare scan keys with summary values stored for the range.
+                 * If scan keys are matched, the page range must be added to
+                 * the bitmap. We initially assume the range needs to be
+                 * added; in particular this serves the case where there are
+                 * no keys.
+                 */
+                addrange = true;
+                for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++)
+                {
+                    BrinValues *bval;
+                    Datum add;
+                    Oid collation;
+
+                    /*
+                     * skip attributes without any scan keys (both regular and
+                     * IS [NOT] NULL)
+                     */
+                    if (nkeys[attno - 1] == 0 && nnullkeys[attno - 1] == 0)
+                        continue;
+
+                    bval = &dtup->bt_columns[attno - 1];
+
+                    /*
+                     * First check if there are any IS [NOT] NULL scan keys,
+                     * and if we're violating them. In that case we can
+                     * terminate early, without invoking the support function.
+                     *
+                     * As there may be more keys, we can only determine
+                     * mismatch within this loop.
+                     */
+                    if (bdesc->bd_info[attno - 1]->oi_regular_nulls &&
+                        !check_null_keys(bval, nullkeys[attno - 1],
+                                         nnullkeys[attno - 1]))
+                    {
+                        /*
+                         * If any of the IS [NOT] NULL keys failed, the page
+                         * range as a whole can't pass. So terminate the loop.
+                         */
+                        addrange = false;
+                        break;
+                    }
+
+                    /*
+                     * So either there are no IS [NOT] NULL keys, or all
+                     * passed. If there are no regular scan keys, we're done -
+                     * the page range matches. If there are regular keys, but
+                     * the page range is marked as 'all nulls' it can't
+                     * possibly pass (we're assuming the operators are
+                     * strict).
+                     */
+
+                    /* No regular scan keys - page range as a whole passes. */
+                    if (!nkeys[attno - 1])
+                        continue;
+
+                    Assert((nkeys[attno - 1] > 0) &&
+                           (nkeys[attno - 1] <= scan->numberOfKeys));
+
+                    /* If it is all nulls, it cannot possibly be consistent. */
+                    if (bval->bv_allnulls)
+                    {
+                        addrange = false;
+                        break;
+                    }
+
+                    /*
+                     * Collation from the first key (has to be the same for
+                     * all keys for the same attribute).
+                     */
+                    collation = keys[attno - 1][0]->sk_collation;
+
+                    /*
+                     * Check whether the scan key is consistent with the page
+                     * range values; if so, have the pages in the range added
+                     * to the output bitmap.
+                     *
+                     * The opclass may or may not support processing of
+                     * multiple scan keys. We can determine that based on the
+                     * number of arguments - functions with extra parameter
+                     * (number of scan keys) do support this, otherwise we
+                     * have to simply pass the scan keys one by one.
+                     */
+                    if (consistentFn[attno - 1].fn_nargs >= 4)
+                    {
+                        /* Check all keys at once */
+                        add = FunctionCall4Coll(&consistentFn[attno - 1],
+                                                collation,
+                                                PointerGetDatum(bdesc),
+                                                PointerGetDatum(bval),
+                                                PointerGetDatum(keys[attno - 1]),
+                                                Int32GetDatum(nkeys[attno - 1]));
+                        addrange = DatumGetBool(add);
+                    }
+                    else
+                    {
+                        /*
+                         * Check keys one by one
+                         *
+                         * When there are multiple scan keys, failure to meet
+                         * the criteria for a single one of them is enough to
+                         * discard the range as a whole, so break out of the
+                         * loop as soon as a false return value is obtained.
+                         */
+                        int k;
+
+                        for (k = 0; k < nkeys[attno - 1]; k++)
+                        {
+                            add = FunctionCall3Coll(&consistentFn[attno - 1],
+                                                    keys[attno - 1][k]->sk_collation,
+                                                    PointerGetDatum(bdesc),
+                                                    PointerGetDatum(bval),
+                                                    PointerGetDatum(keys[attno - 1][k]));
+                            addrange = DatumGetBool(add);
+                            if (!addrange)
+                                break;
+                        }
+                    }
+                }
+            }
+        }
+
+        /* add the pages in the range to the output bitmap, if needed */
+        if (addrange)
+        {
+            BlockNumber pageno;
+
+            for (pageno = heapBlk;
+                 pageno <= Min(nblocks, heapBlk + opaque->bo_pagesPerRange) - 1;
+                 pageno++)
+            {
+                /* the bitmap must grow in the caller's context, not ours */
+                MemoryContextSwitchTo(oldcxt);
+                tbm_add_page(tbm, pageno);
+                totalpages++;
+                MemoryContextSwitchTo(perRangeCxt);
+            }
+        }
+    }
+
+    MemoryContextSwitchTo(oldcxt);
+    MemoryContextDelete(perRangeCxt);
+
+    if (buf != InvalidBuffer)
+        ReleaseBuffer(buf);
+
+    /*
+     * XXX We have an approximation of the number of *pages* that our scan
+     * returns, but we don't have a precise idea of the number of heap tuples
+     * involved.
+     *
+     * totalpages is int64, so this multiplication is done in 64 bits.
+     */
+    return totalpages * 10;
+}
+
+/*
+ * Restart a BRIN index scan with a (possibly new) set of scan keys.
+ *
+ * Only the scan keys are refreshed: when a key array is supplied and the
+ * scan has at least one key, it is copied over the descriptor's own array.
+ */
+void
+brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
+           ScanKey orderbys, int norderbys)
+{
+    /*
+     * Unlike other index AMs (see _bt_preprocess_keys for an example), no
+     * scan-key preprocessing happens here: redundant keys are not removed
+     * and unsatisfiable key sets are not detected early.  Something like
+     * that could be added someday.
+     */
+
+    /* Nothing to copy without a source array or without any keys. */
+    if (scankey == NULL || scan->numberOfKeys <= 0)
+        return;
+
+    memmove(scan->keyData, scankey,
+            sizeof(ScanKeyData) * scan->numberOfKeys);
+}
+
+/*
+ * Finish a BRIN index scan, releasing everything hanging off scan->opaque.
+ */
+void
+brinendscan(IndexScanDesc scan)
+{
+    BrinOpaque *brinstate = (BrinOpaque *) scan->opaque;
+
+    /* drop the revmap accessor and the descriptor, then the struct itself */
+    brinRevmapTerminate(brinstate->bo_rmAccess);
+    brin_free_desc(brinstate->bo_bdesc);
+    pfree(brinstate);
+}
+
+/*
+ * table_index_build_scan callback, invoked once per heap tuple.
+ *
+ * The summary for the page range at the end of the table is left pending in
+ * the build state after the last call; it is not inserted into the index
+ * here.  The caller must take care of inserting it, if appropriate.
+ */
+static void
+brinbuildCallback(Relation index,
+                  ItemPointer tid,
+                  Datum *values,
+                  bool *isnull,
+                  bool tupleIsAlive,
+                  void *brstate)
+{
+    BrinBuildState *build = (BrinBuildState *) brstate;
+    BlockNumber blkno = ItemPointerGetBlockNumber(tid);
+
+    /*
+     * The tuple may live in a range later than the one being accumulated.
+     * Close out ranges until we reach the one containing it.  Because the
+     * scan can skip any number of pages that hold no live tuples, several
+     * ranges may need closing; each of them still gets an index tuple.
+     */
+    while (blkno > build->bs_currRangeStart + build->bs_pagesPerRange - 1)
+    {
+        BRIN_elog((DEBUG2,
+                   "brinbuildCallback: completed a range: %u--%u",
+                   build->bs_currRangeStart,
+                   build->bs_currRangeStart + build->bs_pagesPerRange));
+
+        /* flush the finished range into the index ... */
+        form_and_insert_tuple(build);
+
+        /* ... step to the following range ... */
+        build->bs_currRangeStart += build->bs_pagesPerRange;
+
+        /* ... and reset the accumulator for it */
+        brin_memtuple_initialize(build->bs_dtuple, build->bs_bdesc);
+    }
+
+    /* Fold this tuple's values into the summary being accumulated. */
+    (void) add_values_to_range(index, build->bs_bdesc, build->bs_dtuple,
+                               values, isnull);
+}
+
+/*
+ * brinbuild() -- build a new BRIN index.
+ *
+ * The metapage is created first (and WAL-logged when the index needs WAL).
+ * The heap is then scanned in physical block order, with brinbuildCallback
+ * accumulating one summary per page range; the summary still pending when
+ * the scan ends is inserted here.
+ *
+ * Returns a palloc'd IndexBuildResult holding the heap tuple count reported
+ * by the scan and the number of index tuples inserted during the build.
+ * Raises an error if the index relation already contains any blocks.
+ */
+IndexBuildResult *
+brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
+{
+ IndexBuildResult *result;
+ double reltuples;
+ double idxtuples;
+ BrinRevmap *revmap;
+ BrinBuildState *state;
+ Buffer meta;
+ BlockNumber pagesPerRange;
+
+ /*
+ * We expect to be called exactly once for any index relation.
+ */
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "index \"%s\" already contains data",
+ RelationGetRelationName(index));
+
+ /*
+ * Critical section not required, because on error the creation of the
+ * whole relation will be rolled back.
+ */
+
+ meta = ReadBuffer(index, P_NEW); /* first block of the index; asserted below to be the metapage */
+ Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);
+ LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE);
+
+ brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
+ BRIN_CURRENT_VERSION);
+ MarkBufferDirty(meta);
+
+ if (RelationNeedsWAL(index))
+ {
+ xl_brin_createidx xlrec;
+ XLogRecPtr recptr;
+ Page page;
+
+ xlrec.version = BRIN_CURRENT_VERSION;
+ xlrec.pagesPerRange = BrinGetPagesPerRange(index);
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx);
+ XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT | REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX);
+
+ page = BufferGetPage(meta);
+ PageSetLSN(page, recptr); /* stamp the metapage with the LSN of its creation record */
+ }
+
+ UnlockReleaseBuffer(meta);
+
+ /*
+ * Initialize our state, including the deformed tuple state.
+ * brinRevmapInitialize also reports the index's pages-per-range setting,
+ * which the build state needs.
+ */
+ revmap = brinRevmapInitialize(index, &pagesPerRange, NULL);
+ state = initialize_brin_buildstate(index, revmap, pagesPerRange);
+
+ /*
+ * Now scan the relation. No syncscan allowed here because we want the
+ * heap blocks in physical order.
+ */
+ reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
+ brinbuildCallback, (void *) state, NULL);
+
+ /* process the final batch */
+ form_and_insert_tuple(state);
+
+ /* release resources */
+ idxtuples = state->bs_numtuples; /* save the count before the state is freed */
+ brinRevmapTerminate(state->bs_rmAccess);
+ terminate_brin_buildstate(state);
+
+ /*
+ * Return statistics
+ */
+ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+
+ result->heap_tuples = reltuples;
+ result->index_tuples = idxtuples;
+
+ return result;
+}
+
+/*
+ * brinbuildempty() -- write an empty BRIN index into the index's init fork.
+ *
+ * An empty BRIN index consists of the metapage and nothing else.
+ */
+void
+brinbuildempty(Relation index)
+{
+    Buffer      initbuf;
+
+    /* Get a new block in the init fork and lock it for writing. */
+    initbuf = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL,
+                                 NULL);
+    LockBuffer(initbuf, BUFFER_LOCK_EXCLUSIVE);
+
+    /* Fill in the metapage and WAL-log it, all inside a critical section. */
+    START_CRIT_SECTION();
+    brin_metapage_init(BufferGetPage(initbuf),
+                       BrinGetPagesPerRange(index),
+                       BRIN_CURRENT_VERSION);
+    MarkBufferDirty(initbuf);
+    log_newpage_buffer(initbuf, true);
+    END_CRIT_SECTION();
+
+    UnlockReleaseBuffer(initbuf);
+}
+
+/*
+ * brinbulkdelete
+ *      BRIN indexes hold no per-heap-tuple index tuples, so there is not
+ *      much to do here: the callback is never invoked and only the stats
+ *      struct is handed back.
+ *
+ * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap
+ * tuple is deleted), meaning the need to re-run summarization on the affected
+ * range. Would need to add an extra flag in brintuples for that.
+ */
+IndexBulkDeleteResult *
+brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+               IndexBulkDeleteCallback callback, void *callback_state)
+{
+    /* Reuse the caller's struct when there is one ... */
+    if (stats != NULL)
+        return stats;
+
+    /* ... otherwise this is the first call, so hand back a zeroed one. */
+    return (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+}
+
+/*
+ * "Vacuum" a BRIN index: clean up the index pages and then summarize every
+ * complete page range that currently has no summary.
+ *
+ * Returns the stats struct (allocated here if the caller passed none), or
+ * the caller's pointer untouched in ANALYZE ONLY mode.
+ */
+IndexBulkDeleteResult *
+brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
+{
+    Relation    index = info->index;
+    Relation    heap;
+    Oid         heapOid;
+
+    /* ANALYZE ONLY has nothing to do here */
+    if (info->analyze_only)
+        return stats;
+
+    if (stats == NULL)
+        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+
+    /* everything but num_pages starts out zeroed */
+    stats->num_pages = RelationGetNumberOfBlocks(index);
+
+    heapOid = IndexGetRelation(RelationGetRelid(index), false);
+    heap = table_open(heapOid, AccessShareLock);
+
+    brin_vacuum_scan(index, info->strategy);
+
+    /*
+     * Newly summarized and already existing ranges are both accumulated
+     * into num_index_tuples, hence the same pointer passed twice.  Partial
+     * ranges at the end of the table are not summarized.
+     */
+    brinsummarize(index, heap, BRIN_ALL_BLOCKRANGES, false,
+                  &stats->num_index_tuples, &stats->num_index_tuples);
+
+    table_close(heap, AccessShareLock);
+
+    return stats;
+}
+
+/*
+ * Parse and validate the reloptions of a BRIN index.
+ *
+ * Recognized options are "pages_per_range" (integer) and "autosummarize"
+ * (boolean); the parsed values land in a BrinOptions struct.
+ */
+bytea *
+brinoptions(Datum reloptions, bool validate)
+{
+    /* option name, option type, and where the value goes in BrinOptions */
+    static const relopt_parse_elt brin_relopts[] = {
+        {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)},
+        {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)}
+    };
+
+    return (bytea *) build_reloptions(reloptions,
+                                      validate,
+                                      RELOPT_KIND_BRIN,
+                                      sizeof(BrinOptions),
+                                      brin_relopts,
+                                      lengthof(brin_relopts));
+}
+
+/*
+ * SQL-callable function that summarizes every page range of an index that
+ * has no summary yet.
+ *
+ * This is a thin wrapper: it forwards its argument to brin_summarize_range
+ * together with BRIN_ALL_BLOCKRANGES as the block number and returns that
+ * function's result.
+ */
+Datum
+brin_summarize_new_values(PG_FUNCTION_ARGS)
+{
+    Datum       indexArg = PG_GETARG_DATUM(0);
+    Datum       allRanges = Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES);
+
+    return DirectFunctionCall2(brin_summarize_range, indexArg, allRanges);
+}
+
+/*
+ * SQL-callable function to summarize the indicated page range, if not already
+ * summarized. If the second argument is BRIN_ALL_BLOCKRANGES, all
+ * unsummarized ranges are summarized.
+ *
+ * Arguments: (0) OID of the index, (1) heap block number as int64.
+ * Returns the number of ranges that were newly summarized, as int32.
+ *
+ * Errors out if recovery is in progress, if the block number is negative or
+ * greater than BRIN_ALL_BLOCKRANGES, if the relation is not a BRIN index, if
+ * the calling user does not own the index, or if the parent table cannot be
+ * determined consistently.
+ */
+Datum
+brin_summarize_range(PG_FUNCTION_ARGS)
+{
+ Oid indexoid = PG_GETARG_OID(0);
+ int64 heapBlk64 = PG_GETARG_INT64(1);
+ BlockNumber heapBlk;
+ Oid heapoid;
+ Relation indexRel;
+ Relation heapRel;
+ Oid save_userid;
+ int save_sec_context;
+ int save_nestlevel;
+ double numSummarized = 0; /* incremented by brinsummarize for each range it summarizes */
+
+ if (RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("BRIN control functions cannot be executed during recovery.")));
+
+ if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0)
+ {
+ char *blk = psprintf(INT64_FORMAT, heapBlk64);
+
+ ereport(ERROR,
+ (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+ errmsg("block number out of range: %s", blk)));
+ }
+ heapBlk = (BlockNumber) heapBlk64; /* range-checked just above */
+
+ /*
+ * We must lock table before index to avoid deadlocks. However, if the
+ * passed indexoid isn't an index then IndexGetRelation() will fail.
+ * Rather than emitting a not-very-helpful error message, postpone
+ * complaining, expecting that the is-it-an-index test below will fail.
+ */
+ heapoid = IndexGetRelation(indexoid, true);
+ if (OidIsValid(heapoid))
+ {
+ heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
+
+ /*
+ * Autovacuum calls us. For its benefit, switch to the table owner's
+ * userid, so that any index functions are run as that user. Also
+ * lock down security-restricted operations and arrange to make GUC
+ * variable changes local to this command. This is harmless, albeit
+ * unnecessary, when called from SQL, because we fail shortly if the
+ * user does not own the index.
+ */
+ GetUserIdAndSecContext(&save_userid, &save_sec_context);
+ SetUserIdAndSecContext(heapRel->rd_rel->relowner,
+ save_sec_context | SECURITY_RESTRICTED_OPERATION);
+ save_nestlevel = NewGUCNestLevel();
+ }
+ else
+ {
+ heapRel = NULL;
+ /* Set these just to suppress "uninitialized variable" warnings */
+ save_userid = InvalidOid;
+ save_sec_context = -1;
+ save_nestlevel = -1;
+ }
+
+ indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
+
+ /* Must be a BRIN index */
+ if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
+ indexRel->rd_rel->relam != BRIN_AM_OID)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("\"%s\" is not a BRIN index",
+ RelationGetRelationName(indexRel))));
+
+ /*
+ * User must own the index (comparable to privileges needed for VACUUM).
+ * The check is made against the userid saved before the switch above,
+ * i.e. the original caller, and is skipped when no heap was found (the
+ * recheck below reports that case).
+ */
+ if (heapRel != NULL && !pg_class_ownercheck(indexoid, save_userid))
+ aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
+ RelationGetRelationName(indexRel));
+
+ /*
+ * Since we did the IndexGetRelation call above without any lock, it's
+ * barely possible that a race against an index drop/recreation could have
+ * netted us the wrong table. Recheck.
+ */
+ if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_TABLE),
+ errmsg("could not open parent table of index \"%s\"",
+ RelationGetRelationName(indexRel))));
+
+ /* OK, do it */
+ brinsummarize(indexRel, heapRel, heapBlk, true, &numSummarized, NULL); /* true: also summarize a partial final range */
+
+ /* Roll back any GUC changes executed by index functions */
+ AtEOXact_GUC(false, save_nestlevel);
+
+ /* Restore userid and security context */
+ SetUserIdAndSecContext(save_userid, save_sec_context);
+
+ relation_close(indexRel, ShareUpdateExclusiveLock);
+ relation_close(heapRel, ShareUpdateExclusiveLock);
+
+ PG_RETURN_INT32((int32) numSummarized);
+}
+
+/*
+ * SQL-callable function that marks a page range as no longer summarized.
+ *
+ * Arguments: (0) OID of the index, (1) heap block number as int64, which
+ * must lie within 0 .. MaxBlockNumber.  Returns void.
+ */
+Datum
+brin_desummarize_range(PG_FUNCTION_ARGS)
+{
+    Oid         indexoid = PG_GETARG_OID(0);
+    int64       heapBlk64 = PG_GETARG_INT64(1);
+    BlockNumber heapBlk;
+    Oid         heapoid;
+    Relation    heapRel = NULL;
+    Relation    indexRel;
+
+    if (RecoveryInProgress())
+        ereport(ERROR,
+                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                 errmsg("recovery is in progress"),
+                 errhint("BRIN control functions cannot be executed during recovery.")));
+
+    if (heapBlk64 < 0 || heapBlk64 > MaxBlockNumber)
+    {
+        char       *blk = psprintf(INT64_FORMAT, heapBlk64);
+
+        ereport(ERROR,
+                (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+                 errmsg("block number out of range: %s", blk)));
+    }
+    heapBlk = (BlockNumber) heapBlk64;
+
+    /*
+     * Lock order is table first, then index, to avoid deadlocks.  If
+     * indexoid is not an index at all, IndexGetRelation() finds nothing; in
+     * that case we leave heapRel NULL and let the is-it-a-BRIN-index test
+     * below produce the (more helpful) error.
+     *
+     * Unlike brin_summarize_range(), autovacuum never calls this. Hence, we
+     * don't switch userid.
+     */
+    heapoid = IndexGetRelation(indexoid, true);
+    if (OidIsValid(heapoid))
+        heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
+
+    indexRel = index_open(indexoid, ShareUpdateExclusiveLock);
+
+    /* Reject anything that is not a BRIN index. */
+    if (!(indexRel->rd_rel->relkind == RELKIND_INDEX &&
+          indexRel->rd_rel->relam == BRIN_AM_OID))
+        ereport(ERROR,
+                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+                 errmsg("\"%s\" is not a BRIN index",
+                        RelationGetRelationName(indexRel))));
+
+    /* Only the index owner may do this (comparable to VACUUM privileges). */
+    if (!pg_class_ownercheck(indexoid, GetUserId()))
+        aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
+                       RelationGetRelationName(indexRel));
+
+    /*
+     * The first IndexGetRelation call ran without any lock held, so a race
+     * against an index drop/recreation could conceivably have given us the
+     * wrong table.  Look it up again now and compare.
+     */
+    if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
+        ereport(ERROR,
+                (errcode(ERRCODE_UNDEFINED_TABLE),
+                 errmsg("could not open parent table of index \"%s\"",
+                        RelationGetRelationName(indexRel))));
+
+    /* The revmap does the hard work; keep calling until it reports done. */
+    while (!brinRevmapDesummarizeRange(indexRel, heapBlk))
+        ;
+
+    relation_close(indexRel, ShareUpdateExclusiveLock);
+    relation_close(heapRel, ShareUpdateExclusiveLock);
+
+    PG_RETURN_VOID();
+}
+
+/*
+ * Build the BrinDesc needed to create or scan the given BRIN index.
+ *
+ * The descriptor, and everything allocated while building it, lives in a
+ * dedicated memory context created as a child of the current one; that
+ * context is recorded in bd_context.
+ */
+BrinDesc *
+brin_build_desc(Relation rel)
+{
+    MemoryContext desccxt;
+    MemoryContext callercxt;
+    TupleDesc   tupdesc;
+    BrinOpcInfo **infos;
+    BrinDesc   *result;
+    int         nstored = 0;
+    int         attidx;
+    long        descsize;
+
+    desccxt = AllocSetContextCreate(CurrentMemoryContext,
+                                    "brin desc cxt",
+                                    ALLOCSET_SMALL_SIZES);
+    callercxt = MemoryContextSwitchTo(desccxt);
+    tupdesc = RelationGetDescr(rel);
+
+    /*
+     * Ask each indexed column's opclass for its BrinOpcInfo.  How many
+     * datums are stored per column is up to the opclass, so total them up
+     * as we go.
+     */
+    infos = (BrinOpcInfo **) palloc(sizeof(BrinOpcInfo *) * tupdesc->natts);
+    for (attidx = 0; attidx < tupdesc->natts; attidx++)
+    {
+        Form_pg_attribute att = TupleDescAttr(tupdesc, attidx);
+        FmgrInfo   *infoFn = index_getprocinfo(rel, attidx + 1,
+                                               BRIN_PROCNUM_OPCINFO);
+
+        infos[attidx] = (BrinOpcInfo *)
+            DatumGetPointer(FunctionCall1(infoFn, att->atttypid));
+        nstored += infos[attidx]->oi_nstored;
+    }
+
+    /* The struct ends in an array with one BrinOpcInfo pointer per column. */
+    descsize = offsetof(BrinDesc, bd_info) +
+        sizeof(BrinOpcInfo *) * tupdesc->natts;
+
+    result = palloc(descsize);
+    result->bd_context = desccxt;
+    result->bd_index = rel;
+    result->bd_tupdesc = tupdesc;
+    result->bd_disktdesc = NULL;    /* generated lazily */
+    result->bd_totalstored = nstored;
+
+    for (attidx = 0; attidx < tupdesc->natts; attidx++)
+        result->bd_info[attidx] = infos[attidx];
+
+    /* the scratch array has served its purpose */
+    pfree(infos);
+
+    MemoryContextSwitchTo(callercxt);
+
+    return result;
+}
+
+/*
+ * Free a BrinDesc by deleting the memory context recorded in bd_context.
+ */
+void
+brin_free_desc(BrinDesc *bdesc)
+{
+    MemoryContext desccxt = bdesc->bd_context;
+
+    /* the tuple descriptor must still be referenced at this point */
+    Assert(bdesc->bd_tupdesc->tdrefcount >= 1);
+
+    /* one context delete replaces retail pfree calls */
+    MemoryContextDelete(desccxt);
+}
+
+/*
+ * Read the index's statistical data from its metapage into *stats.
+ *
+ * The metapage is read under a share lock.  pagesPerRange is copied as-is;
+ * revmapNumPages is derived as lastRevmapPage - 1.
+ */
+void
+brinGetStats(Relation index, BrinStatsData *stats)
+{
+    Buffer      metabuf = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
+    BrinMetaPageData *meta;
+
+    LockBuffer(metabuf, BUFFER_LOCK_SHARE);
+    meta = (BrinMetaPageData *) PageGetContents(BufferGetPage(metabuf));
+
+    stats->pagesPerRange = meta->pagesPerRange;
+    stats->revmapNumPages = meta->lastRevmapPage - 1;
+
+    UnlockReleaseBuffer(metabuf);
+}
+
+/*
+ * Allocate and set up a BrinBuildState for creating tuples on the given
+ * index.
+ *
+ * The state starts at heap block 0 with no tuples counted and no insert
+ * buffer, and owns a freshly built BrinDesc plus an initialized in-memory
+ * tuple.
+ */
+static BrinBuildState *
+initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
+                           BlockNumber pagesPerRange)
+{
+    BrinBuildState *bs = palloc(sizeof(BrinBuildState));
+
+    /* what we are building, and through which revmap */
+    bs->bs_irel = idxRel;
+    bs->bs_rmAccess = revmap;
+    bs->bs_pagesPerRange = pagesPerRange;
+
+    /* progress tracking */
+    bs->bs_numtuples = 0;
+    bs->bs_currRangeStart = 0;
+    bs->bs_currentInsertBuf = InvalidBuffer;
+
+    /* descriptor first: the in-memory tuple is created from it */
+    bs->bs_bdesc = brin_build_desc(idxRel);
+    bs->bs_dtuple = brin_new_memtuple(bs->bs_bdesc);
+
+    brin_memtuple_initialize(bs->bs_dtuple, bs->bs_bdesc);
+
+    return bs;
+}
+
+/*
+ * Tear down a BrinBuildState and free what it owns.
+ *
+ * The revmap accessor stored in the state is not touched here.
+ */
+static void
+terminate_brin_buildstate(BrinBuildState *state)
+{
+    Buffer      lastbuf = state->bs_currentInsertBuf;
+
+    /*
+     * If an insert buffer is still held, let go of it.  While at it, record
+     * whatever free space is left on that page in the FSM so that it can be
+     * found again.
+     */
+    if (!BufferIsInvalid(lastbuf))
+    {
+        Size        avail = PageGetFreeSpace(BufferGetPage(lastbuf));
+        BlockNumber lastblk = BufferGetBlockNumber(lastbuf);
+
+        ReleaseBuffer(lastbuf);
+        RecordPageWithFreeSpace(state->bs_irel, lastblk, avail);
+        FreeSpaceMapVacuumRange(state->bs_irel, lastblk, lastblk + 1);
+    }
+
+    brin_free_desc(state->bs_bdesc);
+    pfree(state->bs_dtuple);
+    pfree(state);
+}
+
+/*
+ * On the given BRIN index, summarize the heap page range that corresponds
+ * to the heap block number given.
+ *
+ * This routine can run in parallel with insertions into the heap. To avoid
+ * missing those values from the summary tuple, we first insert a placeholder
+ * index tuple into the index, then execute the heap scan; transactions
+ * concurrent with the scan update the placeholder tuple. After the scan, we
+ * union the placeholder tuple with the one computed by this routine. The
+ * update of the index value happens in a loop, so that if somebody updates
+ * the placeholder tuple after we read it, we detect the case and try again.
+ * This ensures that the concurrently inserted tuples are not lost.
+ *
+ * A further corner case is this routine being asked to summarize the partial
+ * range at the end of the table. heapNumBlks is the (possibly outdated)
+ * table size; if we notice that the requested range lies beyond that size,
+ * we re-compute the table size after inserting the placeholder tuple, to
+ * avoid missing pages that were appended recently.
+ *
+ * heapBlk must be the first block of a page range (this is asserted). The
+ * scan results are accumulated in state->bs_dtuple, and
+ * state->bs_currRangeStart is set to heapBlk.
+ */
+static void
+summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
+ BlockNumber heapBlk, BlockNumber heapNumBlks)
+{
+ Buffer phbuf;
+ BrinTuple *phtup;
+ Size phsz;
+ OffsetNumber offset;
+ BlockNumber scanNumBlks;
+
+ /*
+ * Insert the placeholder tuple
+ */
+ phbuf = InvalidBuffer;
+ phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);
+ offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,
+ state->bs_rmAccess, &phbuf,
+ heapBlk, phtup, phsz);
+
+ /*
+ * Compute range end. We hold ShareUpdateExclusive lock on table, so it
+ * cannot shrink concurrently (but it can grow).
+ */
+ Assert(heapBlk % state->bs_pagesPerRange == 0);
+ if (heapBlk + state->bs_pagesPerRange > heapNumBlks)
+ {
+ /*
+ * If we're asked to scan what we believe to be the final range on the
+ * table (i.e. a range that might be partial) we need to recompute our
+ * idea of what the latest page is after inserting the placeholder
+ * tuple. Anyone that grows the table later will update the
+ * placeholder tuple, so it doesn't matter that we won't scan these
+ * pages ourselves. Careful: the table might have been extended
+ * beyond the current range, so clamp our result.
+ *
+ * Fortunately, this should occur infrequently.
+ */
+ scanNumBlks = Min(RelationGetNumberOfBlocks(heapRel) - heapBlk,
+ state->bs_pagesPerRange);
+ }
+ else
+ {
+ /* Easy case: range is known to be complete */
+ scanNumBlks = state->bs_pagesPerRange;
+ }
+
+ /*
+ * Execute the partial heap scan covering the heap blocks in the specified
+ * page range, summarizing the heap tuples in it. This scan stops just
+ * short of brinbuildCallback creating the new index entry.
+ *
+ * Note that it is critical we use the "any visible" mode of
+ * table_index_build_range_scan here: otherwise, we would miss tuples
+ * inserted by transactions that are still in progress, among other corner
+ * cases.
+ */
+ state->bs_currRangeStart = heapBlk;
+ table_index_build_range_scan(heapRel, state->bs_irel, indexInfo, false, true, false,
+ heapBlk, scanNumBlks,
+ brinbuildCallback, (void *) state, NULL);
+
+ /*
+ * Now we update the values obtained by the scan with the placeholder
+ * tuple. We do this in a loop which only terminates if we're able to
+ * update the placeholder tuple successfully; if we are not, this means
+ * somebody else modified the placeholder tuple after we read it.
+ */
+ for (;;)
+ {
+ BrinTuple *newtup;
+ Size newsize;
+ bool didupdate;
+ bool samepage;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Update the summary tuple and try to update.
+ */
+ newtup = brin_form_tuple(state->bs_bdesc,
+ heapBlk, state->bs_dtuple, &newsize);
+ samepage = brin_can_do_samepage_update(phbuf, phsz, newsize);
+ didupdate =
+ brin_doupdate(state->bs_irel, state->bs_pagesPerRange,
+ state->bs_rmAccess, heapBlk, phbuf, offset,
+ phtup, phsz, newtup, newsize, samepage);
+ brin_free_tuple(phtup); /* both tuples are re-created if we have to retry */
+ brin_free_tuple(newtup);
+
+ /* If the update succeeded, we're done. */
+ if (didupdate)
+ break;
+
+ /*
+ * If the update didn't work, it might be because somebody updated the
+ * placeholder tuple concurrently. Extract the new version, union it
+ * with the values we have from the scan, and start over. (There are
+ * other reasons for the update to fail, but it's simple to treat them
+ * the same.)
+ */
+ phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
+ &offset, &phsz, BUFFER_LOCK_SHARE,
+ NULL);
+ /* the placeholder tuple must exist */
+ if (phtup == NULL)
+ elog(ERROR, "missing placeholder tuple");
+ phtup = brin_copy_tuple(phtup, phsz, NULL, NULL); /* private copy, taken before the buffer lock is dropped */
+ LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);
+
+ /* merge it into the tuple from the heap scan */
+ union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);
+ }
+
+ ReleaseBuffer(phbuf);
+}
+
+/*
+ * Summarize page ranges that are not already summarized. If pageRange is
+ * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the
+ * page range containing the given heap page number is scanned.
+ * If include_partial is true, then the partial range at the end of the table
+ * is summarized, otherwise not.
+ *
+ * For each new index tuple inserted, *numSummarized (if not NULL) is
+ * incremented; for each existing tuple, *numExisting (if not NULL) is
+ * incremented.
+ *
+ * The range cursor is kept in 64 bits.  BlockNumber is 32 bits wide, so for
+ * a table whose size is within pagesPerRange blocks of 2^32, adding
+ * pagesPerRange to a BlockNumber cursor would wrap around to a small value.
+ * That would either restart the loop from the beginning of the table or make
+ * a request for the last range look like it lies beyond the end of the table.
+ */
+static void
+brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
+              bool include_partial, double *numSummarized, double *numExisting)
+{
+    BrinRevmap *revmap;
+    BrinBuildState *state = NULL;
+    IndexInfo  *indexInfo = NULL;
+    BlockNumber heapNumBlocks;
+    BlockNumber pagesPerRange;
+    Buffer      buf;
+    uint64      startBlk;       /* wider than BlockNumber: must not wrap */
+
+    revmap = brinRevmapInitialize(index, &pagesPerRange, NULL);
+
+    /* determine range of pages to process */
+    heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
+    if (pageRange == BRIN_ALL_BLOCKRANGES)
+        startBlk = 0;
+    else
+    {
+        startBlk = (uint64) (pageRange / pagesPerRange) * pagesPerRange;
+
+        /*
+         * Stop after the requested range.  The sum is computed in 64 bits;
+         * the result never exceeds heapNumBlocks, so narrowing it back to
+         * BlockNumber is safe.
+         */
+        heapNumBlocks = (BlockNumber) Min((uint64) heapNumBlocks,
+                                          startBlk + pagesPerRange);
+    }
+    if (startBlk > heapNumBlocks)
+    {
+        /* Nothing to do if start point is beyond end of table */
+        brinRevmapTerminate(revmap);
+        return;
+    }
+
+    /*
+     * Scan the revmap to find unsummarized items.
+     */
+    buf = InvalidBuffer;
+    for (; startBlk < heapNumBlocks; startBlk += pagesPerRange)
+    {
+        BrinTuple  *tup;
+        OffsetNumber off;
+        BlockNumber rangeStart;
+
+        /*
+         * Unless requested to summarize even a partial range, go away now if
+         * we think the next range is partial.  Caller would pass true when it
+         * is typically run once bulk data loading is done
+         * (brin_summarize_new_values), and false when it is typically the
+         * result of an arbitrarily-scheduled maintenance command (vacuuming).
+         */
+        if (!include_partial &&
+            (startBlk + pagesPerRange > heapNumBlocks))
+            break;
+
+        CHECK_FOR_INTERRUPTS();
+
+        /* the loop condition guarantees this fits in a BlockNumber */
+        rangeStart = (BlockNumber) startBlk;
+
+        tup = brinGetTupleForHeapBlock(revmap, rangeStart, &buf, &off, NULL,
+                                       BUFFER_LOCK_SHARE, NULL);
+        if (tup == NULL)
+        {
+            /* no revmap entry for this heap range. Summarize it. */
+            if (state == NULL)
+            {
+                /* first time through */
+                Assert(!indexInfo);
+                state = initialize_brin_buildstate(index, revmap,
+                                                   pagesPerRange);
+                indexInfo = BuildIndexInfo(index);
+            }
+            summarize_range(indexInfo, state, heapRel, rangeStart,
+                            heapNumBlocks);
+
+            /* and re-initialize state for the next range */
+            brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
+
+            if (numSummarized)
+                *numSummarized += 1.0;
+        }
+        else
+        {
+            if (numExisting)
+                *numExisting += 1.0;
+
+            /* a tuple was found, so the buffer came back locked; unlock it */
+            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+        }
+    }
+
+    if (BufferIsValid(buf))
+        ReleaseBuffer(buf);
+
+    /* free resources */
+    brinRevmapTerminate(revmap);
+    if (state)
+    {
+        terminate_brin_buildstate(state);
+        pfree(indexInfo);
+    }
+}
+
+/*
+ * Turn the deformed tuple held in the build state into its on-disk form and
+ * insert it into the index for the range starting at bs_currRangeStart,
+ * making the revmap point to it.  Counts the tuple in bs_numtuples.
+ */
+static void
+form_and_insert_tuple(BrinBuildState *state)
+{
+    Size        len;
+    BrinTuple  *ondisk = brin_form_tuple(state->bs_bdesc,
+                                         state->bs_currRangeStart,
+                                         state->bs_dtuple,
+                                         &len);
+
+    brin_doinsert(state->bs_irel,
+                  state->bs_pagesPerRange,
+                  state->bs_rmAccess,
+                  &state->bs_currentInsertBuf,
+                  state->bs_currRangeStart,
+                  ondisk,
+                  len);
+    state->bs_numtuples++;
+
+    /* the on-disk image was only needed for the insertion */
+    pfree(ondisk);
+}
+
+/*
+ * Given a deformed tuple (a) and an on-disk tuple (b), adjust the first one
+ * so that it's consistent with the summary values in both.
+ *
+ * b is deformed into a private memory context that is deleted before
+ * returning; values that must survive in "a" are copied with datumCopy
+ * after switching back to the caller's context.
+ *
+ * For opclasses using regular NULL handling, the null flags are merged here
+ * and the opclass union function is only called when both sides hold actual
+ * values.  For other opclasses the union function is always called.
+ */
+static void
+union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
+{
+    int         keyno;
+    BrinMemTuple *db;
+    MemoryContext cxt;
+    MemoryContext oldcxt;
+
+    /* Use our own memory context to avoid retail pfree */
+    cxt = AllocSetContextCreate(CurrentMemoryContext,
+                                "brin union",
+                                ALLOCSET_DEFAULT_SIZES);
+    oldcxt = MemoryContextSwitchTo(cxt);
+    db = brin_deform_tuple(bdesc, b, NULL);
+    MemoryContextSwitchTo(oldcxt);
+
+    for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
+    {
+        FmgrInfo   *unionFn;
+        BrinValues *col_a = &a->bt_columns[keyno];
+        BrinValues *col_b = &db->bt_columns[keyno];
+        BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
+
+        if (opcinfo->oi_regular_nulls)
+        {
+            /*
+             * Adjust "hasnulls".  B represents NULL values not only when its
+             * "hasnulls" flag is set but also when it is an all-nulls
+             * summary.  Looking at "hasnulls" alone would drop the NULLs of
+             * an all-nulls B when A holds values, and IS NULL scans would
+             * then wrongly skip the range.
+             *
+             * NOTE(review): a placeholder tuple nobody touched presumably
+             * also looks like an all-nulls summary here, in which case this
+             * over-reports NULLs.  That only costs extra heap page visits,
+             * never wrong results -- confirm against
+             * brin_form_placeholder_tuple.
+             */
+            if (col_b->bv_hasnulls || col_b->bv_allnulls)
+                col_a->bv_hasnulls = true;
+
+            /* If there are no values in B, there's nothing left to do. */
+            if (col_b->bv_allnulls)
+                continue;
+
+            /*
+             * Adjust "allnulls".  If A doesn't have values, just copy the
+             * values from B into A, and we're done.  We cannot run the
+             * operators in this case, because values in A might contain
+             * garbage.  Note we already established that B contains values.
+             */
+            if (col_a->bv_allnulls)
+            {
+                int         i;
+
+                col_a->bv_allnulls = false;
+
+                for (i = 0; i < opcinfo->oi_nstored; i++)
+                    col_a->bv_values[i] =
+                        datumCopy(col_b->bv_values[i],
+                                  opcinfo->oi_typcache[i]->typbyval,
+                                  opcinfo->oi_typcache[i]->typlen);
+
+                continue;
+            }
+        }
+
+        /* Both sides hold values: let the opclass merge B into A. */
+        unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1,
+                                    BRIN_PROCNUM_UNION);
+        FunctionCall3Coll(unionFn,
+                          bdesc->bd_index->rd_indcollation[keyno],
+                          PointerGetDatum(bdesc),
+                          PointerGetDatum(col_a),
+                          PointerGetDatum(col_b));
+    }
+
+    MemoryContextDelete(cxt);
+}
+
+/*
+ * brin_vacuum_scan
+ *      Visit every page of the index during VACUUM.
+ *
+ * The point of the full scan is to find uncatalogued index pages, i.e. those
+ * that might have been lost due to a crash after index extension and such.
+ */
+static void
+brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
+{
+    BlockNumber total = RelationGetNumberOfBlocks(idxrel);
+    BlockNumber cur = 0;
+
+    /*
+     * Walk the index in physical order, giving brin_page_cleanup a chance
+     * to tidy up each page.
+     */
+    while (cur < total)
+    {
+        Buffer      page_buf;
+
+        CHECK_FOR_INTERRUPTS();
+
+        page_buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, cur,
+                                      RBM_NORMAL, strategy);
+        brin_page_cleanup(idxrel, page_buf);
+        ReleaseBuffer(page_buf);
+
+        cur++;
+    }
+
+    /*
+     * Bring the upper pages of the index's FSM up to date as well.  This
+     * propagates the leaf-page FSM updates made by brin_page_cleanup, and
+     * also repairs any pre-existing damage or out-of-dateness.
+     */
+    FreeSpaceMapVacuum(idxrel);
+}
+
+/*
+ * Fold one heap tuple's index column values into the deformed summary tuple.
+ *
+ * Returns true if the summary was modified, which tells the caller that the
+ * updated tuple needs to be stored.
+ */
+static bool
+add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup,
+                    Datum *values, bool *nulls)
+{
+    bool        changed = false;
+    int         attidx;
+
+    /*
+     * Every column has to be visited: any value falling outside the current
+     * summary updates the deformed tuple, so the loop cannot stop at the
+     * first change.
+     */
+    for (attidx = 0; attidx < bdesc->bd_tupdesc->natts; attidx++)
+    {
+        BrinValues *col = &dtup->bt_columns[attidx];
+        FmgrInfo   *addFn;
+        Datum       res;
+
+        if (bdesc->bd_info[attidx]->oi_regular_nulls && nulls[attidx])
+        {
+            /*
+             * NULL value under regular NULL handling: only the first one
+             * seen changes anything, by setting the "hasnulls" flag.
+             */
+            if (!col->bv_hasnulls)
+            {
+                col->bv_hasnulls = true;
+                changed = true;
+            }
+
+            continue;
+        }
+
+        /* Let the opclass's add-value support function do the work. */
+        addFn = index_getprocinfo(idxRel, attidx + 1,
+                                  BRIN_PROCNUM_ADDVALUE);
+        res = FunctionCall4Coll(addFn,
+                                idxRel->rd_indcollation[attidx],
+                                PointerGetDatum(bdesc),
+                                PointerGetDatum(col),
+                                values[attidx],
+                                nulls[attidx]);
+
+        /* a true result means the summary changed */
+        if (DatumGetBool(res))
+            changed = true;
+    }
+
+    return changed;
+}
+
+/*
+ * Check a column summary against the scan keys that carry a NULL argument.
+ *
+ * Returns false as soon as one key rules the range out, true if none does.
+ */
+static bool
+check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)
+{
+    int         i;
+
+    for (i = 0; i < nnullkeys; i++)
+    {
+        ScanKey     sk = nullkeys[i];
+
+        Assert(sk->sk_attno == bval->bv_attno);
+
+        /* keys without the ISNULL flag are not ours to judge */
+        if ((sk->sk_flags & SK_ISNULL) == 0)
+            continue;
+
+        if (sk->sk_flags & SK_SEARCHNULL)
+        {
+            /* IS NULL: fails when the range holds no NULLs at all */
+            if (!(bval->bv_allnulls || bval->bv_hasnulls))
+                return false;
+            continue;
+        }
+
+        if (sk->sk_flags & SK_SEARCHNOTNULL)
+        {
+            /* IS NOT NULL: only an all-nulls range can be skipped */
+            if (bval->bv_allnulls)
+                return false;
+            continue;
+        }
+
+        /*
+         * A NULL argument that is neither IS NULL nor IS NOT NULL: all
+         * indexable operators are assumed to be strict, so such a key can
+         * never be satisfied.
+         */
+        return false;
+    }
+
+    return true;
+}
diff --git a/src/backend/access/brin/brin_bloom.c b/src/backend/access/brin/brin_bloom.c
new file mode 100644
index 0000000..2c8a20a
--- /dev/null
+++ b/src/backend/access/brin/brin_bloom.c
@@ -0,0 +1,809 @@
+/*
+ * brin_bloom.c
+ * Implementation of Bloom opclass for BRIN
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * A BRIN opclass summarizing page range into a bloom filter.
+ *
+ * Bloom filters allow efficient testing whether a given page range contains
+ * a particular value. Therefore, if we summarize each page range into a small
+ * bloom filter, we can easily (and cheaply) test whether it contains values
+ * we get later.
+ *
+ * The index only supports equality operators, similarly to hash indexes.
+ * Bloom indexes are however much smaller, and support only bitmap scans.
+ *
+ * Note: Don't confuse this with bloom indexes, implemented in a contrib
+ * module. That extension implements an entirely new AM, building a bloom
+ * filter on multiple columns in a single row. This opclass works with an
+ * existing AM (BRIN) and builds bloom filter on a column.
+ *
+ *
+ * values vs. hashes
+ * -----------------
+ *
+ * The original column values are not used directly, but are first hashed
+ * using the regular type-specific hash function, producing a uint32 hash.
+ * And this hash value is then added to the summary - i.e. it's hashed
+ * again and added to the bloom filter.
+ *
+ * This allows the code to treat all data types (byval/byref/...) the same
+ * way, with only minimal space requirements, because we're working with
+ * hashes and not the original values. Everything is uint32.
+ *
+ * Of course, this assumes the built-in hash function is reasonably good,
+ * without too many collisions etc. But that does seem to be the case, at
+ * least based on past experience. After all, the same hash functions are
+ * used for hash indexes, hash partitioning and so on.
+ *
+ *
+ * hashing scheme
+ * --------------
+ *
+ * Bloom filters require a number of independent hash functions. There are
+ * different schemes how to construct them - for example we might use
+ * hash_uint32_extended with random seeds, but that seems fairly expensive.
+ * We use a scheme requiring only two functions described in this paper:
+ *
+ * Less Hashing, Same Performance: Building a Better Bloom Filter
+ * Adam Kirsch, Michael Mitzenmacher, Harvard School of Engineering and
+ * Applied Sciences, Cambridge, Massachusetts [DOI 10.1002/rsa.20208]
+ *
+ * The two hash functions h1 and h2 are calculated using hard-coded seeds,
+ * and then combined using (h1 + i * h2) to generate the hash functions.
+ *
+ *
+ * sizing the bloom filter
+ * -----------------------
+ *
+ * Size of a bloom filter depends on the number of distinct values we will
+ * store in it, and the desired false positive rate. The higher the number
+ * of distinct values and/or the lower the false positive rate, the larger
+ * the bloom filter. On the other hand, we want to keep the index as small
+ * as possible - that's one of the basic advantages of BRIN indexes.
+ *
+ * Although the number of distinct elements (in a page range) depends on
+ * the data, we can consider it fixed. This simplifies the trade-off to
+ * just false positive rate vs. size.
+ *
+ * At the page range level, false positive rate is a probability the bloom
+ * filter matches a random value. For the whole index (with sufficiently
+ * many page ranges) it represents the fraction of the index ranges (and
+ * thus fraction of the table to be scanned) matching the random value.
+ *
+ * Furthermore, the size of the bloom filter is subject to implementation
+ * limits - it has to fit onto a single index page (8kB by default). As
+ * the bitmap is inherently random (when "full" about half of the bits are
+ * set to 1, randomly), compression can't help very much.
+ *
+ * To reduce the size of a filter (to fit to a page), we have to either
+ * accept higher false positive rate (undesirable), or reduce the number
+ * of distinct items to be stored in the filter. We can't alter the input
+ * data, of course, but we may make the BRIN page ranges smaller - instead
+ * of the default 128 pages (1MB) we may build index with 16-page ranges,
+ * or something like that. This should reduce the number of distinct values
+ * in the page range, making the filter smaller (with fixed false positive
+ * rate). Even for random data sets this should help, as the number of rows
+ * per heap page is limited (to ~290 with very narrow tables, likely ~20
+ * in practice).
+ *
+ * Of course, good sizing decisions depend on having the necessary data,
+ * i.e. number of distinct values in a page range (of a given size) and
+ * table size (to estimate cost change due to change in false positive
+ * rate due to having larger index vs. scanning larger indexes). We may
+ * not have that data - for example when building an index on empty table
+ * it's not really possible. And for some data we only have estimates for
+ * the whole table and we can only estimate per-range values (ndistinct).
+ *
+ * Another challenge is that while the bloom filter is per-column, it's
+ * the whole index tuple that has to fit into a page. And for multi-column
+ * indexes that may include pieces we have no control over (not necessarily
+ * bloom filters, the other columns may use other BRIN opclasses). So it's
+ * not entirely clear how to distribute the space between those columns.
+ *
+ * The current logic, implemented in brin_bloom_get_ndistinct, attempts to
+ * make some basic sizing decisions, based on the size of BRIN ranges, and
+ * the maximum number of rows per range.
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/brin/brin_bloom.c
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "access/brin.h"
+#include "access/brin_internal.h"
+#include "access/brin_page.h"
+#include "access/brin_tuple.h"
+#include "access/genam.h"
+#include "access/hash.h"
+#include "access/htup_details.h"
+#include "access/reloptions.h"
+#include "access/stratnum.h"
+#include "catalog/pg_amop.h"
+#include "catalog/pg_type.h"
+#include "port/pg_bitutils.h"
+#include "utils/builtins.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+
+/*
+ * The only strategy number brin_bloom_consistent accepts; any other value
+ * makes it raise "invalid strategy number".
+ */
+#define BloomEqualStrategyNumber 1
+
+/*
+ * Additional SQL level support functions. We only have one, which is
+ * used to calculate hash of the input value.
+ *
+ * Procedure numbers must not use values reserved for BRIN itself; see
+ * brin_internal.h.
+ */
+#define BLOOM_MAX_PROCNUMS 1 /* maximum support procs we need */
+#define PROCNUM_HASH 11 /* required */
+
+/*
+ * Subtract this from procnum to obtain index in BloomOpaque arrays
+ * (Must be equal to minimum of private procnums).
+ *
+ * With the single procedure defined above, PROCNUM_HASH maps to index 0.
+ */
+#define PROCNUM_BASE 11
+
+/*
+ * Storage type for BRIN's reloptions.
+ *
+ * The fields are read through BloomGetNDistinctPerRange and
+ * BloomGetFalsePositiveRate, which substitute the compiled-in defaults when
+ * the options pointer is NULL or the stored value is zero.
+ */
+typedef struct BloomOptions
+{
+ int32 vl_len_; /* varlena header (do not touch directly!) */
+ double nDistinctPerRange; /* number of distinct values per range */
+ double falsePositiveRate; /* false positive for bloom filter */
+} BloomOptions;
+
+/*
+ * The current min value (16) is somewhat arbitrary, but it's based
+ * on the fact that the filter header is ~20B alone, which is about
+ * the same as the filter bitmap for 16 distinct items with 1% false
+ * positive rate. So by allowing lower values we'd not gain much. In
+ * any case, the min should not be larger than MaxHeapTuplesPerPage
+ * (~290), which is the theoretical maximum for single-page ranges.
+ */
+#define BLOOM_MIN_NDISTINCT_PER_RANGE 16
+
+/*
+ * Used to determine number of distinct items, based on the number of rows
+ * in a page range. The 10% is somewhat similar to what estimate_num_groups
+ * does, so we use the same factor here.
+ *
+ * The value is negative because brin_bloom_get_ndistinct treats negative
+ * settings as a fraction of the maximum number of tuples in a page range.
+ */
+#define BLOOM_DEFAULT_NDISTINCT_PER_RANGE -0.1 /* 10% of values */
+
+/*
+ * Allowed range and default value for the false positive rate. The exact
+ * values are somewhat arbitrary, but were chosen considering the various
+ * parameters (size of filter vs. page size, etc.).
+ *
+ * The lower the false-positive rate, the more accurate the filter is, but
+ * it also gets larger - at some point this eliminates the main advantage
+ * of BRIN indexes, which is the tiny size. At 0.01% the index is about
+ * 10% of the table (assuming 290 distinct values per 8kB page).
+ *
+ * On the other hand, as the false-positive rate increases, larger part of
+ * the table has to be scanned due to mismatches - at 25% we're probably
+ * close to sequential scan being cheaper.
+ */
+#define BLOOM_MIN_FALSE_POSITIVE_RATE 0.0001 /* 0.01% fp rate */
+#define BLOOM_MAX_FALSE_POSITIVE_RATE 0.25 /* 25% fp rate */
+#define BLOOM_DEFAULT_FALSE_POSITIVE_RATE 0.01 /* 1% fp rate */
+
+/*
+ * Accessors for the opclass options. Both fall back to the default when
+ * opts is NULL or when the stored value is zero.
+ */
+#define BloomGetNDistinctPerRange(opts) \
+ ((opts) && (((BloomOptions *) (opts))->nDistinctPerRange != 0) ? \
+ (((BloomOptions *) (opts))->nDistinctPerRange) : \
+ BLOOM_DEFAULT_NDISTINCT_PER_RANGE)
+
+#define BloomGetFalsePositiveRate(opts) \
+ ((opts) && (((BloomOptions *) (opts))->falsePositiveRate != 0.0) ? \
+ (((BloomOptions *) (opts))->falsePositiveRate) : \
+ BLOOM_DEFAULT_FALSE_POSITIVE_RATE)
+
+/*
+ * An estimate of the largest bloom filter we can fit onto a page. This is
+ * not a perfect guarantee, for a couple of reasons. For example, the row may
+ * be larger because the index has multiple columns.
+ *
+ * Computed as the block size minus the aligned page header plus one line
+ * pointer, the aligned BRIN special space and the BRIN tuple header.
+ */
+#define BloomMaxFilterSize \
+ MAXALIGN_DOWN(BLCKSZ - \
+ (MAXALIGN(SizeOfPageHeaderData + \
+ sizeof(ItemIdData)) + \
+ MAXALIGN(sizeof(BrinSpecialSpace)) + \
+ SizeOfBrinTuple))
+
+/*
+ * Seeds used to calculate two hash functions h1 and h2, which are then used
+ * to generate k hashes using the (h1 + i * h2) scheme.
+ */
+#define BLOOM_SEED_1 0x71d924af
+#define BLOOM_SEED_2 0xba48b314
+
+/*
+ * Bloom Filter
+ *
+ * Represents a bloom filter, built on hashes of the indexed values. That is,
+ * we compute a uint32 hash of the value, and then store this hash into the
+ * bloom filter (and compute additional hashes on it).
+ *
+ * XXX We could implement "sparse" bloom filters, keeping only the bytes that
+ * are not entirely 0. But while indexes don't support TOAST, the varlena can
+ * still be compressed. So this seems unnecessary, because the compression
+ * should do the same job.
+ *
+ * XXX We can also watch the number of bits set in the bloom filter, and then
+ * stop using it (and not store the bitmap, to save space) when the false
+ * positive rate gets too high. But even if the false positive rate exceeds the
+ * desired value, it still can eliminate some page ranges.
+ */
+typedef struct BloomFilter
+{
+ /* varlena header (do not touch directly!) */
+ int32 vl_len_;
+
+ /* space for various flags (unused for now) */
+ uint16 flags;
+
+ /*
+ * Filter parameters and statistics. bloom_init rounds nbits up to a whole
+ * number of bytes; bloom_add_value increments nbits_set for every bit it
+ * flips from 0 to 1.
+ */
+ uint8 nhashes; /* number of hash functions */
+ uint32 nbits; /* number of bits in the bitmap (size) */
+ uint32 nbits_set; /* number of bits set to 1 */
+
+ /* data of the bloom filter (nbits / 8 bytes) */
+ char data[FLEXIBLE_ARRAY_MEMBER];
+
+} BloomFilter;
+
+
+/*
+ * bloom_init
+ *		Initialize the Bloom Filter, allocate all the memory.
+ *
+ * The filter is initialized with optimal size for ndistinct expected values
+ * and the requested false positive rate. The filter is stored as varlena.
+ *
+ * ndistinct must be positive, and false_positive_rate must lie within
+ * [BLOOM_MIN_FALSE_POSITIVE_RATE, BLOOM_MAX_FALSE_POSITIVE_RATE]; both ends
+ * are inclusive because the false_positive_rate reloption is registered with
+ * exactly these constants as its bounds.
+ *
+ * Returns a zeroed, palloc'd filter with nhashes and nbits filled in. Raises
+ * an ERROR if the bitmap alone would exceed BloomMaxFilterSize.
+ */
+static BloomFilter *
+bloom_init(int ndistinct, double false_positive_rate)
+{
+    Size        len;
+    BloomFilter *filter;
+
+    int         nbits;      /* size of filter / number of bits */
+    int         nbytes;     /* size of filter / number of bytes */
+
+    double      k;          /* number of hash functions */
+
+    Assert(ndistinct > 0);
+
+    /*
+     * The upper bound has to be inclusive: the reloption uses the same
+     * constant as its maximum, so a rate equal to it is a legal setting.
+     */
+    Assert((false_positive_rate >= BLOOM_MIN_FALSE_POSITIVE_RATE) &&
+           (false_positive_rate <= BLOOM_MAX_FALSE_POSITIVE_RATE));
+
+    /* sizing bloom filter: -(n * ln(p)) / (ln(2))^2 */
+    nbits = ceil(-(ndistinct * log(false_positive_rate)) / pow(log(2.0), 2));
+
+    /* round m to whole bytes */
+    nbytes = ((nbits + 7) / 8);
+    nbits = nbytes * 8;
+
+    /*
+     * Reject filters that are obviously too large to store on a page.
+     *
+     * Initially the bloom filter is just zeroes and so very compressible, but
+     * as we add values it gets more and more random, and so less and less
+     * compressible. So initially everything fits on the page, but we might
+     * get surprising failures later - we want to prevent that, so we reject
+     * bloom filter that are obviously too large.
+     *
+     * XXX It's not uncommon to oversize the bloom filter a bit, to defend
+     * against unexpected data anomalies (parts of table with more distinct
+     * values per range etc.). But we still need to make sure even the
+     * oversized filter fits on page, if such need arises.
+     *
+     * XXX This check is not perfect, because the index may have multiple
+     * filters that are small individually, but too large when combined.
+     */
+    if (nbytes > BloomMaxFilterSize)
+        elog(ERROR, "the bloom filter is too large (%d > %zu)", nbytes,
+             BloomMaxFilterSize);
+
+    /*
+     * round(log(2.0) * m / ndistinct), but assume round() may not be
+     * available on Windows
+     */
+    k = log(2.0) * nbits / ndistinct;
+    k = (k - floor(k) >= 0.5) ? ceil(k) : floor(k);
+
+    /*
+     * We allocate the whole filter. Most of it is going to be 0 bits, so the
+     * varlena is easy to compress.
+     */
+    len = offsetof(BloomFilter, data) + nbytes;
+
+    filter = (BloomFilter *) palloc0(len);
+
+    filter->flags = 0;
+    filter->nhashes = (int) k;
+    filter->nbits = nbits;
+
+    SET_VARSIZE(filter, len);
+
+    return filter;
+}
+
+
+/*
+ * bloom_add_value
+ *		Insert an already-hashed value into the bloom filter.
+ *
+ * Two base hashes are derived from 'value' using the fixed seeds, and the
+ * probed bit positions are (h1 + i * h2) mod nbits for i = 0 .. nhashes - 1.
+ * Each bit that changes from 0 to 1 bumps nbits_set and, if 'updated' is not
+ * NULL, sets *updated to true (it is never reset to false here).
+ *
+ * Returns the filter pointer it was given.
+ */
+static BloomFilter *
+bloom_add_value(BloomFilter *filter, uint32 value, bool *updated)
+{
+    uint64      base;
+    uint64      step;
+    int         probe;
+
+    /* the two seeded hashes everything else is built from */
+    base = hash_bytes_uint32_extended(value, BLOOM_SEED_1) % filter->nbits;
+    step = hash_bytes_uint32_extended(value, BLOOM_SEED_2) % filter->nbits;
+
+    for (probe = 0; probe < filter->nhashes; probe++)
+    {
+        /* h1 + i * h2, reduced to a bit position */
+        uint32      pos = (base + probe * step) % filter->nbits;
+        char       *cell = &filter->data[pos / 8];
+        int         mask = 0x01 << (pos % 8);
+
+        /* nothing to do for bits that are already set */
+        if (*cell & mask)
+            continue;
+
+        *cell |= mask;
+        filter->nbits_set++;
+        if (updated)
+            *updated = true;
+    }
+
+    return filter;
+}
+
+
+/*
+ * bloom_contains_value
+ *		Probe the bloom filter for an already-hashed value.
+ *
+ * Uses the same (h1 + i * h2) mod nbits positions as bloom_add_value.
+ * Returns false as soon as one probed bit is clear, and true when all
+ * nhashes probed bits are set.
+ */
+static bool
+bloom_contains_value(BloomFilter *filter, uint32 value)
+{
+    uint64      base;
+    uint64      step;
+    int         probe;
+
+    /* the two seeded hashes everything else is built from */
+    base = hash_bytes_uint32_extended(value, BLOOM_SEED_1) % filter->nbits;
+    step = hash_bytes_uint32_extended(value, BLOOM_SEED_2) % filter->nbits;
+
+    for (probe = 0; probe < filter->nhashes; probe++)
+    {
+        /* h1 + i * h2, reduced to a bit position */
+        uint32      pos = (base + probe * step) % filter->nbits;
+        int         mask = 0x01 << (pos % 8);
+
+        /* a single clear bit proves the value was never added */
+        if ((filter->data[pos / 8] & mask) == 0)
+            return false;
+    }
+
+    /* every probed bit is set */
+    return true;
+}
+
+/*
+ * Per-column cache of the opclass support procedures, stored in the
+ * oi_opaque field of BrinOpcInfo. Both arrays are indexed by
+ * (procnum - PROCNUM_BASE); see bloom_get_procinfo.
+ */
+typedef struct BloomOpaque
+{
+ /*
+ * XXX At this point we only need a single proc (to compute the hash), but
+ * let's keep the array just like inclusion and minmax opclasses, for
+ * consistency. We may need additional procs in the future.
+ */
+ FmgrInfo extra_procinfos[BLOOM_MAX_PROCNUMS];
+ /* set once a lookup found no such procedure, so it is not repeated */
+ bool extra_proc_missing[BLOOM_MAX_PROCNUMS];
+} BloomOpaque;
+
+static FmgrInfo *bloom_get_procinfo(BrinDesc *bdesc, uint16 attno,
+ uint16 procnum);
+
+
+
+
+/*
+ * brin_bloom_opcinfo
+ *		Build the BrinOpcInfo describing a bloom-summarized column.
+ *
+ * A single palloc0'd chunk holds the BrinOpcInfo (with one stored column)
+ * followed by the BloomOpaque. Because the chunk is zeroed, the cached
+ * procinfos in the opaque area start with fn_oid == InvalidOid and are
+ * filled in lazily by bloom_get_procinfo.
+ *
+ * The one stored column has type PG_BRIN_BLOOM_SUMMARYOID.
+ */
+Datum
+brin_bloom_opcinfo(PG_FUNCTION_ARGS)
+{
+    BrinOpcInfo *info;
+
+    info = palloc0(MAXALIGN(SizeofBrinOpcInfo(1)) + sizeof(BloomOpaque));
+
+    /* one stored value, NULLs handled by the generic BRIN code */
+    info->oi_nstored = 1;
+    info->oi_regular_nulls = true;
+    info->oi_typcache[0] = lookup_type_cache(PG_BRIN_BLOOM_SUMMARYOID, 0);
+
+    /* the opaque area starts at the first aligned address past the header */
+    info->oi_opaque = (BloomOpaque *)
+        MAXALIGN((char *) info + SizeofBrinOpcInfo(1));
+
+    PG_RETURN_POINTER(info);
+}
+
+/*
+ * brin_bloom_get_ndistinct
+ *		Work out how many distinct values the bloom filter is sized for.
+ *
+ * The starting point is the n_distinct_per_range option (or its default).
+ * A negative setting is a fraction of the maximum number of tuples a page
+ * range can hold, computed as MaxHeapTuplesPerPage * pagesPerRange (likely
+ * a significant over-estimate). The result is then raised to at least
+ * BLOOM_MIN_NDISTINCT_PER_RANGE and capped at that maximum tuple count.
+ *
+ * XXX We can only do this when the pagesPerRange value was supplied.
+ * If it wasn't, it has to be a read-only access to the index, in which
+ * case we don't really care. But perhaps we should fall-back to the
+ * default pagesPerRange value?
+ *
+ * XXX We might also fetch info about ndistinct estimate for the column,
+ * and compute the expected number of distinct values in a range. But
+ * that may be tricky due to data being sorted in various ways, so it
+ * seems better to rely on the upper estimate.
+ *
+ * XXX We might also calculate a better estimate of rows per BRIN range,
+ * instead of using MaxHeapTuplesPerPage (which probably produces values
+ * much higher than reality).
+ */
+static int
+brin_bloom_get_ndistinct(BrinDesc *bdesc, BloomOptions *opts)
+{
+    BlockNumber rangePages = BrinGetPagesPerRange(bdesc->bd_index);
+    double      n = BloomGetNDistinctPerRange(opts);
+    double      limit;
+
+    Assert(BlockNumberIsValid(rangePages));
+
+    /* upper bound on the number of tuples in one page range */
+    limit = MaxHeapTuplesPerPage * rangePages;
+
+    /* like n_distinct, negative means "relative to the tuple count" */
+    if (n < 0)
+        n = (-n) * limit;
+
+    /*
+     * Keep the filter from being unreasonably small, but never size it for
+     * more values than the range can possibly contain.
+     */
+    n = Min(Max(n, BLOOM_MIN_NDISTINCT_PER_RANGE), limit);
+
+    return (int) n;
+}
+
+/*
+ * Examine the given index tuple (which contains partial status of a certain
+ * page range) by comparing it to the given value that comes from another heap
+ * tuple. If the new value is outside the bloom filter specified by the
+ * existing tuple values, update the index tuple and return true. Otherwise,
+ * return false and do not modify in this case.
+ *
+ * Arguments: 0 = BrinDesc, 1 = BrinValues of the column, 2 = new value,
+ * 3 = isnull flag (must be false; NULLs never reach this function).
+ *
+ * Raises an ERROR if the opclass has no hash support procedure.
+ */
+Datum
+brin_bloom_add_value(PG_FUNCTION_ARGS)
+{
+    BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+    BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
+    Datum       newval = PG_GETARG_DATUM(2);
+    bool        isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_BOOL(3);
+    BloomOptions *opts = (BloomOptions *) PG_GET_OPCLASS_OPTIONS();
+    Oid         colloid = PG_GET_COLLATION();
+    FmgrInfo   *hashFn;
+    uint32      hashValue;
+    bool        updated = false;
+    AttrNumber  attno;
+    BloomFilter *filter;
+
+    Assert(!isnull);
+
+    attno = column->bv_attno;
+
+    /*
+     * Look up the hash function before touching the summary, so that a
+     * broken opclass is reported without leaving the column half-updated.
+     * bloom_get_procinfo returns NULL for a missing procedure.
+     */
+    hashFn = bloom_get_procinfo(bdesc, attno, PROCNUM_HASH);
+    if (hashFn == NULL)
+        elog(ERROR, "missing support function %d for attribute %d of index \"%s\"",
+             PROCNUM_HASH, attno, RelationGetRelationName(bdesc->bd_index));
+
+    /*
+     * If this is the first non-null value, we need to initialize the bloom
+     * filter. Otherwise just extract the existing bloom filter from
+     * BrinValues.
+     */
+    if (column->bv_allnulls)
+    {
+        filter = bloom_init(brin_bloom_get_ndistinct(bdesc, opts),
+                            BloomGetFalsePositiveRate(opts));
+        column->bv_values[0] = PointerGetDatum(filter);
+        column->bv_allnulls = false;
+        updated = true;
+    }
+    else
+        filter = (BloomFilter *) PG_DETOAST_DATUM(column->bv_values[0]);
+
+    /*
+     * Compute the hash of the new value, using the supplied hash function,
+     * and then add the hash value to the bloom filter.
+     */
+    hashValue = DatumGetUInt32(FunctionCall1Coll(hashFn, colloid, newval));
+
+    filter = bloom_add_value(filter, hashValue, &updated);
+
+    /* the filter may be a detoasted copy, so always store the pointer back */
+    column->bv_values[0] = PointerGetDatum(filter);
+
+    PG_RETURN_BOOL(updated);
+}
+
+/*
+ * Given an index tuple corresponding to a certain page range and a scan key,
+ * return whether the scan key is consistent with the index tuple's bloom
+ * filter. Return true if so, false otherwise.
+ *
+ * Arguments: 0 = BrinDesc, 1 = BrinValues of the column, 2 = array of scan
+ * keys, 3 = number of scan keys. All keys must match for the range to be
+ * considered consistent.
+ *
+ * Raises an ERROR for a strategy other than equality, or if the opclass has
+ * no hash support procedure.
+ */
+Datum
+brin_bloom_consistent(PG_FUNCTION_ARGS)
+{
+    BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+    BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
+    ScanKey    *keys = (ScanKey *) PG_GETARG_POINTER(2);
+    int         nkeys = PG_GETARG_INT32(3);
+    Oid         colloid = PG_GET_COLLATION();
+    AttrNumber  attno;
+    Datum       value;
+    bool        matches = true;
+    FmgrInfo   *finfo;
+    uint32      hashValue;
+    BloomFilter *filter;
+    int         keyno;
+
+    filter = (BloomFilter *) PG_DETOAST_DATUM(column->bv_values[0]);
+
+    Assert(filter);
+
+    for (keyno = 0; keyno < nkeys; keyno++)
+    {
+        ScanKey     key = keys[keyno];
+
+        /* NULL keys are handled and filtered-out in bringetbitmap */
+        Assert(!(key->sk_flags & SK_ISNULL));
+
+        attno = key->sk_attno;
+        value = key->sk_argument;
+
+        switch (key->sk_strategy)
+        {
+            case BloomEqualStrategyNumber:
+
+                /*
+                 * In the equality case (WHERE col = someval), the page
+                 * range has to be returned if the bloom filter seems to
+                 * contain the hash of the scan key value.
+                 */
+                finfo = bloom_get_procinfo(bdesc, attno, PROCNUM_HASH);
+                if (finfo == NULL)
+                    elog(ERROR, "missing support function %d for attribute %d of index \"%s\"",
+                         PROCNUM_HASH, attno,
+                         RelationGetRelationName(bdesc->bd_index));
+
+                hashValue = DatumGetUInt32(FunctionCall1Coll(finfo, colloid, value));
+                if (!bloom_contains_value(filter, hashValue))
+                    matches = false;
+
+                break;
+            default:
+                /* shouldn't happen */
+                elog(ERROR, "invalid strategy number %d", key->sk_strategy);
+                matches = false;    /* keep compiler quiet */
+                break;
+        }
+
+        /* one failed key is enough to rule the range out */
+        if (!matches)
+            break;
+    }
+
+    PG_RETURN_BOOL(matches);
+}
+
+/*
+ * Given two BrinValues, update the first of them as a union of the summary
+ * values contained in both. The second one is untouched.
+ *
+ * Arguments: 1 = BrinValues to be updated (a), 2 = BrinValues merged into
+ * it (b). Argument 0 is not used here.
+ *
+ * Besides OR-ing the bitmaps, this recomputes nbits_set of the merged filter
+ * and makes sure the merged filter is what col_a ends up pointing to, even
+ * if detoasting produced a separate copy.
+ *
+ * XXX We assume the bloom filters have the same parameters for now. In the
+ * future we should have 'can union' function, to decide if we can combine
+ * two particular bloom filters.
+ */
+Datum
+brin_bloom_union(PG_FUNCTION_ARGS)
+{
+    int         i;
+    int         nbytes;
+    BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1);
+    BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2);
+    BloomFilter *filter_a;
+    BloomFilter *filter_b;
+
+    Assert(col_a->bv_attno == col_b->bv_attno);
+    Assert(!col_a->bv_allnulls && !col_b->bv_allnulls);
+
+    filter_a = (BloomFilter *) PG_DETOAST_DATUM(col_a->bv_values[0]);
+    filter_b = (BloomFilter *) PG_DETOAST_DATUM(col_b->bv_values[0]);
+
+    /* make sure the filters use the same parameters */
+    Assert(filter_a && filter_b);
+    Assert(filter_a->nbits == filter_b->nbits);
+    Assert(filter_a->nhashes == filter_b->nhashes);
+    Assert((filter_a->nbits > 0) && (filter_a->nbits % 8 == 0));
+
+    nbytes = (filter_a->nbits) / 8;
+
+    /* simply OR the bitmaps */
+    for (i = 0; i < nbytes; i++)
+        filter_a->data[i] |= filter_b->data[i];
+
+    /* the old counter is stale now, recount the bits in the merged bitmap */
+    filter_a->nbits_set = (uint32) pg_popcount(filter_a->data, nbytes);
+
+    /*
+     * If detoasting gave us a copy of filter_a, the merge above happened in
+     * that copy; point the summary at it so the result is not lost. The
+     * previous datum is left to be released with its memory context.
+     */
+    if (PointerGetDatum(filter_a) != col_a->bv_values[0])
+        col_a->bv_values[0] = PointerGetDatum(filter_a);
+
+    /* a detoasted copy of filter_b is ours to release */
+    if (PointerGetDatum(filter_b) != col_b->bv_values[0])
+        pfree(filter_b);
+
+    PG_RETURN_VOID();
+}
+
+/*
+ * Cache and return bloom opclass support procedure
+ *
+ * Looks up support procedure 'procnum' for index column 'attno' (1-based)
+ * and keeps a copy in the column's BloomOpaque, so repeated calls avoid the
+ * lookup. Returns a pointer to the cached FmgrInfo, or NULL if the procedure
+ * does not exist; a failed lookup is remembered and not retried.
+ */
+static FmgrInfo *
+bloom_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum)
+{
+    uint16      slot = procnum - PROCNUM_BASE;
+    BloomOpaque *cache;
+    FmgrInfo   *cached;
+
+    cache = (BloomOpaque *) bdesc->bd_info[attno - 1]->oi_opaque;
+
+    /* an earlier lookup already came back empty */
+    if (cache->extra_proc_missing[slot])
+        return NULL;
+
+    cached = &cache->extra_procinfos[slot];
+
+    /* already resolved on a previous call */
+    if (cached->fn_oid != InvalidOid)
+        return cached;
+
+    if (!RegProcedureIsValid(index_getprocid(bdesc->bd_index, attno,
+                                             procnum)))
+    {
+        cache->extra_proc_missing[slot] = true;
+        return NULL;
+    }
+
+    /* first use: copy the procinfo into the descriptor's memory context */
+    fmgr_info_copy(cached,
+                   index_getprocinfo(bdesc->bd_index, attno, procnum),
+                   bdesc->bd_context);
+
+    return cached;
+}
+
+/*
+ * brin_bloom_options
+ *		Register the opclass options of the bloom opclass.
+ *
+ * Declares two real-valued options backed by the BloomOptions struct:
+ * n_distinct_per_range and false_positive_rate.
+ */
+Datum
+brin_bloom_options(PG_FUNCTION_ARGS)
+{
+ local_relopts *relopts = (local_relopts *) PG_GETARG_POINTER(0);
+
+ init_local_reloptions(relopts, sizeof(BloomOptions));
+
+ /* negative values down to -1.0 are accepted; see brin_bloom_get_ndistinct */
+ add_local_real_reloption(relopts, "n_distinct_per_range",
+ "number of distinct items expected in a BRIN page range",
+ BLOOM_DEFAULT_NDISTINCT_PER_RANGE,
+ -1.0, INT_MAX, offsetof(BloomOptions, nDistinctPerRange));
+
+ /* bounded by BLOOM_MIN_FALSE_POSITIVE_RATE / BLOOM_MAX_FALSE_POSITIVE_RATE */
+ add_local_real_reloption(relopts, "false_positive_rate",
+ "desired false-positive rate for the bloom filters",
+ BLOOM_DEFAULT_FALSE_POSITIVE_RATE,
+ BLOOM_MIN_FALSE_POSITIVE_RATE,
+ BLOOM_MAX_FALSE_POSITIVE_RATE,
+ offsetof(BloomOptions, falsePositiveRate));
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * brin_bloom_summary_in
+ * - input routine for type brin_bloom_summary.
+ *
+ * brin_bloom_summary is only used internally to represent summaries
+ * in BRIN bloom indexes, so it has no operations of its own, and we
+ * disallow input too.
+ *
+ * Always raises ERRCODE_FEATURE_NOT_SUPPORTED; it never returns normally.
+ */
+Datum
+brin_bloom_summary_in(PG_FUNCTION_ARGS)
+{
+ /*
+ * brin_bloom_summary stores the data in binary form and parsing text
+ * input is not needed, so disallow this.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot accept a value of type %s", "pg_brin_bloom_summary")));
+
+ PG_RETURN_VOID(); /* keep compiler quiet */
+}
+
+
+/*
+ * brin_bloom_summary_out
+ * - output routine for type brin_bloom_summary.
+ *
+ * BRIN bloom summaries are serialized into a bytea value, but we want
+ * to output something nicer humans can understand.
+ *
+ * The output lists only the filter parameters (nhashes, nbits, nbits_set),
+ * not the bitmap itself. Returns a palloc'd C string.
+ */
+Datum
+brin_bloom_summary_out(PG_FUNCTION_ARGS)
+{
+    BloomFilter *filter;
+    StringInfoData str;
+
+    /*
+     * Detoast the data to get value with a full 4B header. PG_DETOAST_DATUM
+     * expects a Datum, so fetch the argument as a Datum rather than as an
+     * already-unpacked bytea pointer.
+     */
+    filter = (BloomFilter *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
+
+    initStringInfo(&str);
+    appendStringInfoChar(&str, '{');
+
+    appendStringInfo(&str, "mode: hashed nhashes: %u nbits: %u nbits_set: %u",
+                     filter->nhashes, filter->nbits, filter->nbits_set);
+
+    appendStringInfoChar(&str, '}');
+
+    PG_RETURN_CSTRING(str.data);
+}
+
+/*
+ * brin_bloom_summary_recv
+ * - binary input routine for type brin_bloom_summary.
+ *
+ * Binary input is rejected just like text input: this always raises
+ * ERRCODE_FEATURE_NOT_SUPPORTED and never returns normally.
+ */
+Datum
+brin_bloom_summary_recv(PG_FUNCTION_ARGS)
+{
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot accept a value of type %s", "pg_brin_bloom_summary")));
+
+ PG_RETURN_VOID(); /* keep compiler quiet */
+}
+
+/*
+ * brin_bloom_summary_send
+ * - binary output routine for type brin_bloom_summary.
+ *
+ * BRIN bloom summaries are serialized in a bytea value (although the
+ * type is named differently), so let's just send that.
+ *
+ * Passes the call info straight to byteasend and returns its result.
+ */
+Datum
+brin_bloom_summary_send(PG_FUNCTION_ARGS)
+{
+ return byteasend(fcinfo);
+}
diff --git a/src/backend/access/brin/brin_inclusion.c b/src/backend/access/brin/brin_inclusion.c
new file mode 100644
index 0000000..0b384c0
--- /dev/null
+++ b/src/backend/access/brin/brin_inclusion.c
@@ -0,0 +1,657 @@
+/*
+ * brin_inclusion.c
+ * Implementation of inclusion opclasses for BRIN
+ *
+ * This module provides framework BRIN support functions for the "inclusion"
+ * operator classes. A few SQL-level support functions are also required for
+ * each opclass.
+ *
+ * The "inclusion" BRIN strategy is useful for types that support R-Tree
+ * operations. This implementation is a straight mapping of those operations
+ * to the block-range nature of BRIN, with two exceptions: (a) we explicitly
+ * support "empty" elements: at least with range types, we need to consider
+ * emptiness separately from regular R-Tree strategies; and (b) we need to
+ * consider "unmergeable" elements, that is, a set of elements for whose union
+ * no representation exists. The only case where that happens as of this
+ * writing is the INET type, where IPv6 values cannot be merged with IPv4
+ * values.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/brin/brin_inclusion.c
+ */
+#include "postgres.h"
+
+#include "access/brin_internal.h"
+#include "access/brin_tuple.h"
+#include "access/genam.h"
+#include "access/skey.h"
+#include "catalog/pg_amop.h"
+#include "catalog/pg_type.h"
+#include "utils/builtins.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+
+
+/*
+ * Additional SQL level support functions
+ *
+ * Procedure numbers must not use values reserved for BRIN itself; see
+ * brin_internal.h.
+ *
+ * INCLUSION_MAX_PROCNUMS is the length of the per-procedure cache arrays
+ * in InclusionOpaque.
+ */
+#define INCLUSION_MAX_PROCNUMS 4 /* maximum support procs we need */
+#define PROCNUM_MERGE 11 /* required: merges a value into the union */
+#define PROCNUM_MERGEABLE 12 /* optional: can two values be merged? */
+#define PROCNUM_CONTAINS 13 /* optional: does the union contain a value? */
+#define PROCNUM_EMPTY 14 /* optional: is a value empty? */
+
+
+/*
+ * Subtract this from procnum to obtain the index into the InclusionOpaque
+ * arrays. It must be equal to the smallest of the private procnums above,
+ * which is PROCNUM_MERGE.
+ */
+#define PROCNUM_BASE 11
+
+/*-
+ * The values stored in the bv_values arrays correspond to:
+ *
+ * INCLUSION_UNION
+ * the union of the values in the block range
+ * INCLUSION_UNMERGEABLE
+ * whether the values in the block range cannot be merged
+ * (e.g. an IPv6 address amidst IPv4 addresses)
+ * INCLUSION_CONTAINS_EMPTY
+ * whether an empty value is present in any tuple
+ * in the block range
+ *
+ * The two flags are stored as boolean Datums.
+ */
+#define INCLUSION_UNION 0
+#define INCLUSION_UNMERGEABLE 1
+#define INCLUSION_CONTAINS_EMPTY 2
+
+
+typedef struct InclusionOpaque
+{
+ FmgrInfo extra_procinfos[INCLUSION_MAX_PROCNUMS]; /* support procs, indexed by procnum - PROCNUM_BASE */
+ bool extra_proc_missing[INCLUSION_MAX_PROCNUMS]; /* true once a lookup found no such proc */
+ Oid cached_subtype; /* subtype the strategy_procinfos entries were looked up for */
+ FmgrInfo strategy_procinfos[RTMaxStrategyNumber]; /* operator procs, indexed by strategynum - 1 */
+} InclusionOpaque;
+
+static FmgrInfo *inclusion_get_procinfo(BrinDesc *bdesc, uint16 attno,
+ uint16 procnum); /* returns NULL if the proc does not exist */
+static FmgrInfo *inclusion_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno,
+ Oid subtype, uint16 strategynum); /* raises an error if the operator is missing */
+
+
+/*
+ * brin_inclusion_opcinfo
+ *      OpcInfo support function for the BRIN inclusion opclasses.
+ *
+ * Takes the type OID as its only argument and returns a BrinOpcInfo that
+ * declares three stored values: the union (of the given type) plus the
+ * "unmergeable" and "contains empty" flags (both boolean).  An
+ * InclusionOpaque is placed in the same allocation, after the MAXALIGN'ed
+ * BrinOpcInfo.
+ */
+Datum
+brin_inclusion_opcinfo(PG_FUNCTION_ARGS)
+{
+    Oid         typoid = PG_GETARG_OID(0);
+    TypeCacheEntry *flag_typcache = lookup_type_cache(BOOLOID, 0);
+    BrinOpcInfo *opcinfo;
+    char       *opaque_start;
+
+    /*
+     * Everything in the opaque struct is filled in lazily.  Zeroing the
+     * allocation leaves every fn_oid in both procinfo arrays at InvalidOid
+     * (meaning "not looked up yet") and every extra_proc_missing flag false.
+     * strategy_procinfos entries are invalidated again, by resetting
+     * fn_oid, whenever cached_subtype changes.  extra_procinfos entries are
+     * never invalidated; when a lookup fails (which is expected for the
+     * optional procedures) extra_proc_missing is set so that the lookup is
+     * not repeated.
+     */
+    opcinfo = palloc0(MAXALIGN(SizeofBrinOpcInfo(3)) + sizeof(InclusionOpaque));
+    opaque_start = (char *) opcinfo + SizeofBrinOpcInfo(3);
+
+    opcinfo->oi_nstored = 3;
+    opcinfo->oi_regular_nulls = true;
+    opcinfo->oi_opaque = (InclusionOpaque *) MAXALIGN(opaque_start);
+
+    /* slot for the union itself */
+    opcinfo->oi_typcache[INCLUSION_UNION] = lookup_type_cache(typoid, 0);
+
+    /* flag: range includes elements that are not mergeable */
+    opcinfo->oi_typcache[INCLUSION_UNMERGEABLE] = flag_typcache;
+
+    /* flag: range includes the empty element */
+    opcinfo->oi_typcache[INCLUSION_CONTAINS_EMPTY] = flag_typcache;
+
+    PG_RETURN_POINTER(opcinfo);
+}
+
+/*
+ * BRIN inclusion add value function
+ *
+ * Fold one non-null heap value into the summary of a page range.
+ *
+ * Arguments: the BrinDesc, the BrinValues of the column being summarized,
+ * the new value, and its null flag (which must be false).
+ *
+ * Returns true if the summary was modified (first value stored, "contains
+ * empty" or "unmergeable" flag newly set, or union replaced by the merge
+ * result) and false if it was left untouched.
+ */
+Datum
+brin_inclusion_add_value(PG_FUNCTION_ARGS)
+{
+    BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+    BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
+    Datum       newval = PG_GETARG_DATUM(2);
+    bool        isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_BOOL(3);
+    Oid         colloid = PG_GET_COLLATION();
+    AttrNumber  attno;
+    Form_pg_attribute attr;
+    bool        first_value = false;
+    FmgrInfo   *proc;
+    Datum       old_union;
+    Datum       merged;
+
+    Assert(!isnull);
+
+    attno = column->bv_attno;
+    attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1);
+
+    /*
+     * An all-nulls summary has no union yet: seed it with a copy of the new
+     * value and clear both flags.  We are not quite done, because the value
+     * still has to be tested for emptiness below.
+     */
+    if (column->bv_allnulls)
+    {
+        column->bv_values[INCLUSION_UNION] =
+            datumCopy(newval, attr->attbyval, attr->attlen);
+        column->bv_values[INCLUSION_UNMERGEABLE] = BoolGetDatum(false);
+        column->bv_values[INCLUSION_CONTAINS_EMPTY] = BoolGetDatum(false);
+        column->bv_allnulls = false;
+        first_value = true;
+    }
+
+    /* A range already marked unmergeable needs no further processing. */
+    if (DatumGetBool(column->bv_values[INCLUSION_UNMERGEABLE]))
+        PG_RETURN_BOOL(false);
+
+    /*
+     * If the opclass has a notion of empty values and the new value is one,
+     * all that is left to do is to raise the "contains empty" flag; the
+     * summary only changes if the flag was not already set.
+     */
+    proc = inclusion_get_procinfo(bdesc, attno, PROCNUM_EMPTY);
+    if (proc != NULL &&
+        DatumGetBool(FunctionCall1Coll(proc, colloid, newval)))
+    {
+        if (DatumGetBool(column->bv_values[INCLUSION_CONTAINS_EMPTY]))
+            PG_RETURN_BOOL(false);
+
+        column->bv_values[INCLUSION_CONTAINS_EMPTY] = BoolGetDatum(true);
+        PG_RETURN_BOOL(true);
+    }
+
+    /* The value was stored as the initial union above: summary changed. */
+    if (first_value)
+        PG_RETURN_BOOL(true);
+
+    old_union = column->bv_values[INCLUSION_UNION];
+
+    /* Nothing to do if the union already contains the new value. */
+    proc = inclusion_get_procinfo(bdesc, attno, PROCNUM_CONTAINS);
+    if (proc != NULL &&
+        DatumGetBool(FunctionCall2Coll(proc, colloid, old_union, newval)))
+        PG_RETURN_BOOL(false);
+
+    /*
+     * If the new value cannot be merged into the existing union, flag the
+     * range as containing unmergeable elements and stop.
+     *
+     * Note: the union is not going to be used any longer at this point and
+     * could be dropped, but the BRIN framework doesn't allow for the value
+     * not being present.  Improve someday.
+     */
+    proc = inclusion_get_procinfo(bdesc, attno, PROCNUM_MERGEABLE);
+    if (proc != NULL &&
+        !DatumGetBool(FunctionCall2Coll(proc, colloid, old_union, newval)))
+    {
+        column->bv_values[INCLUSION_UNMERGEABLE] = BoolGetDatum(true);
+        PG_RETURN_BOOL(true);
+    }
+
+    /* Otherwise merge the new value into the union; this proc is required. */
+    proc = inclusion_get_procinfo(bdesc, attno, PROCNUM_MERGE);
+    Assert(proc != NULL);
+    merged = FunctionCall2Coll(proc, colloid, old_union, newval);
+
+    if (!attr->attbyval &&
+        DatumGetPointer(merged) != DatumGetPointer(old_union))
+    {
+        /* The merge produced a different datum, so release the old union. */
+        pfree(DatumGetPointer(old_union));
+
+        /* Never keep a reference to the caller's value; store a copy. */
+        if (merged == newval)
+            merged = datumCopy(merged, attr->attbyval, attr->attlen);
+    }
+    column->bv_values[INCLUSION_UNION] = merged;
+
+    PG_RETURN_BOOL(true);
+}
+
+/*
+ * BRIN inclusion consistent function
+ *
+ * Decide whether the summary in "column" may match the given scan key. The
+ * key's strategy number selects which operator(s) are applied to the stored
+ * union and the key argument; the answer is returned as a boolean Datum.
+ * A range flagged as containing unmergeable elements is always reported as
+ * consistent.
+ *
+ * We're no longer dealing with NULL keys in the consistent function, that is
+ * now handled by the AM code. That means we should not get any all-NULL ranges
+ * either, because those can't be consistent with regular (not [IS] NULL) keys.
+ *
+ * All of the strategies are optional.
+ */
+Datum
+brin_inclusion_consistent(PG_FUNCTION_ARGS)
+{
+ BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+ BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
+ ScanKey key = (ScanKey) PG_GETARG_POINTER(2);
+ Oid colloid = PG_GET_COLLATION(),
+ subtype;
+ Datum unionval;
+ AttrNumber attno;
+ Datum query;
+ FmgrInfo *finfo;
+ Datum result;
+
+ /* This opclass uses the old signature with only three arguments. */
+ Assert(PG_NARGS() == 3);
+
+ /* Should not be dealing with all-NULL ranges. */
+ Assert(!column->bv_allnulls);
+
+ /* A range containing unmergeable elements always has to be checked. */
+ if (DatumGetBool(column->bv_values[INCLUSION_UNMERGEABLE]))
+ PG_RETURN_BOOL(true);
+
+ attno = key->sk_attno;
+ subtype = key->sk_subtype;
+ query = key->sk_argument;
+ unionval = column->bv_values[INCLUSION_UNION];
+ switch (key->sk_strategy)
+ {
+ /*
+ * Placement strategies
+ *
+ * These are implemented by logically negating the result of the
+ * converse placement operator; for this to work, the converse
+ * operator must be part of the opclass. An error will be thrown
+ * by inclusion_get_strategy_procinfo() if the required strategy
+ * is not part of the opclass.
+ *
+ * These all return false if either argument is empty, so there is
+ * no need to check for empty elements.
+ */
+
+ case RTLeftStrategyNumber:
+ finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype,
+ RTOverRightStrategyNumber);
+ result = FunctionCall2Coll(finfo, colloid, unionval, query);
+ PG_RETURN_BOOL(!DatumGetBool(result));
+
+ case RTOverLeftStrategyNumber:
+ finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype,
+ RTRightStrategyNumber);
+ result = FunctionCall2Coll(finfo, colloid, unionval, query);
+ PG_RETURN_BOOL(!DatumGetBool(result));
+
+ case RTOverRightStrategyNumber:
+ finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype,
+ RTLeftStrategyNumber);
+ result = FunctionCall2Coll(finfo, colloid, unionval, query);
+ PG_RETURN_BOOL(!DatumGetBool(result));
+
+ case RTRightStrategyNumber:
+ finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype,
+ RTOverLeftStrategyNumber);
+ result = FunctionCall2Coll(finfo, colloid, unionval, query);
+ PG_RETURN_BOOL(!DatumGetBool(result));
+
+ case RTBelowStrategyNumber:
+ finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype,
+ RTOverAboveStrategyNumber);
+ result = FunctionCall2Coll(finfo, colloid, unionval, query);
+ PG_RETURN_BOOL(!DatumGetBool(result));
+
+ case RTOverBelowStrategyNumber:
+ finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype,
+ RTAboveStrategyNumber);
+ result = FunctionCall2Coll(finfo, colloid, unionval, query);
+ PG_RETURN_BOOL(!DatumGetBool(result));
+
+ case RTOverAboveStrategyNumber:
+ finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype,
+ RTBelowStrategyNumber);
+ result = FunctionCall2Coll(finfo, colloid, unionval, query);
+ PG_RETURN_BOOL(!DatumGetBool(result));
+
+ case RTAboveStrategyNumber:
+ finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype,
+ RTOverBelowStrategyNumber);
+ result = FunctionCall2Coll(finfo, colloid, unionval, query);
+ PG_RETURN_BOOL(!DatumGetBool(result));
+
+ /*
+ * Overlap and contains strategies
+ *
+ * These strategies are simple enough that we can simply call the
+ * operator and return its result. Empty elements don't change
+ * the result.
+ */
+
+ case RTOverlapStrategyNumber:
+ case RTContainsStrategyNumber:
+ case RTContainsElemStrategyNumber:
+ case RTSubStrategyNumber:
+ case RTSubEqualStrategyNumber:
+ finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype,
+ key->sk_strategy);
+ result = FunctionCall2Coll(finfo, colloid, unionval, query);
+ PG_RETURN_DATUM(result);
+
+ /*
+ * Contained by strategies
+ *
+ * We cannot just call the original operator for the contained by
+ * strategies because some elements can be contained even though
+ * the union is not; instead we use the overlap operator.
+ *
+ * We check for empty elements separately as they are not merged
+ * to the union but contained by everything.
+ */
+
+ case RTContainedByStrategyNumber:
+ case RTSuperStrategyNumber:
+ case RTSuperEqualStrategyNumber:
+ finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype,
+ RTOverlapStrategyNumber);
+ result = FunctionCall2Coll(finfo, colloid, unionval, query);
+ if (DatumGetBool(result))
+ PG_RETURN_BOOL(true);
+
+ PG_RETURN_DATUM(column->bv_values[INCLUSION_CONTAINS_EMPTY]);
+
+ /*
+ * Adjacent strategy
+ *
+ * We test for overlap first but to be safe we need to call the
+ * actual adjacent operator also.
+ *
+ * An empty element cannot be adjacent to any other, so there is
+ * no need to check for it.
+ */
+
+ case RTAdjacentStrategyNumber:
+ finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype,
+ RTOverlapStrategyNumber);
+ result = FunctionCall2Coll(finfo, colloid, unionval, query);
+ if (DatumGetBool(result))
+ PG_RETURN_BOOL(true);
+
+ finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype,
+ RTAdjacentStrategyNumber);
+ result = FunctionCall2Coll(finfo, colloid, unionval, query);
+ PG_RETURN_DATUM(result);
+
+ /*
+ * Basic comparison strategies
+ *
+ * It is straightforward to support the equality strategies with
+ * the contains operator. Generally, inequality strategies do not
+ * make much sense for the types which will be used with the
+ * inclusion BRIN family of opclasses, but it is possible to
+ * implement them with logical negation of the left-of and
+ * right-of operators.
+ *
+ * NB: These strategies cannot be used with geometric datatypes
+ * that use comparison of areas! The only exception is the "same"
+ * strategy.
+ *
+ * Empty elements are considered to be less than the others. We
+ * cannot use the empty support function to check the query is an
+ * empty element, because the query can be another data type than
+ * the empty support function argument. So we will return true,
+ * if there is a possibility that empty elements will change the
+ * result.
+ */
+
+ case RTLessStrategyNumber:
+ case RTLessEqualStrategyNumber:
+ finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype,
+ RTRightStrategyNumber);
+ result = FunctionCall2Coll(finfo, colloid, unionval, query);
+ if (!DatumGetBool(result))
+ PG_RETURN_BOOL(true);
+
+ PG_RETURN_DATUM(column->bv_values[INCLUSION_CONTAINS_EMPTY]);
+
+ case RTSameStrategyNumber:
+ case RTEqualStrategyNumber:
+ finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype,
+ RTContainsStrategyNumber);
+ result = FunctionCall2Coll(finfo, colloid, unionval, query);
+ if (DatumGetBool(result))
+ PG_RETURN_BOOL(true);
+
+ PG_RETURN_DATUM(column->bv_values[INCLUSION_CONTAINS_EMPTY]);
+
+ case RTGreaterEqualStrategyNumber:
+ finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype,
+ RTLeftStrategyNumber);
+ result = FunctionCall2Coll(finfo, colloid, unionval, query);
+ if (!DatumGetBool(result))
+ PG_RETURN_BOOL(true);
+
+ PG_RETURN_DATUM(column->bv_values[INCLUSION_CONTAINS_EMPTY]);
+
+ case RTGreaterStrategyNumber:
+ /* no need to check for empty elements */
+ finfo = inclusion_get_strategy_procinfo(bdesc, attno, subtype,
+ RTLeftStrategyNumber);
+ result = FunctionCall2Coll(finfo, colloid, unionval, query);
+ PG_RETURN_BOOL(!DatumGetBool(result));
+
+ default:
+ /* shouldn't happen */
+ elog(ERROR, "invalid strategy number %d", key->sk_strategy);
+ PG_RETURN_BOOL(false);
+ }
+}
+
+/*
+ * BRIN inclusion union function
+ *
+ * Fold the summary in the second BrinValues (B) into the first one (A), so
+ * that A describes both.  B is only read, never modified.  Neither input
+ * may be an all-nulls summary, and both must belong to the same attribute.
+ */
+Datum
+brin_inclusion_union(PG_FUNCTION_ARGS)
+{
+    BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+    BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1);
+    BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2);
+    Oid         colloid = PG_GET_COLLATION();
+    AttrNumber  attno;
+    Form_pg_attribute attr;
+    FmgrInfo   *proc;
+    Datum       union_a;
+    Datum       union_b;
+    Datum       merged;
+
+    Assert(col_a->bv_attno == col_b->bv_attno);
+    Assert(!col_a->bv_allnulls && !col_b->bv_allnulls);
+
+    attno = col_a->bv_attno;
+    attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1);
+
+    /* Carry B's "contains empty" flag over to A unless A already has it. */
+    if (DatumGetBool(col_b->bv_values[INCLUSION_CONTAINS_EMPTY]) &&
+        !DatumGetBool(col_a->bv_values[INCLUSION_CONTAINS_EMPTY]))
+        col_a->bv_values[INCLUSION_CONTAINS_EMPTY] = BoolGetDatum(true);
+
+    /* Nothing more to do when A already holds unmergeable elements. */
+    if (DatumGetBool(col_a->bv_values[INCLUSION_UNMERGEABLE]))
+        PG_RETURN_VOID();
+
+    /* B holds unmergeable elements, so the combined summary does too. */
+    if (DatumGetBool(col_b->bv_values[INCLUSION_UNMERGEABLE]))
+    {
+        col_a->bv_values[INCLUSION_UNMERGEABLE] = BoolGetDatum(true);
+        PG_RETURN_VOID();
+    }
+
+    union_a = col_a->bv_values[INCLUSION_UNION];
+    union_b = col_b->bv_values[INCLUSION_UNION];
+
+    /* If the two unions cannot be merged, mark A unmergeable. */
+    proc = inclusion_get_procinfo(bdesc, attno, PROCNUM_MERGEABLE);
+    if (proc != NULL &&
+        !DatumGetBool(FunctionCall2Coll(proc, colloid, union_a, union_b)))
+    {
+        col_a->bv_values[INCLUSION_UNMERGEABLE] = BoolGetDatum(true);
+        PG_RETURN_VOID();
+    }
+
+    /* Merge B's union into A's; the merge proc is required. */
+    proc = inclusion_get_procinfo(bdesc, attno, PROCNUM_MERGE);
+    Assert(proc != NULL);
+    merged = FunctionCall2Coll(proc, colloid, union_a, union_b);
+
+    if (!attr->attbyval &&
+        DatumGetPointer(merged) != DatumGetPointer(union_a))
+    {
+        /* A's old union is being replaced, so release it. */
+        pfree(DatumGetPointer(union_a));
+
+        /* Never make A point at B's datum; store a copy instead. */
+        if (merged == union_b)
+            merged = datumCopy(merged, attr->attbyval, attr->attlen);
+    }
+    col_a->bv_values[INCLUSION_UNION] = merged;
+
+    PG_RETURN_VOID();
+}
+
+/*
+ * Cache and return inclusion opclass support procedure
+ *
+ * Return the procedure corresponding to the given support function number,
+ * or NULL if it does not exist.  Results, including the fact that a
+ * procedure is missing, are remembered in the column's InclusionOpaque to
+ * avoid repetitive syscache lookups.
+ */
+static FmgrInfo *
+inclusion_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum)
+{
+    uint16      slot = procnum - PROCNUM_BASE;
+    InclusionOpaque *cache;
+    FmgrInfo   *entry;
+
+    cache = (InclusionOpaque *) bdesc->bd_info[attno - 1]->oi_opaque;
+
+    /* Already searched for this proc and didn't find it: don't retry. */
+    if (cache->extra_proc_missing[slot])
+        return NULL;
+
+    /* Already looked up successfully. */
+    entry = &cache->extra_procinfos[slot];
+    if (entry->fn_oid != InvalidOid)
+        return entry;
+
+    /* First request: remember the failure if the index has no such proc. */
+    if (!RegProcedureIsValid(index_getprocid(bdesc->bd_index, attno,
+                                             procnum)))
+    {
+        cache->extra_proc_missing[slot] = true;
+        return NULL;
+    }
+
+    /* Copy the procinfo into our cache, in the descriptor's context. */
+    fmgr_info_copy(entry,
+                   index_getprocinfo(bdesc->bd_index, attno, procnum),
+                   bdesc->bd_context);
+
+    return entry;
+}
+
+/*
+ * Cache and return the procedure of the given strategy
+ *
+ * Return the procedure implementing the operator for the given sub-type and
+ * strategy number.  The data type of the index is used as the left hand
+ * side of the operator and the given sub-type as the right hand side.
+ * Throws an error if the pg_amop row does not exist, but that should not
+ * happen with a properly configured opclass.
+ *
+ * It always throws an error when the data type of the opclass is different
+ * from the data type of the column or the expression.  That happens when
+ * the column data type has an implicit cast to the opclass data type.  We
+ * don't bother casting types, because this situation can easily be avoided
+ * by setting the storage data type to that of the opclass.  The same
+ * problem does not apply to the data type of the right hand side, because
+ * the type in the ScanKey always matches the opclass' one.
+ *
+ * Note: this function mirrors minmax_get_strategy_procinfo; if changes are
+ * made here, see that function too.
+ */
+static FmgrInfo *
+inclusion_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno, Oid subtype,
+                                uint16 strategynum)
+{
+    InclusionOpaque *cache;
+    FmgrInfo   *entry;
+    Form_pg_attribute attr;
+    HeapTuple   amop_tuple;
+    Oid         opfamily;
+    Oid         oprid;
+    bool        isNull;
+
+    Assert(strategynum >= 1 &&
+           strategynum <= RTMaxStrategyNumber);
+
+    cache = (InclusionOpaque *) bdesc->bd_info[attno - 1]->oi_opaque;
+
+    /*
+     * Only the procedures for the most recently used sub-type are cached.
+     * When a different sub-type is requested, drop every cached entry.
+     */
+    if (cache->cached_subtype != subtype)
+    {
+        int         slot;
+
+        for (slot = 0; slot < RTMaxStrategyNumber; slot++)
+            cache->strategy_procinfos[slot].fn_oid = InvalidOid;
+        cache->cached_subtype = subtype;
+    }
+
+    /* Cache hit: nothing to look up. */
+    entry = &cache->strategy_procinfos[strategynum - 1];
+    if (entry->fn_oid != InvalidOid)
+        return entry;
+
+    /* Cache miss: find the operator in pg_amop and resolve its function. */
+    opfamily = bdesc->bd_index->rd_opfamily[attno - 1];
+    attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1);
+    amop_tuple = SearchSysCache4(AMOPSTRATEGY,
+                                 ObjectIdGetDatum(opfamily),
+                                 ObjectIdGetDatum(attr->atttypid),
+                                 ObjectIdGetDatum(subtype),
+                                 Int16GetDatum(strategynum));
+
+    if (!HeapTupleIsValid(amop_tuple))
+        elog(ERROR, "missing operator %d(%u,%u) in opfamily %u",
+             strategynum, attr->atttypid, subtype, opfamily);
+
+    oprid = DatumGetObjectId(SysCacheGetAttr(AMOPSTRATEGY, amop_tuple,
+                                             Anum_pg_amop_amopopr, &isNull));
+    ReleaseSysCache(amop_tuple);
+    Assert(!isNull && RegProcedureIsValid(oprid));
+
+    fmgr_info_cxt(get_opcode(oprid), entry, bdesc->bd_context);
+
+    return entry;
+}
diff --git a/src/backend/access/brin/brin_minmax.c b/src/backend/access/brin/brin_minmax.c
new file mode 100644
index 0000000..798f06c
--- /dev/null
+++ b/src/backend/access/brin/brin_minmax.c
@@ -0,0 +1,317 @@
+/*
+ * brin_minmax.c
+ * Implementation of Min/Max opclass for BRIN
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/brin/brin_minmax.c
+ */
+#include "postgres.h"
+
+#include "access/brin_internal.h"
+#include "access/brin_tuple.h"
+#include "access/genam.h"
+#include "access/stratnum.h"
+#include "catalog/pg_amop.h"
+#include "catalog/pg_type.h"
+#include "utils/builtins.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+
+typedef struct MinmaxOpaque
+{
+ Oid cached_subtype; /* subtype the strategy_procinfos entries were looked up for */
+ FmgrInfo strategy_procinfos[BTMaxStrategyNumber]; /* operator procs, indexed by strategynum - 1 */
+} MinmaxOpaque;
+
+static FmgrInfo *minmax_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno,
+ Oid subtype, uint16 strategynum); /* raises an error if the operator is missing */
+
+
+/*
+ * brin_minmax_opcinfo
+ *      OpcInfo support function for the BRIN minmax opclasses.
+ *
+ * Takes the type OID as its only argument and returns a BrinOpcInfo that
+ * declares two stored values of that type.  A MinmaxOpaque is placed in
+ * the same allocation, after the MAXALIGN'ed BrinOpcInfo.
+ */
+Datum
+brin_minmax_opcinfo(PG_FUNCTION_ARGS)
+{
+    Oid         typoid = PG_GETARG_OID(0);
+    BrinOpcInfo *opcinfo;
+    char       *opaque_start;
+    TypeCacheEntry *typcache;
+
+    /*
+     * The strategy_procinfos cache in the opaque struct is filled in
+     * lazily; palloc0 leaves every fn_oid at InvalidOid, which marks the
+     * entries as not yet looked up.
+     */
+    opcinfo = palloc0(MAXALIGN(SizeofBrinOpcInfo(2)) +
+                      sizeof(MinmaxOpaque));
+    opaque_start = (char *) opcinfo + SizeofBrinOpcInfo(2);
+
+    opcinfo->oi_nstored = 2;
+    opcinfo->oi_regular_nulls = true;
+    opcinfo->oi_opaque = (MinmaxOpaque *) MAXALIGN(opaque_start);
+
+    /* both stored values share the type cache entry of the indexed type */
+    typcache = lookup_type_cache(typoid, 0);
+    opcinfo->oi_typcache[1] = typcache;
+    opcinfo->oi_typcache[0] = typcache;
+
+    PG_RETURN_POINTER(opcinfo);
+}
+
+/*
+ * Examine the given index tuple (which contains partial status of a certain
+ * page range) by comparing it to the given value that comes from another heap
+ * tuple.  If the new value is outside the min/max range specified by the
+ * existing tuple values, update the index tuple and return true.  Otherwise,
+ * return false and leave the index tuple unmodified.
+ *
+ * Arguments: the BrinDesc, the BrinValues of the column being summarized
+ * (bv_values[0] holds the minimum, bv_values[1] the maximum), the new
+ * value, and its null flag (which must be false).
+ */
+Datum
+brin_minmax_add_value(PG_FUNCTION_ARGS)
+{
+    BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+    BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
+    Datum       newval = PG_GETARG_DATUM(2);
+    /* the null flag is a boolean argument, so fetch it as one */
+    bool        isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_BOOL(3);
+    Oid         colloid = PG_GET_COLLATION();
+    FmgrInfo   *cmpFn;
+    Datum       compar;
+    bool        updated = false;
+    Form_pg_attribute attr;
+    AttrNumber  attno;
+
+    Assert(!isnull);
+
+    attno = column->bv_attno;
+    attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1);
+
+    /*
+     * If the recorded value is null, store the new value (which we know to be
+     * not null) as both minimum and maximum, and we're done.
+     */
+    if (column->bv_allnulls)
+    {
+        column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen);
+        column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen);
+        column->bv_allnulls = false;
+        PG_RETURN_BOOL(true);
+    }
+
+    /*
+     * Otherwise, need to compare the new value with the existing boundaries
+     * and update them accordingly.  First check if it's less than the
+     * existing minimum.
+     */
+    cmpFn = minmax_get_strategy_procinfo(bdesc, attno, attr->atttypid,
+                                         BTLessStrategyNumber);
+    compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[0]);
+    if (DatumGetBool(compar))
+    {
+        /* release the old by-reference minimum before replacing it */
+        if (!attr->attbyval)
+            pfree(DatumGetPointer(column->bv_values[0]));
+        column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen);
+        updated = true;
+    }
+
+    /*
+     * And now compare it to the existing maximum.
+     */
+    cmpFn = minmax_get_strategy_procinfo(bdesc, attno, attr->atttypid,
+                                         BTGreaterStrategyNumber);
+    compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[1]);
+    if (DatumGetBool(compar))
+    {
+        /* release the old by-reference maximum before replacing it */
+        if (!attr->attbyval)
+            pfree(DatumGetPointer(column->bv_values[1]));
+        column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen);
+        updated = true;
+    }
+
+    PG_RETURN_BOOL(updated);
+}
+
+/*
+ * Given an index tuple corresponding to a certain page range and a scan key,
+ * return whether the scan key is consistent with the index tuple's min/max
+ * values.  The answer is the boolean Datum produced by the comparison
+ * operator: true if consistent, false otherwise.
+ *
+ * We're no longer dealing with NULL keys in the consistent function, that is
+ * now handled by the AM code.  That means we should not get any all-NULL
+ * ranges either, because those can't be consistent with regular (not [IS]
+ * NULL) keys.
+ */
+Datum
+brin_minmax_consistent(PG_FUNCTION_ARGS)
+{
+    BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+    BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
+    ScanKey     key = (ScanKey) PG_GETARG_POINTER(2);
+    Oid         colloid = PG_GET_COLLATION();
+    AttrNumber  attno;
+    Oid         subtype;
+    Datum       keyval;
+    Datum       answer;
+    FmgrInfo   *opproc;
+
+    /* This opclass uses the old signature with only three arguments. */
+    Assert(PG_NARGS() == 3);
+
+    /* Should not be dealing with all-NULL ranges. */
+    Assert(!column->bv_allnulls);
+
+    attno = key->sk_attno;
+    subtype = key->sk_subtype;
+    keyval = key->sk_argument;
+
+    switch (key->sk_strategy)
+    {
+        case BTLessStrategyNumber:
+        case BTLessEqualStrategyNumber:
+            /* apply the key's own operator to the range minimum */
+            opproc = minmax_get_strategy_procinfo(bdesc, attno, subtype,
+                                                  key->sk_strategy);
+            answer = FunctionCall2Coll(opproc, colloid,
+                                       column->bv_values[0], keyval);
+            break;
+
+        case BTGreaterEqualStrategyNumber:
+        case BTGreaterStrategyNumber:
+            /* apply the key's own operator to the range maximum */
+            opproc = minmax_get_strategy_procinfo(bdesc, attno, subtype,
+                                                  key->sk_strategy);
+            answer = FunctionCall2Coll(opproc, colloid,
+                                       column->bv_values[1], keyval);
+            break;
+
+        case BTEqualStrategyNumber:
+
+            /*
+             * In the equality case (WHERE col = someval), the page range
+             * qualifies if min() <= scankey and max() >= scankey.  The
+             * second test is only performed when the first one succeeds.
+             */
+            opproc = minmax_get_strategy_procinfo(bdesc, attno, subtype,
+                                                  BTLessEqualStrategyNumber);
+            answer = FunctionCall2Coll(opproc, colloid,
+                                       column->bv_values[0], keyval);
+            if (DatumGetBool(answer))
+            {
+                /* max() >= scankey */
+                opproc = minmax_get_strategy_procinfo(bdesc, attno, subtype,
+                                                      BTGreaterEqualStrategyNumber);
+                answer = FunctionCall2Coll(opproc, colloid,
+                                           column->bv_values[1], keyval);
+            }
+            break;
+
+        default:
+            /* shouldn't happen */
+            elog(ERROR, "invalid strategy number %d", key->sk_strategy);
+            answer = 0;
+            break;
+    }
+
+    PG_RETURN_DATUM(answer);
+}
+
+/*
+ * Given two BrinValues, update the first of them as a union of the summary
+ * values contained in both.  The second one is untouched.
+ *
+ * Neither input may be an all-nulls summary, and both must belong to the
+ * same attribute.  bv_values[0] holds the minimum and bv_values[1] the
+ * maximum.
+ */
+Datum
+brin_minmax_union(PG_FUNCTION_ARGS)
+{
+    BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+    BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1);
+    BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2);
+    Oid         colloid = PG_GET_COLLATION();
+    AttrNumber  attno;
+    Form_pg_attribute attr;
+    FmgrInfo   *finfo;
+    bool        needsadj;
+
+    Assert(col_a->bv_attno == col_b->bv_attno);
+    Assert(!col_a->bv_allnulls && !col_b->bv_allnulls);
+
+    attno = col_a->bv_attno;
+    attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1);
+
+    /*
+     * Adjust minimum, if B's min is less than A's min.  The operator
+     * returns a Datum, so convert it explicitly with DatumGetBool instead
+     * of relying on an implicit Datum-to-bool conversion.
+     */
+    finfo = minmax_get_strategy_procinfo(bdesc, attno, attr->atttypid,
+                                         BTLessStrategyNumber);
+    needsadj = DatumGetBool(FunctionCall2Coll(finfo, colloid,
+                                              col_b->bv_values[0],
+                                              col_a->bv_values[0]));
+    if (needsadj)
+    {
+        /* release A's old by-reference minimum before replacing it */
+        if (!attr->attbyval)
+            pfree(DatumGetPointer(col_a->bv_values[0]));
+        col_a->bv_values[0] = datumCopy(col_b->bv_values[0],
+                                        attr->attbyval, attr->attlen);
+    }
+
+    /* Adjust maximum, if B's max is greater than A's max */
+    finfo = minmax_get_strategy_procinfo(bdesc, attno, attr->atttypid,
+                                         BTGreaterStrategyNumber);
+    needsadj = DatumGetBool(FunctionCall2Coll(finfo, colloid,
+                                              col_b->bv_values[1],
+                                              col_a->bv_values[1]));
+    if (needsadj)
+    {
+        /* release A's old by-reference maximum before replacing it */
+        if (!attr->attbyval)
+            pfree(DatumGetPointer(col_a->bv_values[1]));
+        col_a->bv_values[1] = datumCopy(col_b->bv_values[1],
+                                        attr->attbyval, attr->attlen);
+    }
+
+    PG_RETURN_VOID();
+}
+
+/*
+ * Cache and return the procedure for the given strategy.
+ *
+ * The index column's data type is used as the operator's left hand side
+ * and the given subtype as its right hand side.  Raises an error if the
+ * operator family has no matching pg_amop entry.
+ *
+ * Note: this function mirrors inclusion_get_strategy_procinfo; see notes
+ * there.  If changes are made here, see that function too.
+ */
+static FmgrInfo *
+minmax_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno, Oid subtype,
+                             uint16 strategynum)
+{
+    MinmaxOpaque *cache;
+    FmgrInfo   *entry;
+    Form_pg_attribute attr;
+    HeapTuple   amop_tuple;
+    Oid         opfamily;
+    Oid         oprid;
+    bool        isNull;
+
+    Assert(strategynum >= 1 &&
+           strategynum <= BTMaxStrategyNumber);
+
+    cache = (MinmaxOpaque *) bdesc->bd_info[attno - 1]->oi_opaque;
+
+    /*
+     * Only the procedures for the previously used subtype are cached, to
+     * avoid repetitive syscache lookups.  When a different subtype is
+     * requested, drop every cached entry.
+     */
+    if (cache->cached_subtype != subtype)
+    {
+        int         slot;
+
+        for (slot = 0; slot < BTMaxStrategyNumber; slot++)
+            cache->strategy_procinfos[slot].fn_oid = InvalidOid;
+        cache->cached_subtype = subtype;
+    }
+
+    /* Cache hit: nothing to look up. */
+    entry = &cache->strategy_procinfos[strategynum - 1];
+    if (entry->fn_oid != InvalidOid)
+        return entry;
+
+    /* Cache miss: find the operator in pg_amop and resolve its function. */
+    opfamily = bdesc->bd_index->rd_opfamily[attno - 1];
+    attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1);
+    amop_tuple = SearchSysCache4(AMOPSTRATEGY,
+                                 ObjectIdGetDatum(opfamily),
+                                 ObjectIdGetDatum(attr->atttypid),
+                                 ObjectIdGetDatum(subtype),
+                                 Int16GetDatum(strategynum));
+
+    if (!HeapTupleIsValid(amop_tuple))
+        elog(ERROR, "missing operator %d(%u,%u) in opfamily %u",
+             strategynum, attr->atttypid, subtype, opfamily);
+
+    oprid = DatumGetObjectId(SysCacheGetAttr(AMOPSTRATEGY, amop_tuple,
+                                             Anum_pg_amop_amopopr, &isNull));
+    ReleaseSysCache(amop_tuple);
+    Assert(!isNull && RegProcedureIsValid(oprid));
+
+    fmgr_info_cxt(get_opcode(oprid), entry, bdesc->bd_context);
+
+    return entry;
+}
diff --git a/src/backend/access/brin/brin_minmax_multi.c b/src/backend/access/brin/brin_minmax_multi.c
new file mode 100644
index 0000000..5200916
--- /dev/null
+++ b/src/backend/access/brin/brin_minmax_multi.c
@@ -0,0 +1,3163 @@
+/*
+ * brin_minmax_multi.c
+ * Implementation of Multi Min/Max opclass for BRIN
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * Implements a variant of minmax opclass, where the summary is composed of
+ * multiple smaller intervals. This allows us to handle outliers, which
+ * usually make the simple minmax opclass inefficient.
+ *
+ * Consider for example page range with simple minmax interval [1000,2000],
+ * and assume a new row gets inserted into the range with value 1000000.
+ * Due to that the interval gets [1000,1000000]. I.e. the minmax interval
+ * got 1000x wider and won't be useful to eliminate scan keys between 2001
+ * and 1000000.
+ *
+ * With minmax-multi opclass, we may have [1000,2000] interval initially,
+ * but after adding the new row we start tracking it as two intervals:
+ *
+ * [1000,2000] and [1000000,1000000]
+ *
+ * This allows us to still eliminate the page range when the scan keys hit
+ * the gap between 2000 and 1000000, making it useful in cases when the
+ * simple minmax opclass gets inefficient.
+ *
+ * The number of intervals tracked per page range is somewhat flexible.
+ * What is restricted is the number of values per page range, and the limit
+ * is currently 32 (see values_per_range reloption). Collapsed intervals
+ * (with equal minimum and maximum value) are stored as a single value,
+ * while regular intervals require two values.
+ *
+ * When the number of values gets too high (by adding new values to the
+ * summary), we merge some of the intervals to free space for more values.
+ * This is done in a greedy way - we simply pick the two closest intervals,
+ * merge them, and repeat this until the number of values to store gets
+ * sufficiently low (below 50% of maximum values, but that is a mostly
+ * arbitrary threshold and may be changed easily).
+ *
+ * To pick the closest intervals we use the "distance" support procedure,
+ * which measures the space between two ranges (i.e. the length of the gap
+ * separating them). The computed value may be an approximation - in the
+ * worst case we will merge two ranges that are slightly less optimal at
+ * that step, but the index should still produce correct results.
+ *
+ * Compaction (reducing the number of values) is fairly expensive, as
+ * it requires calling the distance functions, sorting etc. So when building
+ * the summary, we use a significantly larger buffer, and only enforce the
+ * exact limit at the very end. This improves performance, and it also helps
+ * with building better ranges (due to the greedy approach).
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/brin/brin_minmax_multi.c
+ */
+#include "postgres.h"
+
+/* needed for PGSQL_AF_INET */
+#include <sys/socket.h>
+
+#include "access/genam.h"
+#include "access/brin.h"
+#include "access/brin_internal.h"
+#include "access/brin_tuple.h"
+#include "access/reloptions.h"
+#include "access/stratnum.h"
+#include "access/htup_details.h"
+#include "catalog/pg_type.h"
+#include "catalog/pg_am.h"
+#include "catalog/pg_amop.h"
+#include "utils/array.h"
+#include "utils/builtins.h"
+#include "utils/date.h"
+#include "utils/datum.h"
+#include "utils/float.h"
+#include "utils/inet.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/numeric.h"
+#include "utils/pg_lsn.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+#include "utils/timestamp.h"
+#include "utils/uuid.h"
+
+/*
+ * Additional SQL level support functions
+ *
+ * Procedure numbers must not use values reserved for BRIN itself; see
+ * brin_internal.h.
+ */
+#define MINMAX_MAX_PROCNUMS 1 /* maximum support procs we need */
+#define PROCNUM_DISTANCE 11 /* required, distance between values */
+
+/*
+ * Subtract this from procnum to obtain index in MinmaxMultiOpaque arrays
+ * (Must be equal to minimum of private procnums).
+ */
+#define PROCNUM_BASE 11
+
+/*
+ * Sizing the insert buffer - we use 10x the number of values specified
+ * in the reloption, but we cap it at 8192 so it does not get too large.
+ * When the buffer gets full, we reduce the number of values by half.
+ */
+#define MINMAX_BUFFER_FACTOR 10 /* buffer size as a multiple of the reloption */
+#define MINMAX_BUFFER_MIN 256 /* presumably the smallest buffer used - confirm at use site */
+#define MINMAX_BUFFER_MAX 8192 /* upper cap on the buffer size */
+#define MINMAX_BUFFER_LOAD_FACTOR 0.5 /* fraction of values kept when the buffer fills */
+
+typedef struct MinmaxMultiOpaque
+{
+ FmgrInfo extra_procinfos[MINMAX_MAX_PROCNUMS]; /* indexed by (procnum - PROCNUM_BASE) */
+ bool extra_proc_missing[MINMAX_MAX_PROCNUMS]; /* same indexing; presumably marks procs known to be absent - confirm */
+ Oid cached_subtype; /* NOTE(review): presumably the subtype strategy_procinfos was filled for - confirm */
+ FmgrInfo strategy_procinfos[BTMaxStrategyNumber]; /* one slot per btree strategy number */
+} MinmaxMultiOpaque;
+
+/*
+ * Storage type for BRIN's minmax-multi reloptions
+ */
+typedef struct MinMaxMultiOptions
+{
+ int32 vl_len_; /* varlena header (do not touch directly!) */
+ int valuesPerRange; /* number of values per range; 0 selects the default */
+} MinMaxMultiOptions;
+
+#define MINMAX_MULTI_DEFAULT_VALUES_PER_PAGE 32 /* used when no value is set */
+
+#define MinMaxMultiGetValuesPerRange(opts) \
+ ((opts) && (((MinMaxMultiOptions *) (opts))->valuesPerRange != 0) ? \
+ ((MinMaxMultiOptions *) (opts))->valuesPerRange : \
+ MINMAX_MULTI_DEFAULT_VALUES_PER_PAGE)
+
+#define SAMESIGN(a,b) (((a) < 0) == ((b) < 0)) /* both negative, or both non-negative */
+
+/*
+ * The summary of minmax-multi indexes has two representations - Ranges for
+ * convenient processing, and SerializedRanges for storage in a bytea value.
+ *
+ * The Ranges struct stores the boundary values in a single array, but we
+ * treat regular and single-point ranges differently to save space. For
+ * regular ranges (with different boundary values) we have to store both
+ * values, while for "single-point ranges" we only need to save one value.
+ *
+ * The 'values' array stores boundary values for regular ranges first (there
+ * are 2*nranges values to store), and then the nvalues boundary values for
+ * single-point ranges. That is, we have (2*nranges + nvalues) boundary
+ * values in the array.
+ *
+ * +---------------------------------+-------------------------------+
+ * | ranges (sorted pairs of values) | sorted values (single points) |
+ * +---------------------------------+-------------------------------+
+ *
+ * This allows us to quickly add new values, and store outliers without
+ * making the other ranges very wide.
+ *
+ * We never store more than maxvalues values (as set by values_per_range
+ * reloption). If needed we merge some of the ranges.
+ *
+ * To minimize palloc overhead, we always allocate the full array with
+ * space for maxvalues elements. This should be fine as long as
+ * maxvalues is reasonably small (64 seems fine), which is the case
+ * thanks to values_per_range reloption being limited to 256.
+ */
+typedef struct Ranges
+{
+ /* Cache information that we need quite often. */
+ Oid typid;
+ Oid colloid;
+ AttrNumber attno;
+ FmgrInfo *cmp; /* used as a boolean "less than" operator by compare_values */
+
+ /* (2*nranges + nvalues) <= maxvalues */
+ int nranges; /* number of regular ranges (two values each) */
+ int nsorted; /* number of leading single-point values that are sorted */
+ int nvalues; /* number of single-point values (sorted + unsorted) */
+ int maxvalues; /* capacity of the values array (buffer size) */
+
+ /*
+ * We simply add the values into a large buffer, without any expensive
+ * steps (sorting, deduplication, ...). The buffer is a multiple of the
+ * target number of values, so the compaction happens less often,
+ * amortizing the costs. We keep the actual target and compact to the
+ * requested number of values at the very end, before serializing to
+ * on-disk representation.
+ */
+ /* requested number of values */
+ int target_maxvalues;
+
+ /* values stored for this range - either raw values, or ranges */
+ Datum values[FLEXIBLE_ARRAY_MEMBER];
+} Ranges;
+
+/*
+ * On-disk the summary is stored as a bytea value, with a simple header
+ * with basic metadata, followed by the boundary values. It has a varlena
+ * header, so it can be treated as a varlena directly.
+ *
+ * See range_serialize/range_deserialize for serialization details.
+ */
+typedef struct SerializedRanges
+{
+ /* varlena header (do not touch directly!) */
+ int32 vl_len_;
+
+ /* type of values stored in the data array */
+ Oid typid;
+
+ /* (2*nranges + nvalues) <= maxvalues */
+ int nranges; /* number of regular ranges (two values each) */
+ int nvalues; /* number of single-point values */
+ int maxvalues; /* requested number of values (target_maxvalues) */
+
+ /* boundary values, packed one after another without alignment padding */
+ char data[FLEXIBLE_ARRAY_MEMBER];
+} SerializedRanges;
+
+static SerializedRanges *range_serialize(Ranges *range);
+
+static Ranges *range_deserialize(int maxvalues, SerializedRanges *range);
+
+
+/*
+ * Used to represent ranges expanded to make merging and combining easier.
+ *
+ * Each expanded range is essentially an interval, represented by min/max
+ * values, along with a flag saying whether it's a collapsed range (in which
+ * case the min and max values are equal). The flag is needed for by-ref
+ * data types - we can't simply compare the datums - and it also saves some
+ * calls to the type-specific comparator function.
+ */
+typedef struct ExpandedRange
+{
+ Datum minval; /* lower boundary */
+ Datum maxval; /* upper boundary */
+ bool collapsed; /* true if minval==maxval (single point) */
+} ExpandedRange;
+
+/*
+ * Represents a distance between two ranges (identified by index into
+ * an array of expanded ranges).
+ */
+typedef struct DistanceValue
+{
+ int index; /* position in the array of expanded ranges */
+ double value; /* the distance itself */
+} DistanceValue;
+
+
+/* Cache for support and strategy procedures. */
+
+static FmgrInfo *minmax_multi_get_procinfo(BrinDesc *bdesc, uint16 attno,
+ uint16 procnum);
+
+static FmgrInfo *minmax_multi_get_strategy_procinfo(BrinDesc *bdesc,
+ uint16 attno, Oid subtype,
+ uint16 strategynum);
+
+typedef struct compare_context
+{
+ FmgrInfo *cmpFn; /* called as a boolean "less than" test by the comparators */
+ Oid colloid; /* collation passed to cmpFn */
+} compare_context;
+
+static int compare_values(const void *a, const void *b, void *arg);
+
+
+#ifdef USE_ASSERT_CHECKING
+/*
+ * Check that the array is strictly ordered, i.e. cmp(values[i], values[i+1])
+ * holds for each adjacent pair (cmp should be the BTLessStrategyNumber proc).
+ */
+static void
+AssertArrayOrder(FmgrInfo *cmp, Oid colloid, Datum *values, int nvalues)
+{
+ int i;
+ Datum lt;
+
+ for (i = 0; i < (nvalues - 1); i++) /* no iterations for 0 or 1 values */
+ {
+ lt = FunctionCall2Coll(cmp, colloid, values[i], values[i + 1]);
+ Assert(DatumGetBool(lt));
+ }
+}
+#endif
+
+/*
+ * Comprehensive check of the Ranges structure (no-op without assertions).
+ */
+static void
+AssertCheckRanges(Ranges *ranges, FmgrInfo *cmpFn, Oid colloid)
+{
+#ifdef USE_ASSERT_CHECKING
+ int i;
+
+ /* some basic sanity checks */
+ Assert(ranges->nranges >= 0);
+ Assert(ranges->nsorted >= 0);
+ Assert(ranges->nvalues >= ranges->nsorted);
+ Assert(ranges->maxvalues >= 2 * ranges->nranges + ranges->nvalues);
+ Assert(ranges->typid != InvalidOid);
+
+ /*
+ * First the ranges - there are 2*nranges boundary values, and the values
+ * have to be strictly ordered (equal values would mean the range is
+ * collapsed, and should be stored as a point). This also guarantees that
+ * the ranges do not overlap.
+ */
+ AssertArrayOrder(cmpFn, colloid, ranges->values, 2 * ranges->nranges);
+
+ /* then the sorted part of the single-point values (nsorted of them) */
+ AssertArrayOrder(cmpFn, colloid, &ranges->values[2 * ranges->nranges],
+ ranges->nsorted);
+
+ /*
+ * Check that none of the single-point values (both sorted and unsorted)
+ * is covered by any of the ranges
+ */
+ for (i = 0; i < ranges->nvalues; i++)
+ {
+ Datum compar;
+ int start,
+ end;
+ Datum minvalue,
+ maxvalue;
+
+ Datum value = ranges->values[2 * ranges->nranges + i];
+
+ if (ranges->nranges == 0)
+ break;
+
+ minvalue = ranges->values[0];
+ maxvalue = ranges->values[2 * ranges->nranges - 1];
+
+ /*
+ * Is the value smaller than the overall minimum? If yes, it can't
+ * fall into any range, so move on to the next value.
+ */
+ compar = FunctionCall2Coll(cmpFn, colloid, value, minvalue);
+
+ /* smaller than the smallest value in the first range */
+ if (DatumGetBool(compar))
+ continue;
+
+ /*
+ * Is the value greater than the overall maximum? If yes, it can't
+ * fall into any range either, so move on to the next value.
+ */
+ compar = FunctionCall2Coll(cmpFn, colloid, maxvalue, value);
+
+ /* larger than the largest value in the last range */
+ if (DatumGetBool(compar))
+ continue;
+
+ start = 0; /* first range */
+ end = ranges->nranges - 1; /* last range */
+ while (true)
+ {
+ int midpoint = (start + end) / 2;
+
+ /* this means we ran out of ranges in the last step */
+ if (start > end)
+ break;
+
+ /* copy the min/max values from the ranges */
+ minvalue = ranges->values[2 * midpoint];
+ maxvalue = ranges->values[2 * midpoint + 1];
+
+ /*
+ * Is the value smaller than the minval? If yes, continue with
+ * the left half of the range array.
+ */
+ compar = FunctionCall2Coll(cmpFn, colloid, value, minvalue);
+
+ /* smaller than the smallest value in this range */
+ if (DatumGetBool(compar))
+ {
+ end = (midpoint - 1);
+ continue;
+ }
+
+ /*
+ * Is the value greater than the maxval? If yes, continue with
+ * the right half of the range array.
+ */
+ compar = FunctionCall2Coll(cmpFn, colloid, maxvalue, value);
+
+ /* larger than the largest value in this range */
+ if (DatumGetBool(compar))
+ {
+ start = (midpoint + 1);
+ continue;
+ }
+
+ /* the value falls into this range, which must never happen */
+ Assert(false);
+ }
+ }
+
+ /* and values in the unsorted part must not be in sorted part */
+ for (i = ranges->nsorted; i < ranges->nvalues; i++)
+ {
+ compare_context cxt;
+ Datum value = ranges->values[2 * ranges->nranges + i];
+
+ if (ranges->nsorted == 0)
+ break;
+
+ cxt.colloid = ranges->colloid; /* taken from ranges, not the colloid argument */
+ cxt.cmpFn = ranges->cmp; /* taken from ranges, not the cmpFn argument */
+
+ Assert(bsearch_arg(&value, &ranges->values[2 * ranges->nranges],
+ ranges->nsorted, sizeof(Datum),
+ compare_values, (void *) &cxt) == NULL);
+ }
+#endif
+}
+
+/*
+ * Check that the expanded ranges (built when reducing the number of ranges
+ * by combining some of them) are correctly sorted and do not overlap.
+ */
+static void
+AssertCheckExpandedRanges(BrinDesc *bdesc, Oid colloid, AttrNumber attno,
+ Form_pg_attribute attr, ExpandedRange *ranges,
+ int nranges)
+{
+#ifdef USE_ASSERT_CHECKING
+ int i;
+ FmgrInfo *eq;
+ FmgrInfo *lt;
+
+ eq = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid,
+ BTEqualStrategyNumber);
+
+ lt = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid,
+ BTLessStrategyNumber);
+
+ /*
+ * Each range on its own has to be valid, i.e. its boundary values must
+ * match the collapsed flag (lower == upper, or lower < upper).
+ */
+ for (i = 0; i < nranges; i++)
+ {
+ Datum r;
+ Datum minval = ranges[i].minval;
+ Datum maxval = ranges[i].maxval;
+
+ if (ranges[i].collapsed) /* collapsed: minval == maxval */
+ r = FunctionCall2Coll(eq, colloid, minval, maxval);
+ else /* non-collapsed: minval < maxval */
+ r = FunctionCall2Coll(lt, colloid, minval, maxval);
+
+ Assert(DatumGetBool(r));
+ }
+
+ /*
+ * And the ranges should be ordered and must not overlap, i.e. the upper
+ * boundary of a range is below the lower boundary of the next range.
+ */
+ for (i = 0; i < nranges - 1; i++)
+ {
+ Datum r;
+ Datum maxval = ranges[i].maxval;
+ Datum minval = ranges[i + 1].minval;
+
+ r = FunctionCall2Coll(lt, colloid, maxval, minval);
+
+ Assert(DatumGetBool(r));
+ }
+#endif
+}
+
+
+/*
+ * minmax_multi_init
+ * Allocate an empty in-memory range list with room for maxvalues values.
+ *
+ * The whole values array is allocated up front (so as not to have to do
+ * repallocs as the ranges grow). The struct is zeroed and only maxvalues
+ * is set here; all the other fields start out as zero.
+ */
+static Ranges *
+minmax_multi_init(int maxvalues)
+{
+ Size len;
+ Ranges *ranges;
+
+ Assert(maxvalues > 0);
+
+ len = offsetof(Ranges, values); /* fixed header */
+ len += maxvalues * sizeof(Datum); /* Datum values */
+
+ ranges = (Ranges *) palloc0(len);
+
+ ranges->maxvalues = maxvalues;
+
+ return ranges;
+}
+
+
+/*
+ * range_deduplicate_values
+ * Deduplicate the part with values in the simple points.
+ *
+ * This is meant to be a cheaper way of reducing the size of the ranges. It
+ * does not touch the ranges, and only sorts the other values - it does not
+ * call the distance functions, which may be quite expensive, etc.
+ *
+ * We do know the values are not duplicate with the ranges, because we check
+ * that before adding a new value. Same for the sorted part of values.
+ */
+static void
+range_deduplicate_values(Ranges *range)
+{
+ int i,
+ n;
+ int start;
+ compare_context cxt;
+
+ /*
+ * If there are no unsorted values, we're done (this probably can't
+ * happen, as we're adding values to unsorted part).
+ */
+ if (range->nsorted == range->nvalues)
+ return;
+
+ /* sort the values */
+ cxt.colloid = range->colloid;
+ cxt.cmpFn = range->cmp;
+
+ /* the values start right after the ranges (which are always sorted) */
+ start = 2 * range->nranges;
+
+ /*
+ * XXX This might do a merge sort, to leverage that the first part of the
+ * array is already sorted. If the sorted part is large, it might be quite
+ * a bit faster.
+ */
+ qsort_arg(&range->values[start],
+ range->nvalues, sizeof(Datum),
+ compare_values, (void *) &cxt);
+
+ n = 1; /* the first value is always kept */
+ for (i = 1; i < range->nvalues; i++)
+ {
+ /* same as preceding value, so skip it */
+ if (compare_values(&range->values[start + i - 1],
+ &range->values[start + i],
+ (void *) &cxt) == 0)
+ continue;
+
+ range->values[start + n] = range->values[start + i]; /* compact in place */
+
+ n++;
+ }
+
+ /* now all the values are sorted */
+ range->nvalues = n;
+ range->nsorted = n;
+
+ AssertCheckRanges(range, range->cmp, range->colloid);
+}
+
+
+/*
+ * range_serialize
+ * Serialize the in-memory representation into a compact varlena value.
+ *
+ * Deduplicates the single-point values, then copies the header and the
+ * boundary values, packed without alignment padding, into one allocation.
+ */
+static SerializedRanges *
+range_serialize(Ranges *range)
+{
+ Size len;
+ int nvalues;
+ SerializedRanges *serialized;
+ Oid typid;
+ int typlen;
+ bool typbyval;
+
+ int i;
+ char *ptr;
+
+ /* simple sanity checks */
+ Assert(range->nranges >= 0);
+ Assert(range->nsorted >= 0);
+ Assert(range->nvalues >= 0);
+ Assert(range->maxvalues > 0);
+ Assert(range->target_maxvalues > 0);
+
+ /* at this point the range should be compacted to the target size */
+ Assert(2 * range->nranges + range->nvalues <= range->target_maxvalues);
+
+ Assert(range->target_maxvalues <= range->maxvalues);
+
+ /* range boundaries are always sorted */
+ Assert(range->nvalues >= range->nsorted);
+
+ /* deduplicate values, if there's unsorted part */
+ range_deduplicate_values(range);
+
+ /* see how many Datum values we actually have */
+ nvalues = 2 * range->nranges + range->nvalues;
+
+ typid = range->typid;
+ typbyval = get_typbyval(typid);
+ typlen = get_typlen(typid);
+
+ /* header is always needed */
+ len = offsetof(SerializedRanges, data);
+
+ /*
+ * The space needed depends on data type - for fixed-length data types
+ * (by-value and some by-reference) it's pretty simple, just multiply
+ * (attlen * nvalues) and we're done. For variable-length by-reference
+ * types we need to actually walk all the values and sum the lengths.
+ */
+ if (typlen == -1) /* varlena */
+ {
+ int i;
+
+ for (i = 0; i < nvalues; i++)
+ {
+ len += VARSIZE_ANY(DatumGetPointer(range->values[i]));
+ }
+ }
+ else if (typlen == -2) /* cstring */
+ {
+ int i;
+
+ for (i = 0; i < nvalues; i++)
+ {
+ /* don't forget to include the null terminator ;-) */
+ len += strlen(DatumGetCString(range->values[i])) + 1;
+ }
+ }
+ else /* fixed-length types (even by-reference) */
+ {
+ Assert(typlen > 0);
+ len += nvalues * typlen;
+ }
+
+ /*
+ * Allocate the serialized object, copy the basic information. The
+ * serialized object is a varlena, so update the header.
+ */
+ serialized = (SerializedRanges *) palloc0(len);
+ SET_VARSIZE(serialized, len);
+
+ serialized->typid = typid;
+ serialized->nranges = range->nranges;
+ serialized->nvalues = range->nvalues;
+ serialized->maxvalues = range->target_maxvalues;
+
+ /*
+ * And now copy also the boundary values (like the length calculation this
+ * depends on the particular data type).
+ */
+ ptr = serialized->data; /* start of the serialized data */
+
+ for (i = 0; i < nvalues; i++)
+ {
+ if (typbyval) /* simple by-value data types */
+ {
+ Datum tmp;
+
+ /*
+ * For byval types, we need to copy just the significant bytes -
+ * we can't use memcpy directly, as that assumes little-endian
+ * behavior. store_att_byval does almost what we need, but it
+ * requires a properly aligned buffer - the output buffer does not
+ * guarantee that. So we simply use a local Datum variable (which
+ * guarantees proper alignment), and then copy the value from it.
+ */
+ store_att_byval(&tmp, range->values[i], typlen);
+
+ memcpy(ptr, &tmp, typlen);
+ ptr += typlen;
+ }
+ else if (typlen > 0) /* fixed-length by-ref types */
+ {
+ memcpy(ptr, DatumGetPointer(range->values[i]), typlen);
+ ptr += typlen;
+ }
+ else if (typlen == -1) /* varlena */
+ {
+ int tmp = VARSIZE_ANY(DatumGetPointer(range->values[i]));
+
+ memcpy(ptr, DatumGetPointer(range->values[i]), tmp);
+ ptr += tmp;
+ }
+ else if (typlen == -2) /* cstring */
+ {
+ int tmp = strlen(DatumGetCString(range->values[i])) + 1;
+
+ memcpy(ptr, DatumGetCString(range->values[i]), tmp);
+ ptr += tmp;
+ }
+
+ /* make sure we haven't overflown the buffer end */
+ Assert(ptr <= ((char *) serialized + len));
+ }
+
+ /* exact size */
+ Assert(ptr == ((char *) serialized + len));
+
+ return serialized;
+}
+
+/*
+ * range_deserialize
+ * Deserialize the compact varlena value into the in-memory representation.
+ *
+ * Copies the header fields and unpacks the boundary values into the Datum
+ * array; by-reference values are copied into a single palloc'd chunk.
+ */
+static Ranges *
+range_deserialize(int maxvalues, SerializedRanges *serialized)
+{
+ int i,
+ nvalues;
+ char *ptr,
+ *dataptr;
+ bool typbyval;
+ int typlen;
+ Size datalen;
+
+ Ranges *range;
+
+ Assert(serialized->nranges >= 0);
+ Assert(serialized->nvalues >= 0);
+ Assert(serialized->maxvalues > 0);
+
+ nvalues = 2 * serialized->nranges + serialized->nvalues;
+
+ Assert(nvalues <= serialized->maxvalues);
+ Assert(serialized->maxvalues <= maxvalues);
+
+ range = minmax_multi_init(maxvalues);
+
+ /* copy the header info */
+ range->nranges = serialized->nranges;
+ range->nvalues = serialized->nvalues;
+ range->nsorted = serialized->nvalues;
+ range->maxvalues = maxvalues;
+ range->target_maxvalues = serialized->maxvalues;
+
+ range->typid = serialized->typid;
+
+ typbyval = get_typbyval(serialized->typid);
+ typlen = get_typlen(serialized->typid);
+
+ /*
+ * And now deconstruct the values into Datum array. We have to copy the
+ * data because the serialized representation ignores alignment, and we
+ * don't want to rely on it being kept around anyway.
+ */
+ ptr = serialized->data;
+
+ /*
+ * We don't want to allocate many pieces, so we just allocate everything
+ * in one chunk. How much space will we need?
+ *
+ * XXX We don't need to copy simple by-value data types.
+ */
+ datalen = 0;
+ dataptr = NULL;
+ for (i = 0; (i < nvalues) && (!typbyval); i++)
+ {
+ if (typlen > 0) /* fixed-length by-ref types */
+ datalen += MAXALIGN(typlen);
+ else if (typlen == -1) /* varlena */
+ {
+ datalen += MAXALIGN(VARSIZE_ANY(ptr));
+ ptr += VARSIZE_ANY(ptr);
+ }
+ else if (typlen == -2) /* cstring */
+ {
+ Size slen = strlen(ptr) + 1;
+
+ datalen += MAXALIGN(slen);
+ ptr += slen;
+ }
+ }
+
+ if (datalen > 0)
+ dataptr = palloc(datalen);
+
+ /*
+ * Restore the source pointer (might have been modified when calculating
+ * the space we need to allocate).
+ */
+ ptr = serialized->data;
+
+ for (i = 0; i < nvalues; i++)
+ {
+ if (typbyval) /* simple by-value data types */
+ {
+ Datum v = 0;
+
+ memcpy(&v, ptr, typlen);
+
+ range->values[i] = fetch_att(&v, true, typlen);
+ ptr += typlen;
+ }
+ else if (typlen > 0) /* fixed-length by-ref types */
+ {
+ range->values[i] = PointerGetDatum(dataptr);
+
+ memcpy(dataptr, ptr, typlen);
+ dataptr += MAXALIGN(typlen);
+
+ ptr += typlen;
+ }
+ else if (typlen == -1) /* varlena */
+ {
+ range->values[i] = PointerGetDatum(dataptr);
+
+ memcpy(dataptr, ptr, VARSIZE_ANY(ptr));
+ dataptr += MAXALIGN(VARSIZE_ANY(ptr));
+ ptr += VARSIZE_ANY(ptr);
+ }
+ else if (typlen == -2) /* cstring */
+ {
+ Size slen = strlen(ptr) + 1;
+
+ range->values[i] = PointerGetDatum(dataptr);
+
+ memcpy(dataptr, ptr, slen);
+ dataptr += MAXALIGN(slen);
+ ptr += slen;
+ }
+
+ /* make sure we haven't overflown the buffer end */
+ Assert(ptr <= ((char *) serialized + VARSIZE_ANY(serialized)));
+ }
+
+ /* should have consumed the whole input value exactly */
+ Assert(ptr == ((char *) serialized + VARSIZE_ANY(serialized)));
+
+ /* return the deserialized value */
+ return range;
+}
+
+/*
+ * compare_expanded_ranges
+ * Compare the expanded ranges - first by minimum, then by maximum.
+ *
+ * Returns -1, 0 or 1; arg points to a compare_context. Ranges within one
+ * Ranges object do not overlap, so ordering by minimum alone would do there,
+ * but when merging two Ranges (which happens in the union function), the
+ * ranges may in fact overlap. So we do compare both.
+ */
+static int
+compare_expanded_ranges(const void *a, const void *b, void *arg)
+{
+ ExpandedRange *ra = (ExpandedRange *) a;
+ ExpandedRange *rb = (ExpandedRange *) b;
+ Datum r;
+
+ compare_context *cxt = (compare_context *) arg;
+
+ /* first compare minvals */
+ r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, ra->minval, rb->minval);
+
+ if (DatumGetBool(r))
+ return -1;
+
+ r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, rb->minval, ra->minval);
+
+ if (DatumGetBool(r))
+ return 1;
+
+ /* then compare maxvals */
+ r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, ra->maxval, rb->maxval);
+
+ if (DatumGetBool(r))
+ return -1;
+
+ r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, rb->maxval, ra->maxval);
+
+ if (DatumGetBool(r))
+ return 1;
+
+ return 0; /* neither boundary differs under cmpFn */
+}
+
+/*
+ * compare_values
+ * Compare two Datums with the operator in arg; returns -1, 0 or 1.
+ */
+static int
+compare_values(const void *a, const void *b, void *arg)
+{
+ Datum *da = (Datum *) a;
+ Datum *db = (Datum *) b;
+ Datum r;
+
+ compare_context *cxt = (compare_context *) arg;
+
+ r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, *da, *db); /* a before b? */
+
+ if (DatumGetBool(r))
+ return -1;
+
+ r = FunctionCall2Coll(cxt->cmpFn, cxt->colloid, *db, *da); /* b before a? */
+
+ if (DatumGetBool(r))
+ return 1;
+
+ return 0; /* neither orders before the other */
+}
+
+/*
+ * has_matching_range
+ * Check if the new value falls into one of the existing regular ranges.
+ *
+ * Only the 2*nranges range boundaries at the start of the values array are
+ * inspected, not the single-point values. The overall minimum and maximum
+ * are checked first, then a binary search is done on the individual ranges.
+ * Returns false when there are no ranges at all.
+ */
+static bool
+has_matching_range(BrinDesc *bdesc, Oid colloid, Ranges *ranges,
+ Datum newval, AttrNumber attno, Oid typid)
+{
+ Datum compar;
+ Datum minvalue;
+ Datum maxvalue;
+ FmgrInfo *cmpLessFn;
+ FmgrInfo *cmpGreaterFn;
+
+ /* binary search on ranges */
+ int start,
+ end;
+
+ if (ranges->nranges == 0)
+ return false;
+
+ /* safe only now - with nranges == 0 the maximum would be values[-1] */
+ minvalue = ranges->values[0];
+ maxvalue = ranges->values[2 * ranges->nranges - 1];
+
+ /* Is it less than the absolute minimum (first value in the array)? */
+ cmpLessFn = minmax_multi_get_strategy_procinfo(bdesc, attno, typid,
+ BTLessStrategyNumber);
+ compar = FunctionCall2Coll(cmpLessFn, colloid, newval, minvalue);
+
+ /* smaller than the smallest value in the range list */
+ if (DatumGetBool(compar))
+ return false;
+
+ /* And now compare it to the absolute maximum (last range boundary). */
+ cmpGreaterFn = minmax_multi_get_strategy_procinfo(bdesc, attno, typid,
+ BTGreaterStrategyNumber);
+ compar = FunctionCall2Coll(cmpGreaterFn, colloid, newval, maxvalue);
+
+ /* larger than the largest value in the range list */
+ if (DatumGetBool(compar))
+ return false;
+
+ /*
+ * So we know it's within the overall min/max; the question is whether it
+ * falls in one of the ranges or in a gap between them. Binary search on
+ * the individual ranges: a value that is neither below the lower bound
+ * nor above the upper bound of a range is contained in it.
+ */
+ start = 0; /* first range */
+ end = (ranges->nranges - 1); /* last range */
+ while (true)
+ {
+ int midpoint = (start + end) / 2;
+
+ /* this means we ran out of ranges in the last step */
+ if (start > end)
+ return false;
+
+ /* copy the min/max values from the ranges */
+ minvalue = ranges->values[2 * midpoint];
+ maxvalue = ranges->values[2 * midpoint + 1];
+
+ /*
+ * Is the value smaller than the minval? If yes, continue with the
+ * left half of the range array.
+ */
+ compar = FunctionCall2Coll(cmpLessFn, colloid, newval, minvalue);
+
+ /* smaller than the smallest value in this range */
+ if (DatumGetBool(compar))
+ {
+ end = (midpoint - 1);
+ continue;
+ }
+
+ /*
+ * Is the value greater than the maxval? If yes, continue with the
+ * right half of the range array.
+ */
+ compar = FunctionCall2Coll(cmpGreaterFn, colloid, newval, maxvalue);
+
+ /* larger than the largest value in this range */
+ if (DatumGetBool(compar))
+ {
+ start = (midpoint + 1);
+ continue;
+ }
+
+ /* hey, we found a matching range */
+ return true;
+ }
+
+ return false;
+}
+
+
+/*
+ * range_contains_value
+ * See if the new value is already contained in the range list.
+ *
+ * We first inspect the list of intervals. We use a small trick - we check
+ * the value against min/max of the whole range (min of the first interval,
+ * max of the last one) first, and only inspect the individual intervals if
+ * this passes.
+ *
+ * If the value matches none of the intervals, we check the exact values,
+ * using the equality operator (or a binary search for longer sorted lists).
+ *
+ * The last parameter (full) determines whether we need to search all the
+ * values, including the unsorted part. With full=false, the unsorted part
+ * is not searched, which may produce false negatives and duplicate values
+ * (in the unsorted part only), but when we're building the range that's
+ * fine - we'll deduplicate before serialization, and it can only happen
+ * if there already are unsorted values (so it was already modified).
+ *
+ * Serialized ranges don't have any unsorted values, so this can't cause
+ * false negatives during querying.
+ */
+static bool
+range_contains_value(BrinDesc *bdesc, Oid colloid,
+ AttrNumber attno, Form_pg_attribute attr,
+ Ranges *ranges, Datum newval, bool full)
+{
+ int i;
+ FmgrInfo *cmpEqualFn;
+ Oid typid = attr->atttypid;
+
+ /*
+ * First inspect the ranges, if there are any. We first check the whole
+ * range, and only when there's still a chance of getting a match we
+ * inspect the individual ranges.
+ */
+ if (has_matching_range(bdesc, colloid, ranges, newval, attno, typid))
+ return true;
+
+ cmpEqualFn = minmax_multi_get_strategy_procinfo(bdesc, attno, typid,
+ BTEqualStrategyNumber);
+
+ /*
+ * There is no matching range, so let's inspect the sorted values.
+ *
+ * We do a sequential search for small numbers of values, and binary
+ * search once we have 16 or more values. This threshold is somewhat
+ * arbitrary, as it depends on how expensive the comparison function is.
+ *
+ * XXX If we use the threshold here, maybe we should do the same thing in
+ * has_matching_range? Or maybe we should do the bin search all the time?
+ *
+ * XXX We could use the same optimization as for ranges, to check if the
+ * value is between min/max, to maybe rule out all sorted values without
+ * having to inspect all of them.
+ */
+ if (ranges->nsorted >= 16)
+ {
+ compare_context cxt;
+
+ cxt.colloid = ranges->colloid; /* cached in ranges, not the colloid argument */
+ cxt.cmpFn = ranges->cmp; /* same operator the values were sorted with */
+
+ if (bsearch_arg(&newval, &ranges->values[2 * ranges->nranges],
+ ranges->nsorted, sizeof(Datum),
+ compare_values, (void *) &cxt) != NULL)
+ return true;
+ }
+ else
+ {
+ for (i = 2 * ranges->nranges; i < 2 * ranges->nranges + ranges->nsorted; i++)
+ {
+ Datum compar;
+
+ compar = FunctionCall2Coll(cmpEqualFn, colloid, newval, ranges->values[i]);
+
+ /* found an exact match */
+ if (DatumGetBool(compar))
+ return true;
+ }
+ }
+
+ /* If not asked to inspect the unsorted part, we're done. */
+ if (!full)
+ return false;
+
+ /* Inspect the unsorted part (always a sequential scan). */
+ for (i = 2 * ranges->nranges + ranges->nsorted; i < 2 * ranges->nranges + ranges->nvalues; i++)
+ {
+ Datum compar;
+
+ compar = FunctionCall2Coll(cmpEqualFn, colloid, newval, ranges->values[i]);
+
+ /* found an exact match */
+ if (DatumGetBool(compar))
+ return true;
+ }
+
+ /* the value is not covered by this BRIN tuple */
+ return false;
+}
+
+/*
+ * fill_expanded_ranges
+ *		Copy the contents of 'ranges' into the pre-allocated 'eranges' array.
+ *
+ * The caller has to supply exactly (nranges + nvalues) elements. Regular
+ * ranges are expanded first, each taking its min/max from a pair of boundary
+ * values. The individual values follow, each as a collapsed range whose
+ * minval and maxval are the same value.
+ *
+ * Nothing is sorted here: the range part keeps its stored order, and the
+ * value part may be unsorted, so the overall order is arbitrary.
+ */
+static void
+fill_expanded_ranges(ExpandedRange *eranges, int neranges, Ranges *ranges)
+{
+	int			nfilled = 0;
+	int			first_value = 2 * ranges->nranges;
+	int			k;
+
+	/* The output array must match the input exactly. */
+	Assert(neranges == (ranges->nranges + ranges->nvalues));
+
+	/* regular ranges - two boundary values each */
+	for (k = 0; k < ranges->nranges; k++)
+	{
+		ExpandedRange *dst = &eranges[nfilled++];
+
+		dst->minval = ranges->values[2 * k];
+		dst->maxval = ranges->values[2 * k + 1];
+		dst->collapsed = false;
+
+		Assert(nfilled <= neranges);
+	}
+
+	/* single values - stored right after the range boundaries */
+	for (k = 0; k < ranges->nvalues; k++)
+	{
+		ExpandedRange *dst = &eranges[nfilled++];
+
+		dst->minval = ranges->values[first_value + k];
+		dst->maxval = ranges->values[first_value + k];
+		dst->collapsed = true;
+
+		Assert(nfilled <= neranges);
+	}
+
+	/* Every output element must have been written. */
+	Assert(nfilled == neranges);
+}
+
+/*
+ * sort_expanded_ranges
+ *		Sort the expanded ranges and squeeze out duplicates, in place.
+ *
+ * Values are appended to the summary without checking for duplicates, so
+ * the input may contain equal elements. Removing them here may already
+ * reduce the number of ranges enough to avoid computing distances etc.
+ *
+ * Returns the number of expanded ranges left at the start of 'eranges'.
+ */
+static int
+sort_expanded_ranges(FmgrInfo *cmp, Oid colloid,
+					 ExpandedRange *eranges, int neranges)
+{
+	compare_context cxt;
+	int			nkept;
+	int			cur;
+
+	Assert(neranges > 0);
+
+	/* comparator context shared by the sort and the deduplication */
+	cxt.colloid = colloid;
+	cxt.cmpFn = cmp;
+
+	/*
+	 * XXX We do qsort on all the values, but some of the input is already
+	 * sorted (all the ranges and maybe some of the points), so a merge sort
+	 * could be used instead.
+	 */
+	qsort_arg(eranges, neranges, sizeof(ExpandedRange),
+			  compare_expanded_ranges, (void *) &cxt);
+
+	/*
+	 * Walk the sorted array and keep only elements that differ from their
+	 * predecessor. The first element is always kept.
+	 */
+	nkept = 1;
+	for (cur = 1; cur < neranges; cur++)
+	{
+		/* a non-zero result means the two neighbors are not equal */
+		if (compare_expanded_ranges(&eranges[cur - 1], &eranges[cur],
+									(void *) &cxt) != 0)
+		{
+			/* move it down to the next free slot, unless already there */
+			if (cur != nkept)
+				memcpy(&eranges[nkept], &eranges[cur], sizeof(ExpandedRange));
+
+			nkept++;
+		}
+	}
+
+	Assert((nkept > 0) && (nkept <= neranges));
+
+	return nkept;
+}
+
+/*
+ * merge_overlapping_ranges
+ *		Merge neighboring expanded ranges that overlap, in place.
+ *
+ * When combining multiple Range values (in union function), some of the
+ * ranges may overlap. We simply merge the overlapping ranges to fix that.
+ *
+ * 'cmp' is called through FunctionCall2Coll with 'colloid' and its result is
+ * read as a boolean; the comments below treat it as the "less than"
+ * operator of the indexed data type.
+ *
+ * Returns the number of ranges remaining in 'eranges' after the merge.
+ *
+ * XXX This assumes the expanded ranges were previously sorted (by minval
+ * and then maxval). We leverage this when detecting overlap.
+ */
+static int
+merge_overlapping_ranges(FmgrInfo *cmp, Oid colloid,
+ ExpandedRange *eranges, int neranges)
+{
+ int idx;
+
+ /* Merge ranges (idx) and (idx+1) if they overlap. */
+ idx = 0;
+ while (idx < (neranges - 1))
+ {
+ Datum r;
+
+ /*
+ * Compare maxval of range (idx) with minval of range (idx+1). The
+ * two ranges are disjoint only if (maxval < minval).
+ */
+ r = FunctionCall2Coll(cmp, colloid,
+ eranges[idx].maxval,
+ eranges[idx + 1].minval);
+
+ /*
+ * Nope, maxval < minval, so no overlap. And we know the ranges are
+ * ordered, so range (idx) can't overlap any later range either,
+ * because all the remaining ranges have greater or equal minval.
+ */
+ if (DatumGetBool(r))
+ {
+ /* proceed to the next range */
+ idx += 1;
+ continue;
+ }
+
+ /*
+ * So ranges 'idx' and 'idx+1' do overlap, but we don't know if
+ * 'idx+1' is contained in 'idx', or if they overlap only partially.
+ * So compare the upper bounds and keep the larger one.
+ */
+ r = FunctionCall2Coll(cmp, colloid,
+ eranges[idx].maxval,
+ eranges[idx + 1].maxval);
+
+ if (DatumGetBool(r))
+ eranges[idx].maxval = eranges[idx + 1].maxval;
+
+ /*
+ * The merged range is marked as not collapsed, irrespectively of the
+ * previous state.
+ */
+ eranges[idx].collapsed = false;
+
+ /*
+ * Now get rid of the (idx+1) range entirely by shifting the remaining
+ * ranges by 1. There are neranges elements, and we need to move
+ * elements from (idx+2). That means the number of elements to move is
+ * [neranges - (idx+2)].
+ */
+ memmove(&eranges[idx + 1], &eranges[idx + 2],
+ (neranges - (idx + 2)) * sizeof(ExpandedRange));
+
+ /*
+ * Decrease the number of ranges, and repeat (with the same range, as
+ * it might overlap with additional ranges thanks to the merge).
+ */
+ neranges--;
+ }
+
+ return neranges;
+}
+
+/*
+ * compare_distances
+ *		qsort comparator ordering DistanceValue entries by their double value.
+ *
+ * The order is intentionally descending, so that after sorting the longest
+ * gaps end up at the front of the array.
+ */
+static int
+compare_distances(const void *a, const void *b)
+{
+	const DistanceValue *lhs = (const DistanceValue *) a;
+	const DistanceValue *rhs = (const DistanceValue *) b;
+
+	/* larger distance sorts first */
+	if (lhs->value > rhs->value)
+		return -1;
+
+	if (lhs->value < rhs->value)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * build_distances
+ *		Compute the gaps between consecutive expanded ranges.
+ *
+ * Given an array of expanded ranges, compute size of the gaps between each
+ * range. For neranges there are (neranges-1) gaps.
+ *
+ * We simply call the "distance" function to compute the (max-min) for pairs
+ * of consecutive ranges. The function may be fairly expensive, so we do that
+ * just once (and then use it to pick as many ranges to merge as possible).
+ *
+ * Returns a palloc'd array of (neranges-1) DistanceValue entries, sorted in
+ * descending order of distance. Each entry remembers the index of the gap
+ * it was computed for. When there is only a single range there are no gaps
+ * at all, and NULL is returned; reduce_expanded_ranges returns before
+ * looking at the distances in that case (as long as max_values >= 2).
+ *
+ * See reduce_expanded_ranges for details.
+ */
+static DistanceValue *
+build_distances(FmgrInfo *distanceFn, Oid colloid,
+				ExpandedRange *eranges, int neranges)
+{
+	int			i;
+	int			ndistances;
+	DistanceValue *distances;
+
+	Assert(neranges > 0);
+
+	/*
+	 * Deduplication may leave just a single range (e.g. when all the values
+	 * are equal), in which case there's no distance to calculate.
+	 */
+	if (neranges == 1)
+		return NULL;
+
+	ndistances = (neranges - 1);
+	distances = (DistanceValue *) palloc0(sizeof(DistanceValue) * ndistances);
+
+	/*
+	 * Walk through the ranges once and compute the distance between the
+	 * ranges so that we can sort them once.
+	 */
+	for (i = 0; i < ndistances; i++)
+	{
+		Datum		a1,
+					a2,
+					r;
+
+		a1 = eranges[i].maxval;
+		a2 = eranges[i + 1].minval;
+
+		/* compute length of the gap (between max/min) */
+		r = FunctionCall2Coll(distanceFn, colloid, a1, a2);
+
+		/* remember the index of the gap the distance is for */
+		distances[i].index = i;
+		distances[i].value = DatumGetFloat8(r);
+	}
+
+	/*
+	 * Sort the distances in descending order, so that the longest gaps are at
+	 * the front.
+	 */
+	pg_qsort(distances, ndistances, sizeof(DistanceValue), compare_distances);
+
+	return distances;
+}
+
+/*
+ * build_expanded_ranges
+ *		Turn 'ranges' into a sorted, duplicate-free array of expanded ranges.
+ *
+ * Every regular range and every single value becomes one ExpandedRange,
+ * which makes the processing easier because ranges and points can be
+ * handled the same way.
+ *
+ * The expanded ranges are sorted and deduplicated - this is necessary,
+ * because the points may be unsorted, and the two parts (ranges and points)
+ * are only sorted on their own.
+ *
+ * The array is allocated with palloc0, sized for (nranges + nvalues)
+ * elements. The number of elements left after deduplication is stored in
+ * *nranges.
+ */
+static ExpandedRange *
+build_expanded_ranges(FmgrInfo *cmp, Oid colloid, Ranges *ranges,
+					  int *nranges)
+{
+	int			ntotal = ranges->nranges + ranges->nvalues;
+	ExpandedRange *result;
+
+	/* one element per range and per point */
+	result = (ExpandedRange *) palloc0(ntotal * sizeof(ExpandedRange));
+
+	fill_expanded_ranges(result, ntotal, ranges);
+
+	/* sorting also deduplicates, so the count may shrink */
+	*nranges = sort_expanded_ranges(cmp, colloid, result, ntotal);
+
+	return result;
+}
+
+#ifdef USE_ASSERT_CHECKING
+/*
+ * count_values
+ *		Number of boundary values needed to store the given expanded ranges.
+ *
+ * A collapsed (single-point) range takes one value, a regular range two.
+ * Only compiled in assert-enabled builds.
+ */
+static int
+count_values(ExpandedRange *cranges, int ncranges)
+{
+	int			total = 0;
+	int			pos;
+
+	for (pos = 0; pos < ncranges; pos++)
+		total += (cranges[pos].collapsed ? 1 : 2);
+
+	return total;
+}
+#endif
+
+/*
+ * reduce_expanded_ranges
+ * reduce the ranges until the number of values is low enough
+ *
+ * Combines ranges until the number of boundary values fits into the
+ * threshold specified by max_values. This happens by merging enough
+ * ranges by the distance between them.
+ *
+ * 'distances' holds the gaps between consecutive ranges; the first 'keep'
+ * entries are used, so the array is expected to be sorted with the largest
+ * gaps first (see build_distances). It is not accessed at all when the
+ * number of ranges is already low enough.
+ *
+ * The result overwrites the beginning of 'eranges'.
+ *
+ * Returns the number of result ranges.
+ *
+ * We simply use the global min/max and then add boundaries for enough
+ * largest gaps. Each gap adds 2 values, so we simply use (target/2-1)
+ * distances. Then we simply sort all the values - each two values are
+ * a boundary of a range (possibly collapsed).
+ *
+ * XXX Some of the ranges may be collapsed (i.e. the min/max values are
+ * equal), but we ignore that for now. We could repeat the process,
+ * adding a couple more gaps recursively.
+ *
+ * XXX The ranges to merge are selected solely using the distance. But
+ * that may not be the best strategy, for example when multiple gaps
+ * are of equal (or very similar) length.
+ *
+ * Consider for example points 1, 2, 3, .., 64, which have gaps of the
+ * same length 1 of course. In that case, we tend to pick the first
+ * gap of that length, which leads to this:
+ *
+ * step 1: [1, 2], 3, 4, 5, .., 64
+ * step 2: [1, 3], 4, 5, .., 64
+ * step 3: [1, 4], 5, .., 64
+ * ...
+ *
+ * So in the end we'll have one "large" range and multiple small points.
+ * That may be fine, but it seems a bit strange and non-optimal. Maybe
+ * we should consider other things when picking ranges to merge - e.g.
+ * length of the ranges? Or perhaps randomize the choice of ranges, with
+ * probability inversely proportional to the distance (the gap lengths
+ * may be very close, but not exactly the same).
+ *
+ * XXX Or maybe we could just handle this by using random value as a
+ * tie-break, or by adding random noise to the actual distance.
+ */
+static int
+reduce_expanded_ranges(ExpandedRange *eranges, int neranges,
+ DistanceValue *distances, int max_values,
+ FmgrInfo *cmp, Oid colloid)
+{
+ int i;
+ int nvalues;
+ Datum *values;
+
+ compare_context cxt;
+
+ /* total number of gaps between ranges */
+ int ndistances = (neranges - 1);
+
+ /* number of gaps to keep (the global min/max take the other 2 values) */
+ int keep = (max_values / 2 - 1);
+
+ /*
+ * Maybe we have a sufficiently low number of ranges already? If so,
+ * leave eranges untouched.
+ *
+ * XXX This should happen before we actually do the expensive stuff like
+ * sorting, so maybe this should be just an assert.
+ */
+ if (keep >= ndistances)
+ return neranges;
+
+ /* set up the comparator context used for sorting the values below */
+ cxt.colloid = colloid;
+ cxt.cmpFn = cmp;
+
+ /* allocate space for the boundary values */
+ nvalues = 0;
+ values = (Datum *) palloc(sizeof(Datum) * max_values);
+
+ /* add the global min/max values, from the first/last range */
+ values[nvalues++] = eranges[0].minval;
+ values[nvalues++] = eranges[neranges - 1].maxval;
+
+ /* add boundary values for enough gaps */
+ for (i = 0; i < keep; i++)
+ {
+ /* index of the gap between (index) and (index+1) ranges */
+ int index = distances[i].index;
+
+ Assert((index >= 0) && ((index + 1) < neranges));
+
+ /* add max from the preceding range, minval from the next one */
+ values[nvalues++] = eranges[index].maxval;
+ values[nvalues++] = eranges[index + 1].minval;
+
+ Assert(nvalues <= max_values);
+ }
+
+ /* We should have an even number of range values. */
+ Assert(nvalues % 2 == 0);
+
+ /*
+ * Sort the values using the comparator function, and form ranges from the
+ * sorted result.
+ */
+ qsort_arg(values, nvalues, sizeof(Datum),
+ compare_values, (void *) &cxt);
+
+ /* We have nvalues boundary values, which means nvalues/2 ranges. */
+ for (i = 0; i < (nvalues / 2); i++)
+ {
+ eranges[i].minval = values[2 * i];
+ eranges[i].maxval = values[2 * i + 1];
+
+ /* if the boundary values are the same, it's a collapsed range */
+ eranges[i].collapsed = (compare_values(&values[2 * i],
+ &values[2 * i + 1],
+ &cxt) == 0);
+ }
+
+ return (nvalues / 2);
+}
+
+/*
+ * store_expanded_ranges
+ *		Write the expanded ranges back into the 'ranges' summary.
+ *
+ * Only the minimal number of values is stored: two boundary values for each
+ * regular range, followed by one value for each collapsed range. The
+ * nranges, nvalues and nsorted counters are reset accordingly; all the
+ * single values are flagged as sorted.
+ */
+static void
+store_expanded_ranges(Ranges *ranges, ExpandedRange *eranges, int neranges)
+{
+	int			nstored = 0;
+	int			k;
+
+	ranges->nranges = 0;
+	ranges->nvalues = 0;
+
+	/* first pass: regular ranges go to the front of the array */
+	for (k = 0; k < neranges; k++)
+	{
+		if (eranges[k].collapsed)
+			continue;
+
+		ranges->values[nstored++] = eranges[k].minval;
+		ranges->values[nstored++] = eranges[k].maxval;
+		ranges->nranges++;
+	}
+
+	/* second pass: collapsed ranges are stored as single values */
+	for (k = 0; k < neranges; k++)
+	{
+		if (!eranges[k].collapsed)
+			continue;
+
+		ranges->values[nstored++] = eranges[k].minval;
+		ranges->nvalues++;
+	}
+
+	/* all the values are sorted */
+	ranges->nsorted = ranges->nvalues;
+
+	Assert(count_values(eranges, neranges) == 2 * ranges->nranges + ranges->nvalues);
+	Assert(2 * ranges->nranges + ranges->nvalues <= ranges->maxvalues);
+}
+
+
+/*
+ * Consider freeing space in the ranges. Checks if there's space for at least
+ * one new value, and performs compaction if needed: first by deduplicating
+ * the values, and if that does not free enough space, by merging the
+ * closest ranges.
+ *
+ * Returns true if the range summary was actually modified, false if there
+ * was free space already and nothing had to be done.
+ */
+static bool
+ensure_free_space_in_buffer(BrinDesc *bdesc, Oid colloid,
+ AttrNumber attno, Form_pg_attribute attr,
+ Ranges *range)
+{
+ MemoryContext ctx;
+ MemoryContext oldctx;
+
+ FmgrInfo *cmpFn,
+ *distanceFn;
+
+ /* expanded ranges */
+ ExpandedRange *eranges;
+ int neranges;
+ DistanceValue *distances;
+
+ /*
+ * If there is free space in the buffer, we're done without having to
+ * modify anything.
+ */
+ if (2 * range->nranges + range->nvalues < range->maxvalues)
+ return false;
+
+ /* we'll certainly need the comparator, so just look it up now */
+ cmpFn = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid,
+ BTLessStrategyNumber);
+
+ /* deduplicate values, if there's an unsorted part */
+ range_deduplicate_values(range);
+
+ /*
+ * Did we free enough space by just the deduplication?
+ *
+ * We don't simply check against range->maxvalues again. The deduplication
+ * might have freed very little space (e.g. just one value), forcing us to
+ * do deduplication very often. In that case, it's better to do the
+ * compaction and free more space.
+ */
+ if (2 * range->nranges + range->nvalues <= range->maxvalues * MINMAX_BUFFER_LOAD_FACTOR)
+ return true;
+
+ /*
+ * We need to combine some of the existing ranges, to reduce the number of
+ * values we have to store.
+ *
+ * The distanceFn calls (which may internally call e.g. numeric_le) may
+ * allocate quite a bit of memory, and we must not leak it (we might have
+ * to do this repeatedly, even for a single BRIN page range). Otherwise
+ * we'd have problems e.g. when building new indexes. So we use a memory
+ * context and make sure we free the memory at the end (so if we call the
+ * distance function many times, it might be an issue, but meh).
+ */
+ ctx = AllocSetContextCreate(CurrentMemoryContext,
+ "minmax-multi context",
+ ALLOCSET_DEFAULT_SIZES);
+
+ oldctx = MemoryContextSwitchTo(ctx);
+
+ /* build the expanded ranges */
+ eranges = build_expanded_ranges(cmpFn, colloid, range, &neranges);
+
+ /* and we'll also need the 'distance' procedure */
+ distanceFn = minmax_multi_get_procinfo(bdesc, attno, PROCNUM_DISTANCE);
+
+ /* build array of gap distances, sorted in descending order */
+ distances = build_distances(distanceFn, colloid, eranges, neranges);
+
+ /*
+ * Combine ranges until the number of values fits into the fraction of
+ * the buffer given by MINMAX_BUFFER_LOAD_FACTOR. This threshold is
+ * somewhat arbitrary, perhaps needs tuning. We must not use too low or
+ * high value.
+ */
+ neranges = reduce_expanded_ranges(eranges, neranges, distances,
+ range->maxvalues * MINMAX_BUFFER_LOAD_FACTOR,
+ cmpFn, colloid);
+
+ /* Make sure we've sufficiently reduced the number of ranges. */
+ Assert(count_values(eranges, neranges) <= range->maxvalues * MINMAX_BUFFER_LOAD_FACTOR);
+
+ /* decompose the expanded ranges into regular ranges and single values */
+ store_expanded_ranges(range, eranges, neranges);
+
+ MemoryContextSwitchTo(oldctx);
+ MemoryContextDelete(ctx);
+
+ /* Did we break the ranges somehow? */
+ AssertCheckRanges(range, cmpFn, colloid);
+
+ return true;
+}
+
+/*
+ * range_add_value
+ * Add the new value to the minmax-multi range.
+ *
+ * Returns true if the range was modified, either by making room in the
+ * buffer or by adding the value; false if nothing changed.
+ */
+static bool
+range_add_value(BrinDesc *bdesc, Oid colloid,
+ AttrNumber attno, Form_pg_attribute attr,
+ Ranges *ranges, Datum newval)
+{
+ FmgrInfo *cmpFn;
+ bool modified = false;
+
+ /* we'll certainly need the comparator, so just look it up now */
+ cmpFn = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid,
+ BTLessStrategyNumber);
+
+ /* comprehensive checks of the input ranges */
+ AssertCheckRanges(ranges, cmpFn, colloid);
+
+ /*
+ * Make sure there's enough free space in the buffer. We only trigger this
+ * when the buffer is full, which means it had to be modified as we size
+ * it to be larger than what is stored on disk.
+ *
+ * This needs to happen before we check if the value is contained in the
+ * range, because the value might be in the unsorted part, and we don't
+ * check that in range_contains_value. The deduplication would then move
+ * it to the sorted part, and we'd add the value too, which violates the
+ * rule that we never have duplicates with the ranges or sorted values.
+ *
+ * We might also deduplicate and recheck if the value is contained, but
+ * that seems like overkill. We'd need to deduplicate anyway, so why not
+ * do it now.
+ */
+ modified = ensure_free_space_in_buffer(bdesc, colloid,
+ attno, attr, ranges);
+
+ /*
+ * Bail out if the value already is covered by the range.
+ *
+ * We could also add values until we hit values_per_range, and then do the
+ * deduplication in a batch, hoping for better efficiency. But that would
+ * mean we actually modify the range every time, which means having to
+ * serialize the value, which does palloc, walks the values, copies them,
+ * etc. Not exactly cheap.
+ *
+ * So instead we do the check, which should be fairly cheap - assuming the
+ * comparator function is not very expensive.
+ *
+ * This also implies the ranges and the sorted part of the values array
+ * can't contain duplicates of the new value.
+ */
+ if (range_contains_value(bdesc, colloid, attno, attr, ranges, newval, false))
+ return modified;
+
+ /* Make a copy of the value, if needed. */
+ newval = datumCopy(newval, attr->attbyval, attr->attlen);
+
+ /*
+ * Append the value at the end of the values array. The call to
+ * ensure_free_space_in_buffer above is what makes room for it.
+ *
+ * The value is not moved into its sorted position here, so (unless it
+ * is the very first value) it lands in the unsorted part of the array,
+ * i.e. nsorted is left unchanged.
+ */
+ ranges->values[2 * ranges->nranges + ranges->nvalues] = newval;
+ ranges->nvalues++;
+
+ /* If we added the first value, we can consider it as sorted. */
+ if (ranges->nvalues == 1)
+ ranges->nsorted = 1;
+
+ /*
+ * Check we haven't broken the ordering of boundary values (checks both
+ * parts, but that doesn't hurt).
+ */
+ AssertCheckRanges(ranges, cmpFn, colloid);
+
+ /* Check the range contains the value we just added. */
+ Assert(range_contains_value(bdesc, colloid, attno, attr, ranges, newval, true));
+
+ /* yep, we've modified the range */
+ return true;
+}
+
+/*
+ * Generate range representation of data collected during "batch mode".
+ * This is similar to reduce_expanded_ranges, except that we can't assume
+ * the values are sorted and there may be duplicate values.
+ *
+ * On return 'ranges' needs at most max_values boundary values and all its
+ * single values are sorted. Nothing is done if that already is the case.
+ */
+static void
+compactify_ranges(BrinDesc *bdesc, Ranges *ranges, int max_values)
+{
+ FmgrInfo *cmpFn,
+ *distanceFn;
+
+ /* expanded ranges */
+ ExpandedRange *eranges;
+ int neranges;
+ DistanceValue *distances;
+
+ MemoryContext ctx;
+ MemoryContext oldctx;
+
+ /*
+ * Do we need to actually compactify anything?
+ *
+ * There are two reasons why compaction may be needed - firstly, there may
+ * be too many values, or some of the values may be unsorted.
+ */
+ if ((ranges->nranges * 2 + ranges->nvalues <= max_values) &&
+ (ranges->nsorted == ranges->nvalues))
+ return;
+
+ /* we'll certainly need the comparator, so just look it up now */
+ cmpFn = minmax_multi_get_strategy_procinfo(bdesc, ranges->attno, ranges->typid,
+ BTLessStrategyNumber);
+
+ /* and we'll also need the 'distance' procedure */
+ distanceFn = minmax_multi_get_procinfo(bdesc, ranges->attno, PROCNUM_DISTANCE);
+
+ /*
+ * The distanceFn calls (which may internally call e.g. numeric_le) may
+ * allocate quite a bit of memory, and we must not leak it. Otherwise,
+ * we'd have problems e.g. when building indexes. So we create a local
+ * memory context and make sure we free the memory before leaving this
+ * function (not after every call).
+ */
+ ctx = AllocSetContextCreate(CurrentMemoryContext,
+ "minmax-multi context",
+ ALLOCSET_DEFAULT_SIZES);
+
+ oldctx = MemoryContextSwitchTo(ctx);
+
+ /* build the expanded ranges */
+ eranges = build_expanded_ranges(cmpFn, ranges->colloid, ranges, &neranges);
+
+ /* build array of gap distances, sorted in descending order */
+ distances = build_distances(distanceFn, ranges->colloid,
+ eranges, neranges);
+
+ /*
+ * Combine ranges until we get below max_values. We don't use any scale
+ * factor, because this is used during serialization, and we don't expect
+ * more tuples to be inserted anytime soon.
+ */
+ neranges = reduce_expanded_ranges(eranges, neranges, distances,
+ max_values, cmpFn, ranges->colloid);
+
+ Assert(count_values(eranges, neranges) <= max_values);
+
+ /* transform back into regular ranges and single values */
+ store_expanded_ranges(ranges, eranges, neranges);
+
+ /* check all the range invariants */
+ AssertCheckRanges(ranges, cmpFn, ranges->colloid);
+
+ MemoryContextSwitchTo(oldctx);
+ MemoryContextDelete(ctx);
+}
+
+/*
+ * brin_minmax_multi_opcinfo
+ *		Build the BrinOpcInfo for the minmax-multi opclass.
+ *
+ * The opclass stores a single summary value per column. The opaque struct
+ * is allocated in the same chunk, right after the (max-aligned) opcinfo.
+ */
+Datum
+brin_minmax_multi_opcinfo(PG_FUNCTION_ARGS)
+{
+	BrinOpcInfo *opcinfo;
+	Size		total_size;
+
+	total_size = MAXALIGN(SizeofBrinOpcInfo(1)) + sizeof(MinmaxMultiOpaque);
+
+	/*
+	 * opaque->strategy_procinfos is initialized lazily; here it is set to
+	 * all-uninitialized by palloc0 which sets fn_oid to InvalidOid.
+	 */
+	opcinfo = palloc0(total_size);
+
+	opcinfo->oi_nstored = 1;
+	opcinfo->oi_regular_nulls = true;
+	opcinfo->oi_typcache[0] = lookup_type_cache(PG_BRIN_MINMAX_MULTI_SUMMARYOID, 0);
+
+	/* the opaque part follows the opcinfo header */
+	opcinfo->oi_opaque = (MinmaxMultiOpaque *)
+		MAXALIGN((char *) opcinfo + SizeofBrinOpcInfo(1));
+
+	PG_RETURN_POINTER(opcinfo);
+}
+
+/*
+ * brin_minmax_multi_distance_float4
+ *		Distance between two float4 values, computed as (a2 - a1) in double
+ *		precision.
+ *
+ * Two NaN values are considered equal (distance 0), while a NaN and a
+ * non-NaN value are considered infinitely far apart.
+ */
+Datum
+brin_minmax_multi_distance_float4(PG_FUNCTION_ARGS)
+{
+	float		lower = PG_GETARG_FLOAT4(0);
+	float		upper = PG_GETARG_FLOAT4(1);
+
+	if (isnan(lower) || isnan(upper))
+	{
+		/* both NaN means the same value, otherwise infinite distance */
+		if (isnan(lower) && isnan(upper))
+			PG_RETURN_FLOAT8(0.0);
+
+		PG_RETURN_FLOAT8(get_float8_infinity());
+	}
+
+	/*
+	 * The values are range boundaries, so they have to be ordered. They may
+	 * be equal though, for collapsed ranges (single points).
+	 */
+	Assert(lower <= upper);
+
+	PG_RETURN_FLOAT8((double) upper - (double) lower);
+}
+
+/*
+ * brin_minmax_multi_distance_float8
+ *		Distance between two float8 values, computed as (a2 - a1).
+ *
+ * Two NaN values are considered equal (distance 0), while a NaN and a
+ * non-NaN value are considered infinitely far apart.
+ */
+Datum
+brin_minmax_multi_distance_float8(PG_FUNCTION_ARGS)
+{
+	double		lower = PG_GETARG_FLOAT8(0);
+	double		upper = PG_GETARG_FLOAT8(1);
+
+	if (isnan(lower) || isnan(upper))
+	{
+		/* both NaN means the same value, otherwise infinite distance */
+		if (isnan(lower) && isnan(upper))
+			PG_RETURN_FLOAT8(0.0);
+
+		PG_RETURN_FLOAT8(get_float8_infinity());
+	}
+
+	/*
+	 * The values are range boundaries, so they have to be ordered. They may
+	 * be equal though, for collapsed ranges (single points).
+	 */
+	Assert(lower <= upper);
+
+	PG_RETURN_FLOAT8(upper - lower);
+}
+
+/*
+ * brin_minmax_multi_distance_int2
+ *		Distance between two int2 values, computed as (a2 - a1) in double
+ *		precision.
+ */
+Datum
+brin_minmax_multi_distance_int2(PG_FUNCTION_ARGS)
+{
+	int16		lower = PG_GETARG_INT16(0);
+	int16		upper = PG_GETARG_INT16(1);
+	double		distance;
+
+	/*
+	 * The values are range boundaries, so they have to be ordered. They may
+	 * be equal though, for collapsed ranges (single points).
+	 */
+	Assert(lower <= upper);
+
+	distance = (double) upper - (double) lower;
+
+	PG_RETURN_FLOAT8(distance);
+}
+
+/*
+ * brin_minmax_multi_distance_int4
+ *		Distance between two int4 values, computed as (a2 - a1) in double
+ *		precision.
+ */
+Datum
+brin_minmax_multi_distance_int4(PG_FUNCTION_ARGS)
+{
+	int32		lower = PG_GETARG_INT32(0);
+	int32		upper = PG_GETARG_INT32(1);
+	double		distance;
+
+	/*
+	 * The values are range boundaries, so they have to be ordered. They may
+	 * be equal though, for collapsed ranges (single points).
+	 */
+	Assert(lower <= upper);
+
+	distance = (double) upper - (double) lower;
+
+	PG_RETURN_FLOAT8(distance);
+}
+
+/*
+ * brin_minmax_multi_distance_int8
+ *		Distance between two int8 values, computed as (a2 - a1) in double
+ *		precision.
+ */
+Datum
+brin_minmax_multi_distance_int8(PG_FUNCTION_ARGS)
+{
+	int64		lower = PG_GETARG_INT64(0);
+	int64		upper = PG_GETARG_INT64(1);
+	double		distance;
+
+	/*
+	 * The values are range boundaries, so they have to be ordered. They may
+	 * be equal though, for collapsed ranges (single points).
+	 */
+	Assert(lower <= upper);
+
+	distance = (double) upper - (double) lower;
+
+	PG_RETURN_FLOAT8(distance);
+}
+
+/*
+ * brin_minmax_multi_distance_tid
+ *		Distance between two tid values.
+ *
+ * Each tid is mapped to a float8 as (block * MaxHeapTuplesPerPage + offset)
+ * and the distance is the difference of the two mapped values (a2 - a1).
+ */
+Datum
+brin_minmax_multi_distance_tid(PG_FUNCTION_ARGS)
+{
+	double		da1,
+				da2;
+
+	ItemPointer pa1 = (ItemPointer) PG_GETARG_DATUM(0);
+	ItemPointer pa2 = (ItemPointer) PG_GETARG_DATUM(1);
+
+	/*
+	 * We know the values are range boundaries, but the range may be collapsed
+	 * (i.e. single points), with equal values.
+	 */
+	Assert(ItemPointerCompare(pa1, pa2) <= 0);
+
+	/*
+	 * We use the no-check variants here, because user-supplied values may
+	 * have (ip_posid == 0). See ItemPointerCompare.
+	 *
+	 * The block number is converted to double before the multiplication.
+	 * Block numbers are 32-bit unsigned integers, so an integer product
+	 * could wrap around for large block numbers and yield a bogus (possibly
+	 * negative) distance.
+	 */
+	da1 = (double) ItemPointerGetBlockNumberNoCheck(pa1) * MaxHeapTuplesPerPage +
+		ItemPointerGetOffsetNumberNoCheck(pa1);
+
+	da2 = (double) ItemPointerGetBlockNumberNoCheck(pa2) * MaxHeapTuplesPerPage +
+		ItemPointerGetOffsetNumberNoCheck(pa2);
+
+	PG_RETURN_FLOAT8(da2 - da1);
+}
+
+/*
+ * brin_minmax_multi_distance_numeric
+ *		Distance between two numeric values, computed as (a2 - a1) and
+ *		converted to float8.
+ */
+Datum
+brin_minmax_multi_distance_numeric(PG_FUNCTION_ARGS)
+{
+	Datum		d;
+	Datum		a1 = PG_GETARG_DATUM(0);
+	Datum		a2 = PG_GETARG_DATUM(1);
+
+	/*
+	 * We know the values are range boundaries, but the range may be collapsed
+	 * (i.e. single points), with equal values.
+	 */
+	Assert(DatumGetBool(DirectFunctionCall2(numeric_le, a1, a2)));
+
+	d = DirectFunctionCall2(numeric_sub, a2, a1);	/* a2 - a1 */
+
+	/*
+	 * numeric_float8 already returns a float8 Datum, so pass it through
+	 * as-is. Using PG_RETURN_FLOAT8 here would treat the Datum itself as a
+	 * number and convert it again, producing a wrong distance.
+	 */
+	PG_RETURN_DATUM(DirectFunctionCall1(numeric_float8, d));
+}
+
+/*
+ * brin_minmax_multi_distance_uuid
+ *		Approximate distance between two UUID values.
+ *
+ * XXX An exact delta would need 128-bit integers. We don't need a perfectly
+ * accurate value though, so the delta is accumulated in a 64-bit float,
+ * starting from the last byte and dividing by 256 after each byte. The
+ * small inaccuracies do not matter in practice, in the worst case we'll
+ * decide to merge ranges that are not the closest ones.
+ */
+Datum
+brin_minmax_multi_distance_uuid(PG_FUNCTION_ARGS)
+{
+	Datum		a1 = PG_GETARG_DATUM(0);
+	Datum		a2 = PG_GETARG_DATUM(1);
+
+	pg_uuid_t  *lower = DatumGetUUIDP(a1);
+	pg_uuid_t  *upper = DatumGetUUIDP(a2);
+
+	float8		distance = 0;
+	int			pos = UUID_LEN;
+
+	/*
+	 * The values are range boundaries, so they have to be ordered. They may
+	 * be equal though, for collapsed ranges (single points).
+	 */
+	Assert(DatumGetBool(DirectFunctionCall2(uuid_le, a1, a2)));
+
+	/* fold the per-byte differences, last byte first */
+	while (pos-- > 0)
+	{
+		distance += (int) upper->data[pos] - (int) lower->data[pos];
+		distance /= 256;
+	}
+
+	Assert(distance >= 0);
+
+	PG_RETURN_FLOAT8(distance);
+}
+
+/*
+ * brin_minmax_multi_distance_date
+ *		Distance between two dates, in days, computed as (a2 - a1).
+ *
+ * The first argument is the lower boundary, like in all the other distance
+ * functions, so the result must not be negative.
+ *
+ * Infinite dates get no special treatment. They are the extreme DateADT
+ * values, so the plain subtraction puts them very far from any finite date.
+ * That keeps them from being merged with regular values, which would
+ * produce very wide ranges. The subtraction is done in float8, so it can't
+ * overflow.
+ */
+Datum
+brin_minmax_multi_distance_date(PG_FUNCTION_ARGS)
+{
+	float8		delta = 0;
+
+	DateADT		dateVal1 = PG_GETARG_DATEADT(0);
+	DateADT		dateVal2 = PG_GETARG_DATEADT(1);
+
+	delta = (float8) dateVal2 - (float8) dateVal1;
+
+	Assert(delta >= 0);
+
+	PG_RETURN_FLOAT8(delta);
+}
+
+/*
+ * brin_minmax_multi_distance_time
+ *		Approximate distance between two time (without tz) values.
+ *
+ * TimeADT is just an int64, so the values are subtracted directly and the
+ * difference is converted to float8.
+ */
+Datum
+brin_minmax_multi_distance_time(PG_FUNCTION_ARGS)
+{
+	TimeADT		lower = PG_GETARG_TIMEADT(0);
+	TimeADT		upper = PG_GETARG_TIMEADT(1);
+	float8		distance;
+
+	distance = (upper - lower);
+
+	Assert(distance >= 0);
+
+	PG_RETURN_FLOAT8(distance);
+}
+
+/*
+ * brin_minmax_multi_distance_timetz
+ *		Approximate distance between two timetz values.
+ *
+ * Subtracts the TimeADT (int64) values embedded in TimeTzADT, and adds the
+ * difference of the zone fields multiplied by USECS_PER_SEC.
+ */
+Datum
+brin_minmax_multi_distance_timetz(PG_FUNCTION_ARGS)
+{
+	TimeTzADT  *lower = PG_GETARG_TIMETZADT_P(0);
+	TimeTzADT  *upper = PG_GETARG_TIMETZADT_P(1);
+	float8		distance;
+
+	distance = (upper->time - lower->time) + (upper->zone - lower->zone) * USECS_PER_SEC;
+
+	Assert(distance >= 0);
+
+	PG_RETURN_FLOAT8(distance);
+}
+
+/*
+ * brin_minmax_multi_distance_timestamp
+ *		Distance between two timestamp values, computed as (a2 - a1).
+ *
+ * The subtraction is done in float8, because the difference between two
+ * int64 timestamps may not fit into int64.
+ *
+ * Infinite timestamps get no special treatment. They are the extreme
+ * Timestamp values, so the plain subtraction puts them very far from any
+ * finite timestamp. That keeps them from being merged with regular values,
+ * which would produce very wide ranges.
+ */
+Datum
+brin_minmax_multi_distance_timestamp(PG_FUNCTION_ARGS)
+{
+	float8		delta = 0;
+
+	Timestamp	dt1 = PG_GETARG_TIMESTAMP(0);
+	Timestamp	dt2 = PG_GETARG_TIMESTAMP(1);
+
+	delta = (float8) dt2 - (float8) dt1;
+
+	Assert(delta >= 0);
+
+	PG_RETURN_FLOAT8(delta);
+}
+
+/*
+ * brin_minmax_multi_distance_interval
+ *		Distance between two interval values, as a (fractional) number of
+ *		days in (ib - ia).
+ *
+ * Raises ERRCODE_DATETIME_VALUE_OUT_OF_RANGE if subtracting any of the
+ * interval fields overflows.
+ */
+Datum
+brin_minmax_multi_distance_interval(PG_FUNCTION_ARGS)
+{
+	float8		delta = 0;
+
+	Interval   *ia = PG_GETARG_INTERVAL_P(0);
+	Interval   *ib = PG_GETARG_INTERVAL_P(1);
+
+	/*
+	 * The field-wise difference is only needed within this function, so keep
+	 * it on the stack instead of allocating (and never freeing) it.
+	 */
+	Interval	diff;
+
+	int64		dayfraction;
+	int64		days;
+
+	diff.month = ib->month - ia->month;
+	/* overflow check copied from int4mi */
+	if (!SAMESIGN(ib->month, ia->month) &&
+		!SAMESIGN(diff.month, ib->month))
+		ereport(ERROR,
+				(errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE),
+				 errmsg("interval out of range")));
+
+	diff.day = ib->day - ia->day;
+	if (!SAMESIGN(ib->day, ia->day) &&
+		!SAMESIGN(diff.day, ib->day))
+		ereport(ERROR,
+				(errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE),
+				 errmsg("interval out of range")));
+
+	diff.time = ib->time - ia->time;
+	if (!SAMESIGN(ib->time, ia->time) &&
+		!SAMESIGN(diff.time, ib->time))
+		ereport(ERROR,
+				(errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE),
+				 errmsg("interval out of range")));
+
+	/*
+	 * Delta is (fractional) number of days between the intervals. Assume
+	 * months have 30 days for consistency with interval_cmp_internal. We
+	 * don't need to be exact, in the worst case we'll build a bit less
+	 * efficient ranges. But we should not contradict interval_cmp.
+	 */
+	dayfraction = diff.time % USECS_PER_DAY;
+	days = diff.time / USECS_PER_DAY;
+	days += diff.month * INT64CONST(30);
+	days += diff.day;
+
+	/* convert to double precision */
+	delta = (double) days + dayfraction / (double) USECS_PER_DAY;
+
+	Assert(delta >= 0);
+
+	PG_RETURN_FLOAT8(delta);
+}
+
+/*
+ * brin_minmax_multi_distance_pg_lsn
+ *		Distance between two pg_lsn values.
+ *
+ * An LSN is a 64-bit integer encoding a position in the stream, so the two
+ * values are subtracted directly and the result is converted to float8.
+ */
+Datum
+brin_minmax_multi_distance_pg_lsn(PG_FUNCTION_ARGS)
+{
+	XLogRecPtr	lower = PG_GETARG_LSN(0);
+	XLogRecPtr	upper = PG_GETARG_LSN(1);
+	float8		distance;
+
+	distance = (upper - lower);
+
+	Assert(distance >= 0);
+
+	PG_RETURN_FLOAT8(distance);
+}
+
+/*
+ * brin_minmax_multi_distance_macaddr
+ *		Approximate distance between two macaddr values.
+ *
+ * mac addresses are treated as 6 unsigned chars, so this does the same
+ * thing as for UUID values: the per-byte differences are folded into a
+ * float8, from field 'f' up to field 'a', dividing by 256 after each byte.
+ */
+Datum
+brin_minmax_multi_distance_macaddr(PG_FUNCTION_ARGS)
+{
+	macaddr    *lower = PG_GETARG_MACADDR_P(0);
+	macaddr    *upper = PG_GETARG_MACADDR_P(1);
+
+	/* the address bytes, in the order they are folded (f first, a last) */
+	const unsigned char lower_bytes[] = {
+		lower->f, lower->e, lower->d, lower->c, lower->b, lower->a
+	};
+	const unsigned char upper_bytes[] = {
+		upper->f, upper->e, upper->d, upper->c, upper->b, upper->a
+	};
+
+	float8		distance = 0;
+	int			pos;
+
+	for (pos = 0; pos < (int) sizeof(lower_bytes); pos++)
+	{
+		distance += ((float8) upper_bytes[pos] - (float8) lower_bytes[pos]);
+		distance /= 256;
+	}
+
+	Assert(distance >= 0);
+
+	PG_RETURN_FLOAT8(distance);
+}
+
+/*
+ * brin_minmax_multi_distance_macaddr8
+ *		Approximate distance between two macaddr8 values.
+ *
+ * macaddr8 addresses are 8 unsigned chars, so this does the same thing as
+ * for UUID values: the per-byte differences are folded into a float8, from
+ * field 'h' up to field 'a', dividing by 256 after each byte.
+ */
+Datum
+brin_minmax_multi_distance_macaddr8(PG_FUNCTION_ARGS)
+{
+	macaddr8   *lower = PG_GETARG_MACADDR8_P(0);
+	macaddr8   *upper = PG_GETARG_MACADDR8_P(1);
+
+	/* the address bytes, in the order they are folded (h first, a last) */
+	const unsigned char lower_bytes[] = {
+		lower->h, lower->g, lower->f, lower->e,
+		lower->d, lower->c, lower->b, lower->a
+	};
+	const unsigned char upper_bytes[] = {
+		upper->h, upper->g, upper->f, upper->e,
+		upper->d, upper->c, upper->b, upper->a
+	};
+
+	float8		distance = 0;
+	int			pos;
+
+	for (pos = 0; pos < (int) sizeof(lower_bytes); pos++)
+	{
+		distance += ((float8) upper_bytes[pos] - (float8) lower_bytes[pos]);
+		distance /= 256;
+	}
+
+	Assert(distance >= 0);
+
+	PG_RETURN_FLOAT8(distance);
+}
+
+/*
+ * Compute the distance between two inet values.
+ *
+ * The distance is defined as the difference between 32-bit/128-bit values,
+ * depending on the IP version. The distance is computed by subtracting
+ * the bytes and normalizing it to [0,1] range for each IP family.
+ * Addresses from different families are considered to be in maximum
+ * distance, which is 1.0.
+ *
+ * Before subtracting, each address is masked with its own prefix length
+ * (ip_bits), so only the network part contributes to the distance.
+ */
+Datum
+brin_minmax_multi_distance_inet(PG_FUNCTION_ARGS)
+{
+    float8      delta;
+    int         i;
+    int         len;
+    unsigned char *addra,
+               *addrb;
+
+    inet       *ipa = PG_GETARG_INET_PP(0);
+    inet       *ipb = PG_GETARG_INET_PP(1);
+
+    int         lena,
+                lenb;
+
+    /*
+     * If the addresses are from different families, consider them to be in
+     * maximal possible distance (which is 1.0).
+     */
+    if (ip_family(ipa) != ip_family(ipb))
+        PG_RETURN_FLOAT8(1.0);
+
+    /* work on private copies, the masking below modifies the bytes */
+    addra = (unsigned char *) palloc(ip_addrsize(ipa));
+    memcpy(addra, ip_addr(ipa), ip_addrsize(ipa));
+
+    addrb = (unsigned char *) palloc(ip_addrsize(ipb));
+    memcpy(addrb, ip_addr(ipb), ip_addrsize(ipb));
+
+    /*
+     * The length is calculated from the mask length, because we sort the
+     * addresses by first address in the range, so A.B.C.D/24 < A.B.C.1 (the
+     * first range starts at A.B.C.0, which is before A.B.C.1). We don't want
+     * to produce a negative delta in this case, so we cut the bits that lie
+     * beyond the prefix.
+     */
+    lena = ip_bits(ipa);
+    lenb = ip_bits(ipb);
+
+    /* same family, so both addresses have the same size */
+    len = ip_addrsize(ipa);
+
+    /* apply the network mask to both addresses */
+    for (i = 0; i < len; i++)
+    {
+        unsigned char mask;
+        int         nbits;
+
+        /*
+         * Number of prefix bits falling into byte i. For bytes entirely past
+         * the prefix the raw difference is negative; clamp it to zero so the
+         * shift count below stays within 0..8. Without the clamp the shift
+         * could reach or exceed the width of int, which is undefined
+         * behavior and may leave host bytes unmasked.
+         */
+        nbits = Max(0, lena - (i * 8));
+        if (nbits < 8)
+        {
+            mask = (0xFF << (8 - nbits));
+            addra[i] = (addra[i] & mask);
+        }
+
+        nbits = Max(0, lenb - (i * 8));
+        if (nbits < 8)
+        {
+            mask = (0xFF << (8 - nbits));
+            addrb[i] = (addrb[i] & mask);
+        }
+    }
+
+    /*
+     * Calculate the difference between the addresses, folding from the last
+     * (least significant) byte towards the first one.
+     */
+    delta = 0;
+    for (i = len - 1; i >= 0; i--)
+    {
+        unsigned char a = addra[i];
+        unsigned char b = addrb[i];
+
+        delta += (float8) b - (float8) a;
+        delta /= 256;
+    }
+
+    Assert((delta >= 0) && (delta <= 1));
+
+    pfree(addra);
+    pfree(addrb);
+
+    PG_RETURN_FLOAT8(delta);
+}
+
+/*
+ * Serialization callback installed in BrinValues.bv_serialize.
+ *
+ * Takes the in-memory Ranges passed in src, compacts it to its
+ * target_maxvalues and stores the serialized form in dst[0].
+ */
+static void
+brin_minmax_multi_serialize(BrinDesc *bdesc, Datum src, Datum *dst)
+{
+    Ranges     *summary = (Ranges *) DatumGetPointer(src);
+
+    /*
+     * The in-memory buffer may hold more values than requested (batch mode),
+     * so squeeze it down to the target size first.
+     */
+    compactify_ranges(bdesc, summary, summary->target_maxvalues);
+
+    /* After compaction no unsorted values may remain. */
+    Assert(summary->nsorted == summary->nvalues);
+
+    dst[0] = PointerGetDatum(range_serialize(summary));
+}
+
+/*
+ * Return the values-per-range setting obtained from the opclass options.
+ * The bdesc argument is not used.
+ */
+static int
+brin_minmax_multi_get_values(BrinDesc *bdesc, MinMaxMultiOptions *opts)
+{
+    int         values_per_range = MinMaxMultiGetValuesPerRange(opts);
+
+    return values_per_range;
+}
+
+/*
+ * Determine the size of the in-memory insert buffer for a summary whose
+ * requested number of values is target_maxvalues.
+ *
+ * We use MINMAX_BUFFER_FACTOR times the target, capped to the maximum number
+ * of values in the heap range (MaxHeapTuplesPerPage * pages per range), but
+ * never less than the target itself. The result is finally clamped to
+ * [MINMAX_BUFFER_MIN, MINMAX_BUFFER_MAX]. A small buffer would be bad for
+ * performance, but a large buffer might require a lot of memory (because of
+ * keeping all the values).
+ */
+static int
+minmax_multi_insert_buffer_size(BrinDesc *bdesc, int target_maxvalues)
+{
+    BlockNumber pagesPerRange = BrinGetPagesPerRange(bdesc->bd_index);
+    int         maxvalues;
+
+    maxvalues = Min(target_maxvalues * MINMAX_BUFFER_FACTOR,
+                    MaxHeapTuplesPerPage * pagesPerRange);
+
+    /* but always at least the original value */
+    maxvalues = Max(maxvalues, target_maxvalues);
+
+    /* always cap by MIN/MAX */
+    maxvalues = Max(maxvalues, MINMAX_BUFFER_MIN);
+    maxvalues = Min(maxvalues, MINMAX_BUFFER_MAX);
+
+    return maxvalues;
+}
+
+/*
+ * Examine the given index tuple (which contains the partial status of a
+ * certain page range) by comparing it to the given value that comes from
+ * another heap tuple. If the new value is outside the min/max range
+ * specified by the existing tuple values, update the index tuple and return
+ * true. Otherwise, return false and do not modify in this case.
+ *
+ * Arguments: BrinDesc, BrinValues for the column, the new value, and its
+ * isnull flag (which must be false).
+ */
+Datum
+brin_minmax_multi_add_value(PG_FUNCTION_ARGS)
+{
+    BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+    BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
+    Datum       newval = PG_GETARG_DATUM(2);
+    bool        isnull PG_USED_FOR_ASSERTS_ONLY = PG_GETARG_BOOL(3);
+    MinMaxMultiOptions *opts = (MinMaxMultiOptions *) PG_GET_OPCLASS_OPTIONS();
+    Oid         colloid = PG_GET_COLLATION();
+    bool        modified = false;
+    Form_pg_attribute attr;
+    AttrNumber  attno;
+    Ranges     *ranges;
+
+    Assert(!isnull);
+
+    attno = column->bv_attno;
+    attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1);
+
+    /* use the already deserialized value, if possible */
+    ranges = (Ranges *) DatumGetPointer(column->bv_mem_value);
+
+    /*
+     * If this is the first non-null value, we need to initialize the range
+     * list. Otherwise, just extract the existing range list from BrinValues.
+     *
+     * When starting with an empty range, we assume this is a batch mode and
+     * we use a larger buffer, see minmax_multi_insert_buffer_size.
+     */
+    if (column->bv_allnulls)
+    {
+        MemoryContext oldctx;
+
+        /* what was specified as a reloption? */
+        int         target_maxvalues = brin_minmax_multi_get_values(bdesc, opts);
+        int         maxvalues = minmax_multi_insert_buffer_size(bdesc,
+                                                                target_maxvalues);
+
+        /* the summary has to live as long as the BrinValues it belongs to */
+        oldctx = MemoryContextSwitchTo(column->bv_context);
+        ranges = minmax_multi_init(maxvalues);
+        ranges->attno = attno;
+        ranges->colloid = colloid;
+        ranges->typid = attr->atttypid;
+        ranges->target_maxvalues = target_maxvalues;
+
+        /* we'll certainly need the comparator, so just look it up now */
+        ranges->cmp = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid,
+                                                         BTLessStrategyNumber);
+
+        MemoryContextSwitchTo(oldctx);
+
+        column->bv_allnulls = false;
+        modified = true;
+
+        column->bv_mem_value = PointerGetDatum(ranges);
+        column->bv_serialize = brin_minmax_multi_serialize;
+    }
+    else if (!ranges)
+    {
+        MemoryContext oldctx;
+        SerializedRanges *serialized;
+        int         maxvalues;
+
+        /* no in-memory summary yet, so expand the stored one */
+        oldctx = MemoryContextSwitchTo(column->bv_context);
+
+        serialized = (SerializedRanges *) PG_DETOAST_DATUM(column->bv_values[0]);
+
+        maxvalues = minmax_multi_insert_buffer_size(bdesc,
+                                                    serialized->maxvalues);
+
+        ranges = range_deserialize(maxvalues, serialized);
+
+        ranges->attno = attno;
+        ranges->colloid = colloid;
+        ranges->typid = attr->atttypid;
+
+        /* we'll certainly need the comparator, so just look it up now */
+        ranges->cmp = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid,
+                                                         BTLessStrategyNumber);
+
+        column->bv_mem_value = PointerGetDatum(ranges);
+        column->bv_serialize = brin_minmax_multi_serialize;
+
+        MemoryContextSwitchTo(oldctx);
+    }
+
+    /*
+     * Try to add the new value to the range. We need to update the modified
+     * flag, so that we serialize the updated summary later.
+     */
+    modified |= range_add_value(bdesc, colloid, attno, attr, ranges, newval);
+
+    PG_RETURN_BOOL(modified);
+}
+
+/*
+ * Given an index tuple corresponding to a certain page range and a scan key,
+ * return whether the scan key is consistent with the index tuple's min/max
+ * values. Return true if so, false otherwise.
+ *
+ * Arguments: BrinDesc, BrinValues for the column, an array of scan keys and
+ * the number of keys in that array.
+ *
+ * The summary is deserialized and inspected in two passes: first the
+ * (min,max) ranges, then the individual values stored after them. The page
+ * range is consistent as soon as any single range or value satisfies all
+ * the scan keys.
+ */
+Datum
+brin_minmax_multi_consistent(PG_FUNCTION_ARGS)
+{
+ BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+ BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
+ ScanKey *keys = (ScanKey *) PG_GETARG_POINTER(2);
+ int nkeys = PG_GETARG_INT32(3);
+
+ Oid colloid = PG_GET_COLLATION(),
+ subtype;
+ AttrNumber attno;
+ Datum value;
+ FmgrInfo *finfo;
+ SerializedRanges *serialized;
+ Ranges *ranges;
+ int keyno;
+ int rangeno;
+ int i;
+
+ /* note: attno is overwritten with each key's sk_attno in the loops below */
+ attno = column->bv_attno;
+
+ /* expand the stored summary, keeping its own maxvalues */
+ serialized = (SerializedRanges *) PG_DETOAST_DATUM(column->bv_values[0]);
+ ranges = range_deserialize(serialized->maxvalues, serialized);
+
+ /* inspect the ranges, and for each one evaluate the scan keys */
+ for (rangeno = 0; rangeno < ranges->nranges; rangeno++)
+ {
+ /* boundaries are stored pairwise at the start of the values array */
+ Datum minval = ranges->values[2 * rangeno];
+ Datum maxval = ranges->values[2 * rangeno + 1];
+
+ /* assume the range is matching, and we'll try to prove otherwise */
+ bool matching = true;
+
+ for (keyno = 0; keyno < nkeys; keyno++)
+ {
+ Datum matches;
+ ScanKey key = keys[keyno];
+
+ /* NULL keys are handled and filtered-out in bringetbitmap */
+ Assert(!(key->sk_flags & SK_ISNULL));
+
+ attno = key->sk_attno;
+ subtype = key->sk_subtype;
+ value = key->sk_argument;
+ switch (key->sk_strategy)
+ {
+ case BTLessStrategyNumber:
+ case BTLessEqualStrategyNumber:
+ finfo = minmax_multi_get_strategy_procinfo(bdesc, attno, subtype,
+ key->sk_strategy);
+ /* first value from the array */
+ matches = FunctionCall2Coll(finfo, colloid, minval, value);
+ break;
+
+ case BTEqualStrategyNumber:
+ {
+ Datum compar;
+ FmgrInfo *cmpFn;
+
+ /* by default this range does not match */
+ matches = false;
+
+ /*
+ * Compare the key value with both boundaries of this
+ * range. First check whether the range minimum is
+ * greater than the key value, i.e. the value lies
+ * below the range.
+ */
+ cmpFn = minmax_multi_get_strategy_procinfo(bdesc, attno, subtype,
+ BTGreaterStrategyNumber);
+ compar = FunctionCall2Coll(cmpFn, colloid, minval, value);
+
+ /* smaller than the smallest value in this range */
+ if (DatumGetBool(compar))
+ break;
+
+ /* then check whether the range maximum is less than the value */
+ cmpFn = minmax_multi_get_strategy_procinfo(bdesc, attno, subtype,
+ BTLessStrategyNumber);
+ compar = FunctionCall2Coll(cmpFn, colloid, maxval, value);
+
+ /* larger than the largest value in this range */
+ if (DatumGetBool(compar))
+ break;
+
+ /*
+ * We haven't managed to eliminate this range, so
+ * consider it matching.
+ */
+ matches = true;
+
+ break;
+ }
+ case BTGreaterEqualStrategyNumber:
+ case BTGreaterStrategyNumber:
+ finfo = minmax_multi_get_strategy_procinfo(bdesc, attno, subtype,
+ key->sk_strategy);
+ /* last value from the array */
+ matches = FunctionCall2Coll(finfo, colloid, maxval, value);
+ break;
+
+ default:
+ /* shouldn't happen */
+ elog(ERROR, "invalid strategy number %d", key->sk_strategy);
+ matches = 0;
+ break;
+ }
+
+ /* the range has to match all the scan keys */
+ matching &= DatumGetBool(matches);
+
+ /* once we find a non-matching key, we're done */
+ if (!matching)
+ break;
+ }
+
+ /*
+ * have we found a range matching all scan keys? if yes, we're done
+ */
+ if (matching)
+ PG_RETURN_DATUM(BoolGetDatum(true));
+ }
+
+ /*
+ * And now inspect the values. We don't bother with doing a binary search
+ * here, because we're dealing with serialized / fully compacted ranges,
+ * so there should be only very few values.
+ */
+ for (i = 0; i < ranges->nvalues; i++)
+ {
+ /* individual values follow the 2 * nranges boundary values */
+ Datum val = ranges->values[2 * ranges->nranges + i];
+
+ /* assume the value is matching, and we'll try to prove otherwise */
+ bool matching = true;
+
+ for (keyno = 0; keyno < nkeys; keyno++)
+ {
+ Datum matches;
+ ScanKey key = keys[keyno];
+
+ /*
+ * Skip NULL keys. (The range loop above asserts that no such keys
+ * are passed in, so this is purely defensive.)
+ */
+ if (key->sk_flags & SK_ISNULL)
+ continue;
+
+ attno = key->sk_attno;
+ subtype = key->sk_subtype;
+ value = key->sk_argument;
+ switch (key->sk_strategy)
+ {
+ case BTLessStrategyNumber:
+ case BTLessEqualStrategyNumber:
+ case BTEqualStrategyNumber:
+ case BTGreaterEqualStrategyNumber:
+ case BTGreaterStrategyNumber:
+
+ /* for a single value, apply the key's operator directly */
+ finfo = minmax_multi_get_strategy_procinfo(bdesc, attno, subtype,
+ key->sk_strategy);
+ matches = FunctionCall2Coll(finfo, colloid, val, value);
+ break;
+
+ default:
+ /* shouldn't happen */
+ elog(ERROR, "invalid strategy number %d", key->sk_strategy);
+ matches = 0;
+ break;
+ }
+
+ /* the value has to match all the scan keys */
+ matching &= DatumGetBool(matches);
+
+ /* once we find a non-matching key, we're done */
+ if (!matching)
+ break;
+ }
+
+ /* have we found a value matching all scan keys? if yes, we're done */
+ if (matching)
+ PG_RETURN_DATUM(BoolGetDatum(true));
+ }
+
+ /* neither a range nor a value satisfied all the keys */
+ PG_RETURN_DATUM(BoolGetDatum(false));
+}
+
+/*
+ * Given two BrinValues, update the first of them as a union of the summary
+ * values contained in both. The second one is untouched.
+ *
+ * Arguments: BrinDesc and the two BrinValues (col_a, col_b), which must
+ * belong to the same attribute and must not be all-nulls.
+ *
+ * Both summaries are deserialized, combined into one array of expanded
+ * ranges, sorted, merged where overlapping and reduced to the maxvalues of
+ * the first summary. The result is serialized and replaces
+ * col_a->bv_values[0].
+ */
+Datum
+brin_minmax_multi_union(PG_FUNCTION_ARGS)
+{
+ BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
+ BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1);
+ BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2);
+
+ Oid colloid = PG_GET_COLLATION();
+ SerializedRanges *serialized_a;
+ SerializedRanges *serialized_b;
+ Ranges *ranges_a;
+ Ranges *ranges_b;
+ AttrNumber attno;
+ Form_pg_attribute attr;
+ ExpandedRange *eranges;
+ int neranges;
+ FmgrInfo *cmpFn,
+ *distanceFn;
+ DistanceValue *distances;
+ MemoryContext ctx;
+ MemoryContext oldctx;
+
+ Assert(col_a->bv_attno == col_b->bv_attno);
+ Assert(!col_a->bv_allnulls && !col_b->bv_allnulls);
+
+ attno = col_a->bv_attno;
+ attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1);
+
+ serialized_a = (SerializedRanges *) PG_DETOAST_DATUM(col_a->bv_values[0]);
+ serialized_b = (SerializedRanges *) PG_DETOAST_DATUM(col_b->bv_values[0]);
+
+ /*
+ * Deserialize before switching to the temporary context below, because
+ * ranges_a is still needed after that context has been deleted.
+ */
+ ranges_a = range_deserialize(serialized_a->maxvalues, serialized_a);
+ ranges_b = range_deserialize(serialized_b->maxvalues, serialized_b);
+
+ /* make sure neither of the ranges is NULL */
+ Assert(ranges_a && ranges_b);
+
+ /* every range and every individual value gets one expanded entry */
+ neranges = (ranges_a->nranges + ranges_a->nvalues) +
+ (ranges_b->nranges + ranges_b->nvalues);
+
+ /*
+ * The distanceFn calls (which may internally call e.g. numeric_le) may
+ * allocate quite a bit of memory, and we must not leak it. Otherwise,
+ * we'd have problems e.g. when building indexes. So we create a local
+ * memory context and make sure we free the memory before leaving this
+ * function (not after every call).
+ */
+ ctx = AllocSetContextCreate(CurrentMemoryContext,
+ "minmax-multi context",
+ ALLOCSET_DEFAULT_SIZES);
+
+ oldctx = MemoryContextSwitchTo(ctx);
+
+ /* allocate and fill */
+ eranges = (ExpandedRange *) palloc0(neranges * sizeof(ExpandedRange));
+
+ /* fill the expanded ranges with entries for the first range */
+ fill_expanded_ranges(eranges, ranges_a->nranges + ranges_a->nvalues,
+ ranges_a);
+
+ /* and now append the entries for the second range */
+ fill_expanded_ranges(&eranges[ranges_a->nranges + ranges_a->nvalues],
+ ranges_b->nranges + ranges_b->nvalues,
+ ranges_b);
+
+ cmpFn = minmax_multi_get_strategy_procinfo(bdesc, attno, attr->atttypid,
+ BTLessStrategyNumber);
+
+ /* sort the expanded ranges (the returned count replaces neranges) */
+ neranges = sort_expanded_ranges(cmpFn, colloid, eranges, neranges);
+
+ /*
+ * We've loaded two different lists of expanded ranges, so some of them
+ * may be overlapping. So walk through them and merge them.
+ */
+ neranges = merge_overlapping_ranges(cmpFn, colloid, eranges, neranges);
+
+ /* check that the combined ranges are correct (no overlaps, ordering) */
+ AssertCheckExpandedRanges(bdesc, colloid, attno, attr, eranges, neranges);
+
+ /*
+ * If needed, reduce some of the ranges.
+ *
+ * XXX This may be fairly expensive, so maybe we should do it only when
+ * it's actually needed (when we have too many ranges).
+ */
+
+ /* build array of gap distances and sort them in ascending order */
+ distanceFn = minmax_multi_get_procinfo(bdesc, attno, PROCNUM_DISTANCE);
+ distances = build_distances(distanceFn, colloid, eranges, neranges);
+
+ /*
+ * See how many values would be needed to store the current ranges, and if
+ * needed combine as many of them to get below the threshold. The
+ * collapsed ranges will be stored as a single value.
+ *
+ * XXX This does not apply the load factor, as we don't expect to add more
+ * values to the range, so we prefer to keep as many ranges as possible.
+ *
+ * XXX Can the maxvalues be different in the two ranges? Perhaps we should
+ * use maximum of those?
+ */
+ neranges = reduce_expanded_ranges(eranges, neranges, distances,
+ ranges_a->maxvalues,
+ cmpFn, colloid);
+
+ /* update the first range summary */
+ store_expanded_ranges(ranges_a, eranges, neranges);
+
+ /* drop everything allocated while merging (eranges, distances, ...) */
+ MemoryContextSwitchTo(oldctx);
+ MemoryContextDelete(ctx);
+
+ /*
+ * cleanup and update the serialized value; the serialization happens
+ * after switching back, i.e. in the context that was current on entry
+ */
+ pfree(serialized_a);
+ col_a->bv_values[0] = PointerGetDatum(range_serialize(ranges_a));
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * Look up (and cache) an optional minmax-multi support procedure.
+ *
+ * Returns the FmgrInfo for support function number procnum of index column
+ * attno, or NULL if the opclass does not provide it. Both positive and
+ * negative results are remembered in the per-column opaque struct, to avoid
+ * repetitive syscache lookups.
+ */
+static FmgrInfo *
+minmax_multi_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum)
+{
+    uint16      slot = procnum - PROCNUM_BASE;
+    MinmaxMultiOpaque *cache;
+    FmgrInfo   *entry;
+
+    cache = (MinmaxMultiOpaque *) bdesc->bd_info[attno - 1]->oi_opaque;
+
+    /* An earlier search came up empty, no point in repeating it. */
+    if (cache->extra_proc_missing[slot])
+        return NULL;
+
+    entry = &cache->extra_procinfos[slot];
+
+    /* Already looked up? */
+    if (entry->fn_oid != InvalidOid)
+        return entry;
+
+    /* Remember a missing procedure, so that we don't search for it again. */
+    if (!RegProcedureIsValid(index_getprocid(bdesc->bd_index, attno,
+                                             procnum)))
+    {
+        cache->extra_proc_missing[slot] = true;
+        return NULL;
+    }
+
+    fmgr_info_copy(entry,
+                   index_getprocinfo(bdesc->bd_index, attno, procnum),
+                   bdesc->bd_context);
+
+    return entry;
+}
+
+/*
+ * Look up (and cache) the procedure implementing the given btree strategy
+ * for index column attno, with the given right-hand subtype.
+ *
+ * Only the procedures for the most recently used subtype are cached; a call
+ * with a different subtype invalidates all the cached entries. Raises an
+ * error if the opfamily has no operator for the strategy.
+ *
+ * Note: the other BRIN opclasses have analogous strategy lookup helpers
+ * (presumably e.g. in brin_inclusion.c); if changes are made here, check
+ * whether those need the same change.
+ */
+static FmgrInfo *
+minmax_multi_get_strategy_procinfo(BrinDesc *bdesc, uint16 attno, Oid subtype,
+                                   uint16 strategynum)
+{
+    MinmaxMultiOpaque *cache;
+    FmgrInfo   *entry;
+    Form_pg_attribute attr;
+    HeapTuple   amoptup;
+    Oid         opfamily;
+    Oid         oprid;
+    bool        isNull;
+
+    Assert(strategynum >= 1 &&
+           strategynum <= BTMaxStrategyNumber);
+
+    cache = (MinmaxMultiOpaque *) bdesc->bd_info[attno - 1]->oi_opaque;
+
+    /* A different subtype makes everything cached so far useless. */
+    if (cache->cached_subtype != subtype)
+    {
+        int         k;
+
+        for (k = 0; k < BTMaxStrategyNumber; k++)
+            cache->strategy_procinfos[k].fn_oid = InvalidOid;
+        cache->cached_subtype = subtype;
+    }
+
+    entry = &cache->strategy_procinfos[strategynum - 1];
+
+    /* Cache hit? */
+    if (entry->fn_oid != InvalidOid)
+        return entry;
+
+    /* Find the operator for (column type, subtype, strategy) in pg_amop. */
+    opfamily = bdesc->bd_index->rd_opfamily[attno - 1];
+    attr = TupleDescAttr(bdesc->bd_tupdesc, attno - 1);
+    amoptup = SearchSysCache4(AMOPSTRATEGY, ObjectIdGetDatum(opfamily),
+                              ObjectIdGetDatum(attr->atttypid),
+                              ObjectIdGetDatum(subtype),
+                              Int16GetDatum(strategynum));
+    if (!HeapTupleIsValid(amoptup))
+        elog(ERROR, "missing operator %d(%u,%u) in opfamily %u",
+             strategynum, attr->atttypid, subtype, opfamily);
+
+    oprid = DatumGetObjectId(SysCacheGetAttr(AMOPSTRATEGY, amoptup,
+                                             Anum_pg_amop_amopopr, &isNull));
+    ReleaseSysCache(amoptup);
+    Assert(!isNull && RegProcedureIsValid(oprid));
+
+    /* Set up the call info in the context of the BrinDesc. */
+    fmgr_info_cxt(get_opcode(oprid), entry, bdesc->bd_context);
+
+    return entry;
+}
+
+/*
+ * Define the opclass options of the minmax-multi opclass.
+ *
+ * There is a single integer option, "values_per_range", accepting values
+ * between 8 and 256 and stored in MinMaxMultiOptions.valuesPerRange.
+ */
+Datum
+brin_minmax_multi_options(PG_FUNCTION_ARGS)
+{
+    local_relopts *opts_def = (local_relopts *) PG_GETARG_POINTER(0);
+
+    init_local_reloptions(opts_def, sizeof(MinMaxMultiOptions));
+
+    add_local_int_reloption(opts_def,
+                            "values_per_range",
+                            "desc",
+                            MINMAX_MULTI_DEFAULT_VALUES_PER_PAGE,
+                            8,
+                            256,
+                            offsetof(MinMaxMultiOptions, valuesPerRange));
+
+    PG_RETURN_VOID();
+}
+
+/*
+ * brin_minmax_multi_summary_in
+ * - input routine for type brin_minmax_multi_summary.
+ *
+ * The type exists only to represent summaries inside BRIN minmax-multi
+ * indexes. It has no operations of its own and its values are kept in
+ * binary form, so there is nothing to parse from text: always raise an
+ * error.
+ */
+Datum
+brin_minmax_multi_summary_in(PG_FUNCTION_ARGS)
+{
+    ereport(ERROR,
+            (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+             errmsg("cannot accept a value of type %s",
+                    "brin_minmax_multi_summary")));
+
+    /* not reached, but keeps the compiler quiet */
+    PG_RETURN_VOID();
+}
+
+
+/*
+ * brin_minmax_multi_summary_out
+ * - output routine for type brin_minmax_multi_summary.
+ *
+ * BRIN minmax-multi summaries are serialized into a bytea value, but we
+ * want to output something nicer humans can understand.
+ *
+ * The result has the form
+ *
+ *   {nranges: N nvalues: N maxvalues: N ranges: {...} values: {...}}
+ *
+ * where the "ranges" and "values" parts are text arrays built with the
+ * output function of the summarized type, and each part is present only
+ * when the summary contains at least one range / value.
+ */
+Datum
+brin_minmax_multi_summary_out(PG_FUNCTION_ARGS)
+{
+    int         i;
+    int         idx;
+    SerializedRanges *ranges;
+    Ranges     *ranges_deserialized;
+    StringInfoData str;
+    bool        isvarlena;
+    Oid         outfunc;
+    FmgrInfo    fmgrinfo;
+    ArrayBuildState *astate_values = NULL;
+
+    initStringInfo(&str);
+    appendStringInfoChar(&str, '{');
+
+    /*
+     * Detoast to get value with full 4B header (can't be stored in a toast
+     * table, but can use 1B header). PG_DETOAST_DATUM expects a Datum, so
+     * fetch the raw argument rather than an already converted pointer.
+     */
+    ranges = (SerializedRanges *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
+
+    /* lookup output func for the type */
+    getTypeOutputInfo(ranges->typid, &outfunc, &isvarlena);
+    fmgr_info(outfunc, &fmgrinfo);
+
+    /* deserialize the range info easy-to-process pieces */
+    ranges_deserialized = range_deserialize(ranges->maxvalues, ranges);
+
+    appendStringInfo(&str, "nranges: %u nvalues: %u maxvalues: %u",
+                     ranges_deserialized->nranges,
+                     ranges_deserialized->nvalues,
+                     ranges_deserialized->maxvalues);
+
+    /* serialize ranges; idx walks the values array across both loops */
+    idx = 0;
+    for (i = 0; i < ranges_deserialized->nranges; i++)
+    {
+        char       *a,
+                   *b;
+        text       *c;
+        StringInfoData buf;     /* distinct name, must not shadow "str" */
+
+        initStringInfo(&buf);
+
+        a = OutputFunctionCall(&fmgrinfo, ranges_deserialized->values[idx++]);
+        b = OutputFunctionCall(&fmgrinfo, ranges_deserialized->values[idx++]);
+
+        appendStringInfo(&buf, "%s ... %s", a, b);
+
+        c = cstring_to_text(buf.data);
+
+        astate_values = accumArrayResult(astate_values,
+                                         PointerGetDatum(c),
+                                         false,
+                                         TEXTOID,
+                                         CurrentMemoryContext);
+    }
+
+    if (ranges_deserialized->nranges > 0)
+    {
+        Oid         typoutput;
+        bool        typIsVarlena;
+        Datum       val;
+        char       *extval;
+
+        getTypeOutputInfo(ANYARRAYOID, &typoutput, &typIsVarlena);
+
+        val = PointerGetDatum(makeArrayResult(astate_values, CurrentMemoryContext));
+
+        extval = OidOutputFunctionCall(typoutput, val);
+
+        appendStringInfo(&str, " ranges: %s", extval);
+    }
+
+    /* serialize individual values, starting with a fresh array state */
+    astate_values = NULL;
+
+    for (i = 0; i < ranges_deserialized->nvalues; i++)
+    {
+        char       *a;
+        text       *b;
+
+        /* same call style as for the range boundaries above */
+        a = OutputFunctionCall(&fmgrinfo, ranges_deserialized->values[idx++]);
+
+        b = cstring_to_text(a);
+
+        astate_values = accumArrayResult(astate_values,
+                                         PointerGetDatum(b),
+                                         false,
+                                         TEXTOID,
+                                         CurrentMemoryContext);
+    }
+
+    if (ranges_deserialized->nvalues > 0)
+    {
+        Oid         typoutput;
+        bool        typIsVarlena;
+        Datum       val;
+        char       *extval;
+
+        getTypeOutputInfo(ANYARRAYOID, &typoutput, &typIsVarlena);
+
+        val = PointerGetDatum(makeArrayResult(astate_values, CurrentMemoryContext));
+
+        extval = OidOutputFunctionCall(typoutput, val);
+
+        appendStringInfo(&str, " values: %s", extval);
+    }
+
+    appendStringInfoChar(&str, '}');
+
+    PG_RETURN_CSTRING(str.data);
+}
+
+/*
+ * brin_minmax_multi_summary_recv
+ * - binary input routine for type brin_minmax_multi_summary.
+ *
+ * Binary input is rejected with an error.
+ */
+Datum
+brin_minmax_multi_summary_recv(PG_FUNCTION_ARGS)
+{
+    ereport(ERROR,
+            (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+             errmsg("cannot accept a value of type %s",
+                    "brin_minmax_multi_summary")));
+
+    /* not reached, but keeps the compiler quiet */
+    PG_RETURN_VOID();
+}
+
+/*
+ * brin_minmax_multi_summary_send
+ * - binary output routine for type brin_minmax_multi_summary.
+ *
+ * A summary is serialized in a bytea value (only the type name differs), so
+ * the work is delegated to byteasend, passing our call info through
+ * unchanged.
+ */
+Datum
+brin_minmax_multi_summary_send(PG_FUNCTION_ARGS)
+{
+    Datum       sent = byteasend(fcinfo);
+
+    return sent;
+}
diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c
new file mode 100644
index 0000000..992b33a
--- /dev/null
+++ b/src/backend/access/brin/brin_pageops.c
@@ -0,0 +1,920 @@
+/*
+ * brin_pageops.c
+ * Page-handling routines for BRIN indexes
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/brin/brin_pageops.c
+ */
+#include "postgres.h"
+
+#include "access/brin_page.h"
+#include "access/brin_pageops.h"
+#include "access/brin_revmap.h"
+#include "access/brin_xlog.h"
+#include "access/xloginsert.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/freespace.h"
+#include "storage/lmgr.h"
+#include "storage/smgr.h"
+#include "utils/rel.h"
+
+/*
+ * Maximum size of an entry in a BRIN_PAGETYPE_REGULAR page: the block size
+ * minus the (aligned) page header with one line pointer and the (aligned)
+ * BRIN special space, rounded down to an alignment boundary. We can
+ * tolerate a single item per page, unlike other index AMs.
+ */
+#define BrinMaxItemSize \
+    MAXALIGN_DOWN(BLCKSZ - \
+                  MAXALIGN(SizeOfPageHeaderData + sizeof(ItemIdData)) - \
+                  MAXALIGN(sizeof(BrinSpecialSpace)))
+
+static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
+ bool *extended);
+static Size br_page_get_freespace(Page page);
+static void brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer);
+
+
+/*
+ * Update tuple origtup (size origsz), located in offset oldoff of buffer
+ * oldbuf, to newtup (size newsz) as summary tuple for the page range starting
+ * at heapBlk. oldbuf must not be locked on entry, and is not locked at exit.
+ *
+ * If samepage is true, attempt to put the new tuple in the same page, but if
+ * there's no room, use some other one.
+ *
+ * If the update is successful, return true; the revmap is updated to point to
+ * the new tuple. If the update is not done for whatever reason, return false.
+ * Caller may retry the update if this happens.
+ */
+bool
+brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
+ BrinRevmap *revmap, BlockNumber heapBlk,
+ Buffer oldbuf, OffsetNumber oldoff,
+ const BrinTuple *origtup, Size origsz,
+ const BrinTuple *newtup, Size newsz,
+ bool samepage)
+{
+ Page oldpage;
+ ItemId oldlp;
+ BrinTuple *oldtup;
+ Size oldsz;
+ Buffer newbuf;
+ BlockNumber newblk = InvalidBlockNumber;
+ bool extended;
+
+ Assert(newsz == MAXALIGN(newsz));
+
+ /* If the item is oversized, don't bother. */
+ if (newsz > BrinMaxItemSize)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+ newsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
+ return false; /* keep compiler quiet */
+ }
+
+ /* make sure the revmap is long enough to contain the entry we need */
+ brinRevmapExtend(revmap, heapBlk);
+
+ if (!samepage)
+ {
+ /* need a page on which to put the item */
+ newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
+ if (!BufferIsValid(newbuf))
+ {
+ Assert(!extended);
+ return false;
+ }
+
+ /*
+ * Note: it's possible (though unlikely) that the returned newbuf is
+ * the same as oldbuf, if brin_getinsertbuffer determined that the old
+ * buffer does in fact have enough space.
+ */
+ if (newbuf == oldbuf)
+ {
+ Assert(!extended);
+ newbuf = InvalidBuffer;
+ }
+ else
+ newblk = BufferGetBlockNumber(newbuf);
+ }
+ else
+ {
+ LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+ newbuf = InvalidBuffer;
+ extended = false;
+ }
+ oldpage = BufferGetPage(oldbuf);
+ oldlp = PageGetItemId(oldpage, oldoff);
+
+ /*
+ * Check that the old tuple wasn't updated concurrently: it might have
+ * moved someplace else entirely, and for that matter the whole page
+ * might've become a revmap page. Note that in the first two cases
+ * checked here, the "oldlp" we just calculated is garbage; but
+ * PageGetItemId() is simple enough that it was safe to do that
+ * calculation anyway.
+ */
+ if (!BRIN_IS_REGULAR_PAGE(oldpage) ||
+ oldoff > PageGetMaxOffsetNumber(oldpage) ||
+ !ItemIdIsNormal(oldlp))
+ {
+ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+
+ /*
+ * If this happens, and the new buffer was obtained by extending the
+ * relation, then we need to ensure we don't leave it uninitialized or
+ * forget about it.
+ */
+ if (BufferIsValid(newbuf))
+ {
+ if (extended)
+ brin_initialize_empty_new_buffer(idxrel, newbuf);
+ UnlockReleaseBuffer(newbuf);
+ if (extended)
+ FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
+ }
+ return false;
+ }
+
+ oldsz = ItemIdGetLength(oldlp);
+ oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp);
+
+ /*
+ * ... or it might have been updated in place to different contents.
+ */
+ if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
+ {
+ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+ if (BufferIsValid(newbuf))
+ {
+ /* As above, initialize and record new page if we got one */
+ if (extended)
+ brin_initialize_empty_new_buffer(idxrel, newbuf);
+ UnlockReleaseBuffer(newbuf);
+ if (extended)
+ FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
+ }
+ return false;
+ }
+
+ /*
+ * Great, the old tuple is intact. We can proceed with the update.
+ *
+ * If there's enough room in the old page for the new tuple, replace it.
+ *
+ * Note that there might now be enough space on the page even though the
+ * caller told us there isn't, if a concurrent update moved another tuple
+ * elsewhere or replaced a tuple with a smaller one.
+ */
+ if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) &&
+ brin_can_do_samepage_update(oldbuf, origsz, newsz))
+ {
+ START_CRIT_SECTION();
+ if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) unconstify(BrinTuple *, newtup), newsz))
+ elog(ERROR, "failed to replace BRIN tuple");
+ MarkBufferDirty(oldbuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(idxrel))
+ {
+ xl_brin_samepage_update xlrec;
+ XLogRecPtr recptr;
+ uint8 info = XLOG_BRIN_SAMEPAGE_UPDATE;
+
+ xlrec.offnum = oldoff;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate);
+
+ XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD);
+ XLogRegisterBufData(0, (char *) unconstify(BrinTuple *, newtup), newsz);
+
+ recptr = XLogInsert(RM_BRIN_ID, info);
+
+ PageSetLSN(oldpage, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+
+ if (BufferIsValid(newbuf))
+ {
+ /* As above, initialize and record new page if we got one */
+ if (extended)
+ brin_initialize_empty_new_buffer(idxrel, newbuf);
+ UnlockReleaseBuffer(newbuf);
+ if (extended)
+ FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
+ }
+
+ return true;
+ }
+ else if (newbuf == InvalidBuffer)
+ {
+ /*
+ * Not enough space, but caller said that there was. Tell them to
+ * start over.
+ */
+ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+ return false;
+ }
+ else
+ {
+ /*
+ * Not enough free space on the oldpage. Put the new tuple on the new
+ * page, and update the revmap.
+ */
+ Page newpage = BufferGetPage(newbuf);
+ Buffer revmapbuf;
+ ItemPointerData newtid;
+ OffsetNumber newoff;
+ Size freespace = 0;
+
+ revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
+
+ START_CRIT_SECTION();
+
+ /*
+ * We need to initialize the page if it's newly obtained. Note we
+ * will WAL-log the initialization as part of the update, so we don't
+ * need to do that here.
+ */
+ if (extended)
+ brin_page_init(newpage, BRIN_PAGETYPE_REGULAR);
+
+ PageIndexTupleDeleteNoCompact(oldpage, oldoff);
+ newoff = PageAddItem(newpage, (Item) unconstify(BrinTuple *, newtup), newsz,
+ InvalidOffsetNumber, false, false);
+ if (newoff == InvalidOffsetNumber)
+ elog(ERROR, "failed to add BRIN tuple to new page");
+ MarkBufferDirty(oldbuf);
+ MarkBufferDirty(newbuf);
+
+ /* needed to update FSM below */
+ if (extended)
+ freespace = br_page_get_freespace(newpage);
+
+ ItemPointerSet(&newtid, newblk, newoff);
+ brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
+ MarkBufferDirty(revmapbuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(idxrel))
+ {
+ xl_brin_update xlrec;
+ XLogRecPtr recptr;
+ uint8 info;
+
+ info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0);
+
+ xlrec.insert.offnum = newoff;
+ xlrec.insert.heapBlk = heapBlk;
+ xlrec.insert.pagesPerRange = pagesPerRange;
+ xlrec.oldOffnum = oldoff;
+
+ XLogBeginInsert();
+
+ /* new page */
+ XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate);
+
+ XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
+ XLogRegisterBufData(0, (char *) unconstify(BrinTuple *, newtup), newsz);
+
+ /* revmap page */
+ XLogRegisterBuffer(1, revmapbuf, 0);
+
+ /* old page */
+ XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_BRIN_ID, info);
+
+ PageSetLSN(oldpage, recptr);
+ PageSetLSN(newpage, recptr);
+ PageSetLSN(BufferGetPage(revmapbuf), recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
+ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+ UnlockReleaseBuffer(newbuf);
+
+ if (extended)
+ {
+ RecordPageWithFreeSpace(idxrel, newblk, freespace);
+ FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
+ }
+
+ return true;
+ }
+}
+
+/*
+ * Report whether brin_doupdate can replace a tuple in place, that is,
+ * without moving it to a different page.
+ *
+ * A tuple of size origsz stored on the page held in 'buffer' can be
+ * overwritten by one of size newsz when the new tuple is no larger than the
+ * old one, or when the page's exact free space covers the growth.
+ */
+bool
+brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
+{
+	/* A tuple that does not grow always fits where the old one was. */
+	if (newsz <= origsz)
+		return true;
+
+	/* Otherwise the page must be able to absorb the extra bytes. */
+	return PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz);
+}
+
+/*
+ * Insert an index tuple into the index relation. The revmap is updated to
+ * mark the range containing the given page as pointing to the inserted entry.
+ * A WAL record is written if the index relation needs WAL.
+ *
+ * The buffer, if valid, is first checked for free space to insert the new
+ * entry; if there isn't enough, a new buffer is obtained and pinned. No
+ * buffer lock must be held on entry, no buffer lock is held on exit.
+ *
+ * idxrel is the BRIN index; pagesPerRange and heapBlk identify the page
+ * range whose revmap entry is set. *buffer is an in/out parameter: on exit
+ * it holds the buffer the tuple was placed in, unlocked but still pinned.
+ * tup is the tuple to store and itemsz its MAXALIGN'ed size; an itemsz
+ * larger than BrinMaxItemSize raises an error.
+ *
+ * Return value is the offset number where the tuple was inserted.
+ */
+OffsetNumber
+brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
+ BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
+ BrinTuple *tup, Size itemsz)
+{
+ Page page;
+ BlockNumber blk;
+ OffsetNumber off;
+ Size freespace = 0;
+ Buffer revmapbuf;
+ ItemPointerData tid;
+ bool extended;
+
+ Assert(itemsz == MAXALIGN(itemsz));
+
+ /* If the item is oversized, don't even bother. */
+ if (itemsz > BrinMaxItemSize)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+ itemsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
+ return InvalidOffsetNumber; /* keep compiler quiet */
+ }
+
+ /* Make sure the revmap is long enough to contain the entry we need */
+ brinRevmapExtend(revmap, heapBlk);
+
+ /*
+ * Acquire lock on buffer supplied by caller, if any. If it doesn't have
+ * enough space, unpin it to obtain a new one below.
+ */
+ if (BufferIsValid(*buffer))
+ {
+ /*
+ * It's possible that another backend (or ourselves!) extended the
+ * revmap over the page we held a pin on, so we cannot assume that
+ * it's still a regular page. br_page_get_freespace reports zero
+ * free space for a page that is no longer regular, so such a page is
+ * rejected by the check below.
+ */
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+ if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
+ {
+ UnlockReleaseBuffer(*buffer);
+ *buffer = InvalidBuffer;
+ }
+ }
+
+ /*
+ * If we still don't have a usable buffer, have brin_getinsertbuffer
+ * obtain one for us. It is called without an old buffer, and we simply
+ * retry until it hands back a valid buffer; 'extended' tells us whether
+ * that buffer is a freshly added page that still needs initialization.
+ */
+ if (!BufferIsValid(*buffer))
+ {
+ do
+ *buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);
+ while (!BufferIsValid(*buffer));
+ }
+ else
+ extended = false; /* caller's page already exists */
+
+ /* Now obtain lock on revmap buffer */
+ revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
+
+ page = BufferGetPage(*buffer);
+ blk = BufferGetBlockNumber(*buffer);
+
+ /* Execute the actual insertion */
+ START_CRIT_SECTION();
+ if (extended)
+ brin_page_init(page, BRIN_PAGETYPE_REGULAR);
+ off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
+ false, false);
+ if (off == InvalidOffsetNumber)
+ elog(ERROR, "failed to add BRIN tuple to new page"); /* NOTE(review): raised inside the critical section */
+ MarkBufferDirty(*buffer);
+
+ /* needed to update FSM below */
+ if (extended)
+ freespace = br_page_get_freespace(page);
+
+ ItemPointerSet(&tid, blk, off);
+ brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
+ MarkBufferDirty(revmapbuf);
+
+ /* XLOG stuff: one record covers the tuple page (0) and revmap page (1) */
+ if (RelationNeedsWAL(idxrel))
+ {
+ xl_brin_insert xlrec;
+ XLogRecPtr recptr;
+ uint8 info;
+
+ info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0);
+ xlrec.heapBlk = heapBlk;
+ xlrec.pagesPerRange = pagesPerRange;
+ xlrec.offnum = off;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBrinInsert);
+
+ XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
+ XLogRegisterBufData(0, (char *) tup, itemsz);
+
+ XLogRegisterBuffer(1, revmapbuf, 0);
+
+ recptr = XLogInsert(RM_BRIN_ID, info);
+
+ PageSetLSN(page, recptr);
+ PageSetLSN(BufferGetPage(revmapbuf), recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* Tuple is firmly on buffer; we can release our locks (pins are kept) */
+ LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
+ LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
+
+ BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
+ blk, off, heapBlk));
+
+ /* Only a newly added page is reported to the FSM here */
+ if (extended)
+ {
+ RecordPageWithFreeSpace(idxrel, blk, freespace);
+ FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
+ }
+
+ return off;
+}
+
+/*
+ * Set up 'page' as an empty BRIN page of the requested type.
+ *
+ * The page is formatted as a BLCKSZ-sized page whose special area is sized
+ * for a BrinSpecialSpace, and is then stamped with 'type'.  Marking the
+ * buffer dirty, where that is needed, is left to the caller.
+ */
+void
+brin_page_init(Page page, uint16 type)
+{
+	Size		specialSize = sizeof(BrinSpecialSpace);
+
+	PageInit(page, BLCKSZ, specialSize);
+
+	/* record which kind of BRIN page this is */
+	BrinPageType(page) = type;
+}
+
+/*
+ * Format 'page' as the metapage of a brand-new BRIN index.
+ *
+ * The magic number, the given version and the pages-per-range setting are
+ * stored in the page contents, and the revmap is recorded as not having any
+ * page yet.
+ */
+void
+brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
+{
+	BrinMetaPageData *meta;
+	char	   *metaEnd;
+
+	brin_page_init(page, BRIN_PAGETYPE_META);
+
+	meta = (BrinMetaPageData *) PageGetContents(page);
+	metaEnd = (char *) meta + sizeof(BrinMetaPageData);
+
+	meta->brinMagic = BRIN_META_MAGIC;
+	meta->brinVersion = version;
+	meta->pagesPerRange = pagesPerRange;
+
+	/*
+	 * This is a small cheat: block 0 is the metapage itself and so can never
+	 * be a real revmap block, but starting from 0 is what allows the first
+	 * revmap page to be created along with the index.
+	 */
+	meta->lastRevmapPage = 0;
+
+	/*
+	 * pd_lower must point just past the metadata.  Without that, the
+	 * metadata would be lost if xlog.c compresses the page.
+	 */
+	((PageHeader) page)->pd_lower = metaEnd - (char *) page;
+}
+
+/*
+ * Begin the page evacuation protocol for the page held in 'buf'.
+ *
+ * The caller must hold an exclusive lock on the page.
+ *
+ * Returns false, leaving the page untouched, when the page is still
+ * uninitialized or holds no used item; such a page can be taken over for the
+ * revmap as it is.  When at least one used item is found, the page is
+ * flagged for evacuation and true is returned.
+ */
+bool
+brin_start_evacuating_page(Relation idxRel, Buffer buf)
+{
+	Page		pg = BufferGetPage(buf);
+	OffsetNumber lastoff;
+	OffsetNumber cur;
+
+	/* An uninitialized page has nothing to move. */
+	if (PageIsNew(pg))
+		return false;
+
+	lastoff = PageGetMaxOffsetNumber(pg);
+	for (cur = FirstOffsetNumber; cur <= lastoff; cur++)
+	{
+		if (!ItemIdIsUsed(PageGetItemId(pg, cur)))
+			continue;
+
+		/*
+		 * Found a live item.  Setting BRIN_EVACUATE_PAGE makes
+		 * br_page_get_freespace report the page as full, which keeps other
+		 * backends from adding tuples to it.  The flag is not WAL-logged,
+		 * except accidentally.
+		 */
+		BrinPageFlags(pg) |= BRIN_EVACUATE_PAGE;
+		MarkBufferDirtyHint(buf, true);
+
+		return true;
+	}
+
+	/* No used item anywhere on the page. */
+	return false;
+}
+
+/*
+ * Move all tuples out of a page.
+ *
+ * The caller must hold lock on the page. The lock and pin are released.
+ *
+ * The page must already carry the BRIN_EVACUATE_PAGE flag (asserted below).
+ * Each used item is copied to local memory, the page lock is dropped, and
+ * brin_doupdate is asked to "update" the tuple with an identical copy of
+ * itself while passing false for its last argument, which forces the tuple
+ * onto some other page. If brin_doupdate reports failure, the same offset
+ * is tried again. The loop ends early if the page stops being a regular
+ * page while we were not holding the lock.
+ *
+ * NOTE(review): btup is never assigned in this function, so whether
+ * brin_copy_tuple reuses a buffer across iterations depends on how it treats
+ * a zero *destsz -- confirm against brin_copy_tuple.
+ */
+void
+brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
+ BrinRevmap *revmap, Buffer buf)
+{
+ OffsetNumber off;
+ OffsetNumber maxoff;
+ Page page;
+ BrinTuple *btup = NULL;
+ Size btupsz = 0;
+
+ page = BufferGetPage(buf);
+
+ Assert(BrinPageFlags(page) & BRIN_EVACUATE_PAGE);
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (off = FirstOffsetNumber; off <= maxoff; off++)
+ {
+ BrinTuple *tup;
+ Size sz;
+ ItemId lp;
+
+ CHECK_FOR_INTERRUPTS();
+
+ lp = PageGetItemId(page, off);
+ if (ItemIdIsUsed(lp))
+ {
+ sz = ItemIdGetLength(lp);
+ tup = (BrinTuple *) PageGetItem(page, lp);
+ tup = brin_copy_tuple(tup, sz, btup, &btupsz); /* work on a private copy once unlocked */
+
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* brin_doupdate is called with the page unlocked */
+
+ if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno,
+ buf, off, tup, sz, tup, sz, false))
+ off--; /* retry */
+
+ LockBuffer(buf, BUFFER_LOCK_SHARE); /* re-lock before looking at the page again */
+
+ /* It's possible that someone extended the revmap over this page */
+ if (!BRIN_IS_REGULAR_PAGE(page))
+ break;
+ }
+ }
+
+ UnlockReleaseBuffer(buf);
+}
+
+/*
+ * Given a BRIN index page, initialize it if necessary, and record its
+ * current free space in the FSM.
+ *
+ * The main use for this is when, during vacuuming, an uninitialized page is
+ * found, which could be the result of relation extension followed by a crash
+ * before the page can be used.
+ *
+ * Here, we don't bother to update upper FSM pages, instead expecting that our
+ * caller (brin_vacuum_scan) will fix them at the end of the scan. Elsewhere
+ * in this file, it's generally a good idea to propagate additions of free
+ * space into the upper FSM pages immediately.
+ *
+ * buf is not locked on entry and is not locked on exit; this function
+ * neither acquires nor releases a pin on it. Metapages and revmap pages are
+ * left alone.
+ */
+void
+brin_page_cleanup(Relation idxrel, Buffer buf)
+{
+ Page page = BufferGetPage(buf);
+
+ /*
+ * If a page was left uninitialized, initialize it now; also record it in
+ * FSM.
+ *
+ * Somebody else might be extending the relation concurrently. To avoid
+ * re-initializing the page before they can grab the buffer lock, we
+ * acquire the extension lock momentarily. Since they hold the extension
+ * lock from before getting the page and after it's been initialized, we're
+ * sure to see their initialization.
+ */
+ if (PageIsNew(page))
+ {
+ LockRelationForExtension(idxrel, ShareLock);
+ UnlockRelationForExtension(idxrel, ShareLock);
+
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ if (PageIsNew(page)) /* recheck now that we hold the buffer lock */
+ {
+ brin_initialize_empty_new_buffer(idxrel, buf); /* also records the page in the FSM */
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ return;
+ }
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ }
+
+ /* Nothing to be done for non-regular index pages */
+ if (BRIN_IS_META_PAGE(BufferGetPage(buf)) ||
+ BRIN_IS_REVMAP_PAGE(BufferGetPage(buf)))
+ return;
+
+ /* Measure free space and record it */
+ RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
+ br_page_get_freespace(page));
+}
+
+/*
+ * Return a pinned and exclusively locked buffer which can be used to insert an
+ * index item of size itemsz (caller must ensure not to request sizes
+ * impossible to fulfill). If oldbuf is a valid buffer, it is also locked (in
+ * an order determined to avoid deadlocks).
+ *
+ * The lock order is by block number: whichever of the old and new page has
+ * the lower block number is locked first. The returned buffer may be oldbuf
+ * itself, if the chosen target block turns out to be the old page.
+ *
+ * If we find that the old page is no longer a regular index page (because
+ * of a revmap extension), the old buffer is unlocked and we return
+ * InvalidBuffer.
+ *
+ * If there's no existing page with enough free space to accommodate the new
+ * item, the relation is extended. If this happens, *extended is set to true,
+ * and it is the caller's responsibility to initialize the page (and WAL-log
+ * that fact) prior to use. The caller should also update the FSM with the
+ * page's remaining free space after the insertion.
+ *
+ * Note that the caller is not expected to update FSM unless *extended is set
+ * true. This policy means that we'll update FSM when a page is created, and
+ * when it's found to have too little space for a desired tuple insertion,
+ * but not every single time we add a tuple to the page.
+ *
+ * Note that in some corner cases it is possible for this routine to extend
+ * the relation and then not return the new page. It is this routine's
+ * responsibility to WAL-log the page initialization and to record the page in
+ * FSM if that happens, since the caller certainly can't do it.
+ */
+static Buffer
+brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
+ bool *extended)
+{
+ BlockNumber oldblk;
+ BlockNumber newblk;
+ Page page;
+ Size freespace;
+
+ /* callers must have checked */
+ Assert(itemsz <= BrinMaxItemSize);
+
+ if (BufferIsValid(oldbuf))
+ oldblk = BufferGetBlockNumber(oldbuf);
+ else
+ oldblk = InvalidBlockNumber;
+
+ /* Choose initial target page, re-using existing target if known */
+ newblk = RelationGetTargetBlock(irel);
+ if (newblk == InvalidBlockNumber)
+ newblk = GetPageWithFreeSpace(irel, itemsz);
+
+ /*
+ * Loop until we find a page with sufficient free space. By the time we
+ * return to caller out of this loop, both buffers are valid and locked;
+ * if we have to restart here, neither page is locked and newblk isn't
+ * pinned (if it's even valid).
+ */
+ for (;;)
+ {
+ Buffer buf;
+ bool extensionLockHeld = false;
+
+ CHECK_FOR_INTERRUPTS();
+
+ *extended = false;
+
+ if (newblk == InvalidBlockNumber)
+ {
+ /*
+ * There's not enough free space in any existing index page,
+ * according to the FSM: extend the relation to obtain a shiny new
+ * page. The extension lock is skipped when RELATION_IS_LOCAL
+ * says the relation is local.
+ */
+ if (!RELATION_IS_LOCAL(irel))
+ {
+ LockRelationForExtension(irel, ExclusiveLock);
+ extensionLockHeld = true;
+ }
+ buf = ReadBuffer(irel, P_NEW);
+ newblk = BufferGetBlockNumber(buf);
+ *extended = true;
+
+ BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u",
+ BufferGetBlockNumber(buf)));
+ }
+ else if (newblk == oldblk)
+ {
+ /*
+ * There's an odd corner-case here where the FSM is out-of-date,
+ * and gave us the old page. Reuse the caller's pin rather than
+ * taking another one.
+ */
+ buf = oldbuf;
+ }
+ else
+ {
+ buf = ReadBuffer(irel, newblk);
+ }
+
+ /*
+ * We lock the old buffer first, if it's earlier than the new one; but
+ * then we need to check that it hasn't been turned into a revmap page
+ * concurrently. If we detect that that happened, give up and tell
+ * caller to start over.
+ */
+ if (BufferIsValid(oldbuf) && oldblk < newblk)
+ {
+ LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+ if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
+ {
+ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+
+ /*
+ * It is possible that the new page was obtained from
+ * extending the relation. In that case, we must be sure to
+ * record it in the FSM before leaving, because otherwise the
+ * space would be lost forever. However, we cannot let an
+ * uninitialized page get in the FSM, so we need to initialize
+ * it first.
+ */
+ if (*extended)
+ brin_initialize_empty_new_buffer(irel, buf);
+
+ if (extensionLockHeld)
+ UnlockRelationForExtension(irel, ExclusiveLock);
+
+ ReleaseBuffer(buf); /* buf was pinned but never locked on this path */
+
+ if (*extended)
+ {
+ FreeSpaceMapVacuumRange(irel, newblk, newblk + 1);
+ /* shouldn't matter, but don't confuse caller */
+ *extended = false;
+ }
+
+ return InvalidBuffer;
+ }
+ }
+
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+ if (extensionLockHeld)
+ UnlockRelationForExtension(irel, ExclusiveLock);
+
+ page = BufferGetPage(buf);
+
+ /*
+ * We have a new buffer to insert into. Check that the new page has
+ * enough free space, and return it if it does; otherwise start over.
+ * (br_page_get_freespace also checks that the FSM didn't hand us a
+ * page that has since been repurposed for the revmap.) A page we
+ * just added is not initialized yet, so it is credited with
+ * BrinMaxItemSize instead of being measured.
+ */
+ freespace = *extended ?
+ BrinMaxItemSize : br_page_get_freespace(page);
+ if (freespace >= itemsz)
+ {
+ RelationSetTargetBlock(irel, newblk);
+
+ /*
+ * Lock the old buffer if not locked already. Note that in this
+ * case we know for sure it's a regular page: it's later than the
+ * new page we just got, which is not a revmap page, and revmap
+ * pages are always consecutive.
+ */
+ if (BufferIsValid(oldbuf) && oldblk > newblk)
+ {
+ LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+ Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
+ }
+
+ return buf;
+ }
+
+ /* This page is no good. */
+
+ /*
+ * If an entirely new page does not contain enough free space for the
+ * new item, then surely that item is oversized. Complain loudly; but
+ * first make sure we initialize the page and record it as free, for
+ * next time.
+ */
+ if (*extended)
+ {
+ brin_initialize_empty_new_buffer(irel, buf);
+ /* since this should not happen, skip FreeSpaceMapVacuum */
+
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
+ itemsz, freespace, RelationGetRelationName(irel))));
+ return InvalidBuffer; /* keep compiler quiet */
+ }
+
+ if (newblk != oldblk)
+ UnlockReleaseBuffer(buf); /* when equal, buf is oldbuf and is unlocked just below */
+ if (BufferIsValid(oldbuf) && oldblk <= newblk)
+ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); /* only locked in these cases; the pin is kept */
+
+ /*
+ * Update the FSM with the new, presumably smaller, freespace value
+ * for this page, then search for a new target page.
+ */
+ newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
+ }
+}
+
+/*
+ * Turn the page in 'buffer' into an empty regular BRIN page, WAL-log that,
+ * and enter the page into the FSM.
+ *
+ * In several corner cases we extend the relation to get a new page and then
+ * discover that we cannot use it right away.  We must not let such a page go
+ * unrecorded in the FSM: nothing would ever reclaim the space, and the index
+ * would bloat.  Moreover, since the action that would normally initialize the
+ * page is never WAL-logged in that case, the page would remain uninitialized
+ * on a standby (or after recovery).
+ *
+ * Only the FSM entry for the page itself is written here; updating the upper
+ * FSM pages, if that seems appropriate, is up to the caller.
+ */
+static void
+brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
+{
+	Page		newpage = BufferGetPage(buffer);
+	BlockNumber blkno = BufferGetBlockNumber(buffer);
+
+	BRIN_elog((DEBUG2,
+			   "brin_initialize_empty_new_buffer: initializing blank page %u",
+			   blkno));
+
+	START_CRIT_SECTION();
+	brin_page_init(newpage, BRIN_PAGETYPE_REGULAR);
+	MarkBufferDirty(buffer);
+	log_newpage_buffer(buffer, true);
+	END_CRIT_SECTION();
+
+	/*
+	 * The FSM update is not WAL-logged.  That is acceptable: VACUUM scans
+	 * the index and re-records pages whose FSM entries were forgotten in a
+	 * crash.
+	 */
+	RecordPageWithFreeSpace(idxrel, blkno, br_page_get_freespace(newpage));
+}
+
+
+/*
+ * Compute how much space is available for new tuples on a BRIN index page.
+ *
+ * Only a regular page that is not flagged BRIN_EVACUATE_PAGE can accept
+ * tuples; for any other page the answer is 0.
+ */
+static Size
+br_page_get_freespace(Page page)
+{
+	if (BRIN_IS_REGULAR_PAGE(page) &&
+		(BrinPageFlags(page) & BRIN_EVACUATE_PAGE) == 0)
+		return PageGetFreeSpace(page);
+
+	/* not a regular page, or one that is being evacuated */
+	return 0;
+}
diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c
new file mode 100644
index 0000000..c574c8a
--- /dev/null
+++ b/src/backend/access/brin/brin_revmap.c
@@ -0,0 +1,664 @@
+/*
+ * brin_revmap.c
+ * Range map for BRIN indexes
+ *
+ * The range map (revmap) is a translation structure for BRIN indexes: for each
+ * page range there is one summary tuple, and its location is tracked by the
+ * revmap. Whenever a new tuple is inserted into a table that violates the
+ * previously recorded summary values, a new tuple is inserted into the index
+ * and the revmap is updated to point to it.
+ *
+ * The revmap is stored in the first pages of the index, immediately following
+ * the metapage. When the revmap needs to be expanded, all tuples on the
+ * regular BRIN page at that block (if any) are moved out of the way.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/brin/brin_revmap.c
+ */
+#include "postgres.h"
+
+#include "access/brin_page.h"
+#include "access/brin_pageops.h"
+#include "access/brin_revmap.h"
+#include "access/brin_tuple.h"
+#include "access/brin_xlog.h"
+#include "access/rmgr.h"
+#include "access/xloginsert.h"
+#include "miscadmin.h"
+#include "storage/bufmgr.h"
+#include "storage/lmgr.h"
+#include "utils/rel.h"
+
+
+/*
+ * In revmap pages, each item stores an ItemPointerData.  These macros let one
+ * find the logical revmap page number and the index of the revmap item within
+ * that page for the given heap block number.
+ *
+ * The heap block is first reduced to its range number (heapBlk divided by
+ * pagesPerRange); the quotient and remainder of that by REVMAP_PAGE_MAXITEMS
+ * give the page and the slot, respectively.  The arguments are parenthesized
+ * so that expressions can be passed safely.
+ */
+#define HEAPBLK_TO_REVMAP_BLK(pagesPerRange, heapBlk) \
+	(((heapBlk) / (pagesPerRange)) / REVMAP_PAGE_MAXITEMS)
+#define HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk) \
+	(((heapBlk) / (pagesPerRange)) % REVMAP_PAGE_MAXITEMS)
+
+
+struct BrinRevmap
+{
+ Relation rm_irel; /* index relation this revmap belongs to */
+ BlockNumber rm_pagesPerRange; /* cached from the metapage */
+ BlockNumber rm_lastRevmapPage; /* cached from the metapage */
+ Buffer rm_metaBuf; /* metapage buffer, pinned until brinRevmapTerminate */
+ Buffer rm_currBuf; /* pinned revmap page buffer, or InvalidBuffer */
+};
+
+/* typedef appears in brin_revmap.h */
+
+
+static BlockNumber revmap_get_blkno(BrinRevmap *revmap,
+ BlockNumber heapBlk);
+static Buffer revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk);
+static BlockNumber revmap_extend_and_get_blkno(BrinRevmap *revmap,
+ BlockNumber heapBlk);
+static void revmap_physical_extend(BrinRevmap *revmap);
+
+/*
+ * Create an access object for the range map of the given BRIN index.
+ *
+ * The metapage is read under a share lock to pick up the pages-per-range
+ * setting and the last revmap page number; the pages-per-range value is also
+ * handed back through *pagesPerRange.  The metapage stays pinned (but not
+ * locked) for the life of the object, which the caller must dispose of with
+ * brinRevmapTerminate.
+ */
+BrinRevmap *
+brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange,
+					 Snapshot snapshot)
+{
+	Buffer		metabuf;
+	Page		metapage;
+	BrinMetaPageData *md;
+	BrinRevmap *result;
+
+	metabuf = ReadBuffer(idxrel, BRIN_METAPAGE_BLKNO);
+	LockBuffer(metabuf, BUFFER_LOCK_SHARE);
+	metapage = BufferGetPage(metabuf);
+	TestForOldSnapshot(snapshot, idxrel, metapage);
+	md = (BrinMetaPageData *) PageGetContents(metapage);
+
+	result = palloc(sizeof(BrinRevmap));
+	result->rm_irel = idxrel;
+	result->rm_metaBuf = metabuf;
+	result->rm_currBuf = InvalidBuffer;
+	result->rm_pagesPerRange = md->pagesPerRange;
+	result->rm_lastRevmapPage = md->lastRevmapPage;
+
+	*pagesPerRange = result->rm_pagesPerRange;
+
+	/* metapage contents are cached now; keep the pin, drop the lock */
+	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+
+	return result;
+}
+
+/*
+ * Dispose of a revmap access object: drop the metapage pin, drop the pin on
+ * the current revmap page if one is held, and free the object itself.
+ */
+void
+brinRevmapTerminate(BrinRevmap *revmap)
+{
+	Buffer		metaBuf = revmap->rm_metaBuf;
+	Buffer		currBuf = revmap->rm_currBuf;
+
+	ReleaseBuffer(metaBuf);
+
+	if (currBuf != InvalidBuffer)
+		ReleaseBuffer(currBuf);
+
+	pfree(revmap);
+}
+
+/*
+ * Make sure the revmap is long enough to hold the entry for the given heap
+ * block number, extending it if needed.
+ */
+void
+brinRevmapExtend(BrinRevmap *revmap, BlockNumber heapBlk)
+{
+	BlockNumber targetBlk PG_USED_FOR_ASSERTS_ONLY;
+
+	targetBlk = revmap_extend_and_get_blkno(revmap, heapBlk);
+
+	/* The block we got must be a real revmap block, not the metapage */
+	Assert(targetBlk != InvalidBlockNumber &&
+		   targetBlk != BRIN_METAPAGE_BLKNO &&
+		   targetBlk <= revmap->rm_lastRevmapPage);
+}
+
+/*
+ * Get ready to store a revmap entry for heapBlk: look up the revmap buffer
+ * that holds the entry, lock it exclusively, and return it.
+ *
+ * The revmap is not extended here, so most callers should have called
+ * brinRevmapExtend first.
+ *
+ * The buffer is also remembered in the revmap struct, and is released when
+ * that struct is finished with; the caller therefore only has to unlock it,
+ * not release it.
+ */
+Buffer
+brinLockRevmapPageForUpdate(BrinRevmap *revmap, BlockNumber heapBlk)
+{
+	Buffer		lockedBuf = revmap_get_buffer(revmap, heapBlk);
+
+	LockBuffer(lockedBuf, BUFFER_LOCK_EXCLUSIVE);
+
+	return lockedBuf;
+}
+
+/*
+ * Store 'tid' as the revmap entry for heap block heapBlk.
+ *
+ * 'buf' is the revmap buffer that holds the entry, already pinned and
+ * suitably locked by the caller; pagesPerRange is the range size of the BRIN
+ * index, used to locate the entry within the page.  An invalid tid marks the
+ * entry invalid.
+ *
+ * The caller is responsible for updating the buffer's LSN once the operation
+ * is complete.
+ *
+ * This runs both during regular operation and during WAL replay.
+ */
+void
+brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange,
+						BlockNumber heapBlk, ItemPointerData tid)
+{
+	Page		revmapPage = BufferGetPage(buf);
+	RevmapContents *rmContents;
+	ItemPointerData *slot;
+
+	/* locate the slot for this range within the page's TID array */
+	rmContents = (RevmapContents *) PageGetContents(revmapPage);
+	slot = (ItemPointerData *) rmContents->rm_tids +
+		HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk);
+
+	if (!ItemPointerIsValid(&tid))
+	{
+		ItemPointerSetInvalid(slot);
+		return;
+	}
+
+	ItemPointerSet(slot,
+				   ItemPointerGetBlockNumber(&tid),
+				   ItemPointerGetOffsetNumber(&tid));
+}
+
+/*
+ * Fetch the BrinTuple for a given heap block.
+ *
+ * The buffer containing the tuple is locked, and returned in *buf. The
+ * returned tuple points to the shared buffer and must not be freed; if caller
+ * wants to use it after releasing the buffer lock, it must create its own
+ * palloc'ed copy. As an optimization, the caller can pass a pinned buffer
+ * *buf on entry, which will avoid a pin-unpin cycle when the next tuple is on
+ * the same page as a previous one.
+ *
+ * 'mode' is the lock mode passed to LockBuffer for the tuple's buffer, and
+ * 'snapshot' is passed to TestForOldSnapshot for that page.
+ *
+ * If no tuple is found for the given heap range, returns NULL. In that case,
+ * *buf might still be updated (and pin must be released by caller), but it's
+ * not locked.
+ *
+ * The output tuple offset within the buffer is returned in *off, and its size
+ * is returned in *size. 'size' may be NULL if the caller does not need it.
+ * *off is set to InvalidOffsetNumber when the revmap page for the range does
+ * not exist.
+ */
+BrinTuple *
+brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
+ Buffer *buf, OffsetNumber *off, Size *size, int mode,
+ Snapshot snapshot)
+{
+ Relation idxRel = revmap->rm_irel;
+ BlockNumber mapBlk;
+ RevmapContents *contents;
+ ItemPointerData *iptr;
+ BlockNumber blk;
+ Page page;
+ ItemId lp;
+ BrinTuple *tup;
+ ItemPointerData previptr;
+
+ /* normalize the heap block number to be the first page in the range */
+ heapBlk = (heapBlk / revmap->rm_pagesPerRange) * revmap->rm_pagesPerRange;
+
+ /*
+ * Compute the revmap page number we need. If Invalid is returned (i.e.,
+ * the revmap page hasn't been created yet), the requested page range is
+ * not summarized.
+ */
+ mapBlk = revmap_get_blkno(revmap, heapBlk);
+ if (mapBlk == InvalidBlockNumber)
+ {
+ *off = InvalidOffsetNumber;
+ return NULL;
+ }
+
+ ItemPointerSetInvalid(&previptr);
+ for (;;)
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ /* Make rm_currBuf hold a pin on the revmap page we need */
+ if (revmap->rm_currBuf == InvalidBuffer ||
+ BufferGetBlockNumber(revmap->rm_currBuf) != mapBlk)
+ {
+ if (revmap->rm_currBuf != InvalidBuffer)
+ ReleaseBuffer(revmap->rm_currBuf);
+
+ Assert(mapBlk != InvalidBlockNumber);
+ revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk);
+ }
+
+ LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_SHARE);
+
+ contents = (RevmapContents *)
+ PageGetContents(BufferGetPage(revmap->rm_currBuf));
+ iptr = contents->rm_tids;
+ iptr += HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk);
+
+ if (!ItemPointerIsValid(iptr))
+ {
+ LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK);
+ return NULL; /* no revmap entry: range is not summarized */
+ }
+
+ /*
+ * Check the TID we got in a previous iteration, if any, and save the
+ * current TID we got from the revmap; if we loop, we can sanity-check
+ * that the next one we get is different. Otherwise we might be stuck
+ * looping forever if the revmap is somehow badly broken.
+ */
+ if (ItemPointerIsValid(&previptr) && ItemPointerEquals(&previptr, iptr))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("corrupted BRIN index: inconsistent range map")));
+ previptr = *iptr;
+
+ blk = ItemPointerGetBlockNumber(iptr);
+ *off = ItemPointerGetOffsetNumber(iptr);
+
+ LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK);
+
+ /* Ok, got a pointer to where the BrinTuple should be. Fetch it. */
+ if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != blk)
+ {
+ if (BufferIsValid(*buf))
+ ReleaseBuffer(*buf);
+ *buf = ReadBuffer(idxRel, blk);
+ }
+ LockBuffer(*buf, mode);
+ page = BufferGetPage(*buf);
+ TestForOldSnapshot(snapshot, idxRel, page);
+
+ /* Only look for the tuple on a regular page; if we landed on a revmap page instead, fall through and start over */
+ if (BRIN_IS_REGULAR_PAGE(page))
+ {
+ /*
+ * If the offset number is greater than what's in the page, it's
+ * possible that the range was desummarized concurrently. Just
+ * return NULL to handle that case.
+ */
+ if (*off > PageGetMaxOffsetNumber(page))
+ {
+ LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
+ return NULL;
+ }
+
+ lp = PageGetItemId(page, *off);
+ if (ItemIdIsUsed(lp))
+ {
+ tup = (BrinTuple *) PageGetItem(page, lp);
+
+ if (tup->bt_blkno == heapBlk)
+ {
+ if (size)
+ *size = ItemIdGetLength(lp);
+ /* found it! (*buf is returned still locked) */
+ return tup;
+ }
+ }
+ }
+
+ /*
+ * No luck. Assume that the revmap was updated concurrently.
+ */
+ LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
+ }
+ /* not reached, but keep compiler quiet */
+ return NULL;
+}
+
+/*
+ * Delete an index tuple, marking a page range as unsummarized.
+ *
+ * idxrel is the BRIN index and heapBlk a heap block in the range to
+ * desummarize.  The revmap entry for the range is set invalid and the
+ * summary tuple it pointed to is removed from its page; both changes are
+ * covered by a single WAL record when the index needs WAL.
+ *
+ * Index must be locked in ShareUpdateExclusiveLock mode.
+ *
+ * Returns true when the range is unsummarized on exit, either because it
+ * already was or because the tuple has just been removed.  Returns false if
+ * the caller should retry, which happens when the page the revmap pointed to
+ * is no longer a regular page.  No buffer lock or pin is left behind on
+ * either return path.
+ */
+bool
+brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk)
+{
+	BrinRevmap *revmap;
+	BlockNumber pagesPerRange;
+	RevmapContents *contents;
+	ItemPointerData *iptr;
+	ItemPointerData invalidIptr;
+	BlockNumber revmapBlk;
+	Buffer		revmapBuf;
+	Buffer		regBuf;
+	Page		revmapPg;
+	Page		regPg;
+	OffsetNumber revmapOffset;
+	OffsetNumber regOffset;
+	ItemId		lp;
+
+	revmap = brinRevmapInitialize(idxrel, &pagesPerRange, NULL);
+
+	revmapBlk = revmap_get_blkno(revmap, heapBlk);
+	if (!BlockNumberIsValid(revmapBlk))
+	{
+		/* revmap page doesn't exist: range not summarized, we're done */
+		brinRevmapTerminate(revmap);
+		return true;
+	}
+
+	/*
+	 * Lock the revmap page, obtain the index tuple pointer from it.  The
+	 * revmap buffer's pin is owned by the revmap struct, so we only ever
+	 * unlock it here; brinRevmapTerminate drops the pin.
+	 */
+	revmapBuf = brinLockRevmapPageForUpdate(revmap, heapBlk);
+	revmapPg = BufferGetPage(revmapBuf);
+	revmapOffset = HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk);
+
+	contents = (RevmapContents *) PageGetContents(revmapPg);
+	iptr = contents->rm_tids;
+	iptr += revmapOffset;
+
+	if (!ItemPointerIsValid(iptr))
+	{
+		/* no index tuple: range not summarized, we're done */
+		LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK);
+		brinRevmapTerminate(revmap);
+		return true;
+	}
+
+	regBuf = ReadBuffer(idxrel, ItemPointerGetBlockNumber(iptr));
+	LockBuffer(regBuf, BUFFER_LOCK_EXCLUSIVE);
+	regPg = BufferGetPage(regBuf);
+
+	/*
+	 * We're only removing data, not reading it, so there's no need to
+	 * TestForOldSnapshot here.
+	 */
+
+	/* if this is no longer a regular page, tell caller to start over */
+	if (!BRIN_IS_REGULAR_PAGE(regPg))
+	{
+		LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK);
+
+		/*
+		 * regBuf was pinned by our own ReadBuffer call above, so the pin
+		 * must be dropped along with the lock; merely unlocking would leak
+		 * the pin every time the caller retries.
+		 */
+		UnlockReleaseBuffer(regBuf);
+		brinRevmapTerminate(revmap);
+		return false;
+	}
+
+	regOffset = ItemPointerGetOffsetNumber(iptr);
+	if (regOffset > PageGetMaxOffsetNumber(regPg))
+		ereport(ERROR,
+				(errcode(ERRCODE_INDEX_CORRUPTED),
+				 errmsg("corrupted BRIN index: inconsistent range map")));
+
+	lp = PageGetItemId(regPg, regOffset);
+	if (!ItemIdIsUsed(lp))
+		ereport(ERROR,
+				(errcode(ERRCODE_INDEX_CORRUPTED),
+				 errmsg("corrupted BRIN index: inconsistent range map")));
+
+	/*
+	 * Placeholder tuples only appear during unfinished summarization, and we
+	 * hold ShareUpdateExclusiveLock, so this function cannot run concurrently
+	 * with that.  So any placeholder tuples that exist are leftovers from a
+	 * crashed or aborted summarization; remove them silently.
+	 */
+
+	START_CRIT_SECTION();
+
+	ItemPointerSetInvalid(&invalidIptr);
+	brinSetHeapBlockItemptr(revmapBuf, revmap->rm_pagesPerRange, heapBlk,
+							invalidIptr);
+	PageIndexTupleDeleteNoCompact(regPg, regOffset);
+	/* XXX record free space in FSM? */
+
+	MarkBufferDirty(regBuf);
+	MarkBufferDirty(revmapBuf);
+
+	/* One WAL record covers the revmap page (0) and the regular page (1) */
+	if (RelationNeedsWAL(idxrel))
+	{
+		xl_brin_desummarize xlrec;
+		XLogRecPtr	recptr;
+
+		xlrec.pagesPerRange = revmap->rm_pagesPerRange;
+		xlrec.heapBlk = heapBlk;
+		xlrec.regOffset = regOffset;
+
+		XLogBeginInsert();
+		XLogRegisterData((char *) &xlrec, SizeOfBrinDesummarize);
+		XLogRegisterBuffer(0, revmapBuf, 0);
+		XLogRegisterBuffer(1, regBuf, REGBUF_STANDARD);
+		recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_DESUMMARIZE);
+		PageSetLSN(revmapPg, recptr);
+		PageSetLSN(regPg, recptr);
+	}
+
+	END_CRIT_SECTION();
+
+	UnlockReleaseBuffer(regBuf);
+	LockBuffer(revmapBuf, BUFFER_LOCK_UNLOCK);
+	brinRevmapTerminate(revmap);
+
+	return true;
+}
+
+/*
+ * Map a heap block number onto the physical block number of the revmap page
+ * holding its entry.
+ *
+ * Returns InvalidBlockNumber when that page lies beyond the last revmap page
+ * known to the access struct, i.e. it has not been allocated yet.
+ */
+static BlockNumber
+revmap_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk)
+{
+    BlockNumber mapBlk;
+
+    /* logical revmap page number, shifted by one to step over the metapage */
+    mapBlk = HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1;
+
+    /* not allocated yet: there is nothing to hand back */
+    if (mapBlk > revmap->rm_lastRevmapPage)
+        return InvalidBlockNumber;
+
+    return mapBlk;
+}
+
+/*
+ * Return a buffer containing the revmap page that covers the given heap
+ * block.  The revmap must already have been extended far enough to cover
+ * that block; otherwise an error is raised.
+ *
+ * The buffer is remembered in revmap->rm_currBuf, so terminating the revmap
+ * access releases it and the caller needn't do so explicitly.
+ */
+static Buffer
+revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk)
+{
+    BlockNumber wantBlk;
+    Buffer      cached;
+
+    /* Where in the index does the entry for this heap block live? */
+    wantBlk = revmap_get_blkno(revmap, heapBlk);
+    if (wantBlk == InvalidBlockNumber)
+        elog(ERROR, "revmap does not cover heap block %u", heapBlk);
+
+    /* Sanity check: must be a revmap page, never the metapage */
+    Assert(wantBlk != BRIN_METAPAGE_BLKNO &&
+           wantBlk <= revmap->rm_lastRevmapPage);
+
+    /* Fast path: the buffer we are holding is already the right page. */
+    cached = revmap->rm_currBuf;
+    if (cached != InvalidBuffer && BufferGetBlockNumber(cached) == wantBlk)
+        return cached;
+
+    /* Wrong page (or none at all): drop what we hold and read the new one. */
+    if (cached != InvalidBuffer)
+        ReleaseBuffer(cached);
+
+    revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, wantBlk);
+
+    return revmap->rm_currBuf;
+}
+
+/*
+ * Like revmap_get_blkno, but never gives up: if the revmap page for the given
+ * heap block does not exist yet, keep extending the revmap until it does,
+ * then return its physical block number.
+ */
+static BlockNumber
+revmap_extend_and_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk)
+{
+    BlockNumber wanted;
+
+    /* revmap page number; the +1 accounts for the metapage block */
+    wanted = HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1;
+
+    /*
+     * A single extension attempt may legitimately make no progress, so loop
+     * until the cached last-revmap-page value has caught up with our target.
+     */
+    for (;;)
+    {
+        if (wanted <= revmap->rm_lastRevmapPage)
+            break;
+
+        CHECK_FOR_INTERRUPTS();
+        revmap_physical_extend(revmap);
+    }
+
+    return wanted;
+}
+
+/*
+ * Try to extend the revmap by one page. This might not happen for a number of
+ * reasons; caller is expected to retry until the expected outcome is obtained.
+ *
+ * The function returns without having extended the revmap when:
+ * (a) the cached revmap->rm_lastRevmapPage does not match the metapage (the
+ * cached copy is refreshed before returning);
+ * (b) the relation was extended concurrently, so the newly added block is
+ * not the one we need; or
+ * (c) the target page holds data and had to be evacuated first.
+ *
+ * Note that even on success only the metapage's lastRevmapPage is advanced
+ * here; the cached revmap->rm_lastRevmapPage is brought up to date by the
+ * staleness check in case (a) on the next call.
+ */
+static void
+revmap_physical_extend(BrinRevmap *revmap)
+{
+ Buffer buf;
+ Page page;
+ Page metapage;
+ BrinMetaPageData *metadata;
+ BlockNumber mapBlk;
+ BlockNumber nblocks;
+ Relation irel = revmap->rm_irel;
+ bool needLock = !RELATION_IS_LOCAL(irel);
+
+ /*
+ * Lock the metapage. This locks out concurrent extensions of the revmap,
+ * but note that we still need to grab the relation extension lock because
+ * another backend can extend the index with regular BRIN pages.
+ */
+ LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_EXCLUSIVE);
+ metapage = BufferGetPage(revmap->rm_metaBuf);
+ metadata = (BrinMetaPageData *) PageGetContents(metapage);
+
+ /*
+ * Check that our cached lastRevmapPage value was up-to-date; if it
+ * wasn't, update the cached copy and have caller start over.
+ */
+ if (metadata->lastRevmapPage != revmap->rm_lastRevmapPage)
+ {
+ revmap->rm_lastRevmapPage = metadata->lastRevmapPage;
+ LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
+ return;
+ }
+ /* the new revmap page goes right after the current last one */
+ mapBlk = metadata->lastRevmapPage + 1;
+
+ /*
+ * If the block we want already exists in the relation, lock it in place;
+ * otherwise extend the relation by one block and verify that the new
+ * block is the one we were after.
+ */
+ nblocks = RelationGetNumberOfBlocks(irel);
+ if (mapBlk < nblocks)
+ {
+ buf = ReadBuffer(irel, mapBlk);
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ page = BufferGetPage(buf);
+ }
+ else
+ {
+ if (needLock)
+ LockRelationForExtension(irel, ExclusiveLock);
+
+ buf = ReadBuffer(irel, P_NEW);
+ if (BufferGetBlockNumber(buf) != mapBlk)
+ {
+ /*
+ * Very rare corner case: somebody extended the relation
+ * concurrently after we read its length. If this happens, give
+ * up and have caller start over. We will have to evacuate that
+ * page from under whoever is using it.
+ */
+ if (needLock)
+ UnlockRelationForExtension(irel, ExclusiveLock);
+ LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
+ ReleaseBuffer(buf);
+ return;
+ }
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ page = BufferGetPage(buf);
+
+ if (needLock)
+ UnlockRelationForExtension(irel, ExclusiveLock);
+ }
+
+ /* Check that it's a regular block (or an empty page) */
+ if (!PageIsNew(page) && !BRIN_IS_REGULAR_PAGE(page))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("unexpected page type 0x%04X in BRIN index \"%s\" block %u",
+ BrinPageType(page),
+ RelationGetRelationName(irel),
+ BufferGetBlockNumber(buf))));
+
+ /*
+ * If the page is in use, evacuate it and restart. The metapage lock is
+ * dropped before evacuating; buf is handed to brin_evacuate_page and is
+ * not touched again here.
+ */
+ if (brin_start_evacuating_page(irel, buf))
+ {
+ LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
+ brin_evacuate_page(irel, revmap->rm_pagesPerRange, revmap, buf);
+
+ /* have caller start over */
+ return;
+ }
+
+ /*
+ * Ok, we have now locked the metapage and the target block. Re-initialize
+ * the target block as a revmap page, and update the metapage.
+ */
+ START_CRIT_SECTION();
+
+ /* the rm_tids array is initialized to all invalid by PageInit */
+ brin_page_init(page, BRIN_PAGETYPE_REVMAP);
+ MarkBufferDirty(buf);
+
+ metadata->lastRevmapPage = mapBlk;
+
+ /*
+ * Set pd_lower just past the end of the metadata. This is essential,
+ * because without doing so, metadata will be lost if xlog.c compresses
+ * the page. (We must do this here because pre-v11 versions of PG did not
+ * set the metapage's pd_lower correctly, so a pg_upgraded index might
+ * contain the wrong value.)
+ */
+ ((PageHeader) metapage)->pd_lower =
+ ((char *) metadata + sizeof(BrinMetaPageData)) - (char *) metapage;
+
+ MarkBufferDirty(revmap->rm_metaBuf);
+
+ /* WAL-log the change: block 0 is the metapage, block 1 the new revmap page */
+ if (RelationNeedsWAL(revmap->rm_irel))
+ {
+ xl_brin_revmap_extend xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.targetBlk = mapBlk;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBrinRevmapExtend);
+ XLogRegisterBuffer(0, revmap->rm_metaBuf, REGBUF_STANDARD);
+
+ XLogRegisterBuffer(1, buf, REGBUF_WILL_INIT);
+
+ recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_REVMAP_EXTEND);
+ PageSetLSN(metapage, recptr);
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ /* the metapage buffer stays owned by the revmap struct; only unlock it */
+ LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
+
+ UnlockReleaseBuffer(buf);
+}
diff --git a/src/backend/access/brin/brin_tuple.c b/src/backend/access/brin/brin_tuple.c
new file mode 100644
index 0000000..09e563b
--- /dev/null
+++ b/src/backend/access/brin/brin_tuple.c
@@ -0,0 +1,708 @@
+/*
+ * brin_tuple.c
+ * Method implementations for tuples in BRIN indexes.
+ *
+ * Intended usage is that code outside this file only deals with
+ * BrinMemTuples, and converts to and from the on-disk representation through
+ * functions in this file.
+ *
+ * NOTES
+ *
+ * A BRIN tuple is similar to a heap tuple, with a few key differences. The
+ * first interesting difference is that the tuple header is much simpler, only
+ * containing its total length and a small area for flags. Also, the stored
+ * data does not match the relation tuple descriptor exactly: for each
+ * attribute in the descriptor, the index tuple carries an arbitrary number
+ * of values, depending on the opclass.
+ *
+ * Also, for each column of the index relation there are two null bits: one
+ * (hasnulls) stores whether any tuple within the page range has that column
+ * set to null; the other one (allnulls) stores whether the column values are
+ * all null. If allnulls is true, then the tuple data area does not contain
+ * values for that column at all; whereas it does if the hasnulls is set.
+ * Note the size of the null bitmask may not be the same as that of the
+ * datum array.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/brin/brin_tuple.c
+ */
+#include "postgres.h"
+
+#include "access/brin_tuple.h"
+#include "access/detoast.h"
+#include "access/heaptoast.h"
+#include "access/htup_details.h"
+#include "access/toast_internals.h"
+#include "access/tupdesc.h"
+#include "access/tupmacs.h"
+#include "utils/datum.h"
+#include "utils/memutils.h"
+
+
+/*
+ * This enables de-toasting of index entries. Needed until VACUUM is
+ * smart enough to rebuild indexes from scratch.
+ */
+#define TOAST_INDEX_HACK
+
+
+/* Forward declaration: on-disk tuple decoder, defined further down this file */
+static inline void brin_deconstruct_tuple(BrinDesc *brdesc,
+ char *tp, bits8 *nullbits, bool nulls,
+ Datum *values, bool *allnulls, bool *hasnulls);
+
+
+/*
+ * Return the tuple descriptor describing the on-disk layout of BRIN tuples:
+ * one entry per stored datum of every indexed column.
+ *
+ * The descriptor is built on first use and cached in brdesc->bd_disktdesc.
+ */
+static TupleDesc
+brtuple_disk_tupdesc(BrinDesc *brdesc)
+{
+    TupleDesc   diskdesc;
+    MemoryContext callercxt;
+    AttrNumber  nextatt = 1;
+    int         col;
+
+    /* Already built?  Then hand back the cached copy. */
+    if (brdesc->bd_disktdesc != NULL)
+        return brdesc->bd_disktdesc;
+
+    /* allocate the descriptor in the BrinDesc's own memory context */
+    callercxt = MemoryContextSwitchTo(brdesc->bd_context);
+
+    diskdesc = CreateTemplateTupleDesc(brdesc->bd_totalstored);
+
+    for (col = 0; col < brdesc->bd_tupdesc->natts; col++)
+    {
+        int         slot = 0;
+
+        /* one descriptor entry per datum the opclass stores for this column */
+        while (slot < brdesc->bd_info[col]->oi_nstored)
+        {
+            TupleDescInitEntry(diskdesc, nextatt, NULL,
+                               brdesc->bd_info[col]->oi_typcache[slot]->type_id,
+                               -1, 0);
+            nextatt++;
+            slot++;
+        }
+    }
+
+    MemoryContextSwitchTo(callercxt);
+
+    brdesc->bd_disktdesc = diskdesc;
+
+    return diskdesc;
+}
+
+/*
+ * Generate a new on-disk tuple to be inserted in a BRIN index.
+ *
+ * brdesc BRIN descriptor of the index
+ * blkno block number stored in the tuple header (bt_blkno)
+ * tuple in-memory tuple to serialize
+ * size output: total (MAXALIGN'ed) length of the returned tuple
+ *
+ * The result is a freshly palloc'd, zero-filled chunk laid out as: fixed
+ * header, optional double-length null bitmap (allnulls bits followed by
+ * hasnulls bits), then the data area at offset "hoff". The null bitmap is
+ * present only if some column has allnulls or hasnulls set.
+ *
+ * See brin_form_placeholder_tuple if you touch this.
+ */
+BrinTuple *
+brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
+ Size *size)
+{
+ Datum *values;
+ bool *nulls;
+ bool anynulls = false;
+ BrinTuple *rettuple;
+ int keyno;
+ int idxattno;
+ uint16 phony_infomask = 0;
+ bits8 *phony_nullbitmap;
+ Size len,
+ hoff,
+ data_len;
+ int i;
+
+#ifdef TOAST_INDEX_HACK
+ Datum *untoasted_values;
+ int nuntoasted = 0;
+#endif
+
+ Assert(brdesc->bd_totalstored > 0);
+
+ /* scratch arrays, one slot per stored datum; all freed before returning */
+ values = (Datum *) palloc(sizeof(Datum) * brdesc->bd_totalstored);
+ nulls = (bool *) palloc0(sizeof(bool) * brdesc->bd_totalstored);
+ phony_nullbitmap = (bits8 *)
+ palloc(sizeof(bits8) * BITMAPLEN(brdesc->bd_totalstored));
+
+#ifdef TOAST_INDEX_HACK
+ /* remembers temporary detoasted/compressed copies to pfree later */
+ untoasted_values = (Datum *) palloc(sizeof(Datum) * brdesc->bd_totalstored);
+#endif
+
+ /*
+ * Set up the values/nulls arrays for heap_fill_tuple
+ */
+ idxattno = 0;
+ for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
+ {
+ int datumno;
+
+ /*
+ * "allnulls" is set when there's no nonnull value in any row in the
+ * column; when this happens, there is no data to store. Thus set the
+ * nullable bits for all data elements of this column and we're done.
+ */
+ if (tuple->bt_columns[keyno].bv_allnulls)
+ {
+ for (datumno = 0;
+ datumno < brdesc->bd_info[keyno]->oi_nstored;
+ datumno++)
+ nulls[idxattno++] = true;
+ anynulls = true;
+ continue;
+ }
+
+ /*
+ * The "hasnulls" bit is set when there are some null values in the
+ * data. We still need to store a real value, but the presence of
+ * this means we need a null bitmap.
+ */
+ if (tuple->bt_columns[keyno].bv_hasnulls)
+ anynulls = true;
+
+ /* If needed, serialize the values before forming the on-disk tuple. */
+ if (tuple->bt_columns[keyno].bv_serialize)
+ {
+ tuple->bt_columns[keyno].bv_serialize(brdesc,
+ tuple->bt_columns[keyno].bv_mem_value,
+ tuple->bt_columns[keyno].bv_values);
+ }
+
+ /*
+ * Now obtain the values of each stored datum. Note that some values
+ * might be toasted, and we cannot rely on the original heap values
+ * sticking around forever, so we must detoast them. Also try to
+ * compress them.
+ */
+ for (datumno = 0;
+ datumno < brdesc->bd_info[keyno]->oi_nstored;
+ datumno++)
+ {
+ Datum value = tuple->bt_columns[keyno].bv_values[datumno];
+
+#ifdef TOAST_INDEX_HACK
+
+ /* We must look at the stored type, not at the index descriptor. */
+ TypeCacheEntry *atttype = brdesc->bd_info[keyno]->oi_typcache[datumno];
+
+ /* Do we need to free the value at the end? */
+ bool free_value = false;
+
+ /* For non-varlena types we don't need to do anything special */
+ if (atttype->typlen != -1)
+ {
+ values[idxattno++] = value;
+ continue;
+ }
+
+ /*
+ * From here on the value is of a varlena type (anything else was
+ * stored as-is just above). We don't need to care about NULL
+ * values here, thanks to bv_allnulls above.
+ *
+ * If value is stored EXTERNAL, must fetch it so we are not
+ * depending on outside storage.
+ *
+ * XXX Is this actually true? Could it be that the summary is NULL
+ * even for range with non-NULL data? E.g. degenerate bloom filter
+ * may be thrown away, etc.
+ */
+ if (VARATT_IS_EXTERNAL(DatumGetPointer(value)))
+ {
+ value = PointerGetDatum(detoast_external_attr((struct varlena *)
+ DatumGetPointer(value)));
+ free_value = true;
+ }
+
+ /*
+ * If value is above size target, and is of a compressible
+ * datatype, try to compress it in-line.
+ */
+ if (!VARATT_IS_EXTENDED(DatumGetPointer(value)) &&
+ VARSIZE(DatumGetPointer(value)) > TOAST_INDEX_TARGET &&
+ (atttype->typstorage == TYPSTORAGE_EXTENDED ||
+ atttype->typstorage == TYPSTORAGE_MAIN))
+ {
+ Datum cvalue;
+ char compression;
+ Form_pg_attribute att = TupleDescAttr(brdesc->bd_tupdesc,
+ keyno);
+
+ /*
+ * If the BRIN summary and indexed attribute use the same data
+ * type and it has a valid compression method, we can use the
+ * same compression method. Otherwise we have to use the
+ * default method.
+ */
+ if (att->atttypid == atttype->type_id)
+ compression = att->attcompression;
+ else
+ compression = InvalidCompressionMethod;
+
+ cvalue = toast_compress_datum(value, compression);
+
+ if (DatumGetPointer(cvalue) != NULL)
+ {
+ /* successful compression */
+ /* the detoasted intermediate copy, if any, is no longer needed */
+ if (free_value)
+ pfree(DatumGetPointer(value));
+
+ value = cvalue;
+ free_value = true;
+ }
+ }
+
+ /*
+ * If we untoasted / compressed the value, we need to free it
+ * after forming the index tuple.
+ */
+ if (free_value)
+ untoasted_values[nuntoasted++] = value;
+
+#endif
+
+ values[idxattno++] = value;
+ }
+ }
+
+ /* Assert we did not overrun temp arrays */
+ Assert(idxattno <= brdesc->bd_totalstored);
+
+ /* compute total space needed */
+ len = SizeOfBrinTuple;
+ if (anynulls)
+ {
+ /*
+ * We need a double-length bitmap on an on-disk BRIN index tuple; the
+ * first half stores the "allnulls" bits, the second stores
+ * "hasnulls".
+ */
+ len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2);
+ }
+
+ /* hoff is the aligned offset at which the data area starts */
+ len = hoff = MAXALIGN(len);
+
+ data_len = heap_compute_data_size(brtuple_disk_tupdesc(brdesc),
+ values, nulls);
+ len += data_len;
+
+ len = MAXALIGN(len);
+
+ rettuple = palloc0(len);
+ rettuple->bt_blkno = blkno;
+ /* the low bits of bt_info carry the data offset; flag bits are OR'ed in below */
+ rettuple->bt_info = hoff;
+
+ /* Assert that hoff fits in the space available */
+ Assert((rettuple->bt_info & BRIN_OFFSET_MASK) == hoff);
+
+ /*
+ * The infomask and null bitmap as computed by heap_fill_tuple are useless
+ * to us. However, that function will not accept a null infomask; and we
+ * need to pass a valid null bitmap so that it will correctly skip
+ * outputting null attributes in the data area.
+ */
+ heap_fill_tuple(brtuple_disk_tupdesc(brdesc),
+ values,
+ nulls,
+ (char *) rettuple + hoff,
+ data_len,
+ &phony_infomask,
+ phony_nullbitmap);
+
+ /* done with these */
+ pfree(values);
+ pfree(nulls);
+ pfree(phony_nullbitmap);
+
+#ifdef TOAST_INDEX_HACK
+ /* the data has been copied into rettuple, so the temporary copies can go */
+ for (i = 0; i < nuntoasted; i++)
+ pfree(DatumGetPointer(untoasted_values[i]));
+#endif
+
+ /*
+ * Now fill in the real null bitmasks. allnulls first.
+ */
+ if (anynulls)
+ {
+ bits8 *bitP;
+ int bitmask;
+
+ rettuple->bt_info |= BRIN_NULLS_MASK;
+
+ /*
+ * Note that we reverse the sense of null bits in this module: we
+ * store a 1 for a null attribute rather than a 0. So we must reverse
+ * the sense of the att_isnull test in brin_deconstruct_tuple as well.
+ */
+ /*
+ * bitP starts one byte before the bitmap and bitmask at HIGHBIT, so
+ * that the first loop iteration advances onto the first bitmap byte.
+ */
+ bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1;
+ bitmask = HIGHBIT;
+ for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
+ {
+ if (bitmask != HIGHBIT)
+ bitmask <<= 1;
+ else
+ {
+ bitP += 1;
+ *bitP = 0x0;
+ bitmask = 1;
+ }
+
+ if (!tuple->bt_columns[keyno].bv_allnulls)
+ continue;
+
+ *bitP |= bitmask;
+ }
+ /* hasnulls bits follow */
+ /* bitP/bitmask carry over, so these bits continue right after allnulls */
+ for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
+ {
+ if (bitmask != HIGHBIT)
+ bitmask <<= 1;
+ else
+ {
+ bitP += 1;
+ *bitP = 0x0;
+ bitmask = 1;
+ }
+
+ if (!tuple->bt_columns[keyno].bv_hasnulls)
+ continue;
+
+ *bitP |= bitmask;
+ }
+ }
+
+ if (tuple->bt_placeholder)
+ rettuple->bt_info |= BRIN_PLACEHOLDER_MASK;
+
+ *size = len;
+ return rettuple;
+}
+
+/*
+ * Build an on-disk placeholder tuple: header plus null bitmap only, with the
+ * "allnulls" bit set for every attribute and no data area at all.
+ *
+ * This is the stripped-down sibling of brin_form_tuple.  The total length of
+ * the result is returned in *size.
+ */
+BrinTuple *
+brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size)
+{
+    int         natts = brdesc->bd_tupdesc->natts;
+    Size        tuplen;
+    BrinTuple  *placeholder;
+    bits8      *allnullbits;
+    int         attidx;
+
+    /* header plus the double-length null bitmap, which is always present */
+    tuplen = MAXALIGN(SizeOfBrinTuple + BITMAPLEN(natts * 2));
+
+    /* zero-filled, so every bit we don't set below is already clear */
+    placeholder = palloc0(tuplen);
+    placeholder->bt_blkno = blkno;
+    placeholder->bt_info = tuplen;
+    placeholder->bt_info |= BRIN_NULLS_MASK | BRIN_PLACEHOLDER_MASK;
+
+    /*
+     * Mark every attribute as allnulls.  Bits are assigned from the least
+     * significant bit of each bitmap byte upwards.  The hasnulls half of the
+     * bitmap stays all-zero.
+     */
+    allnullbits = (bits8 *) ((char *) placeholder + SizeOfBrinTuple);
+    for (attidx = 0; attidx < natts; attidx++)
+        allnullbits[attidx / 8] |= (1 << (attidx % 8));
+
+    *size = tuplen;
+
+    return placeholder;
+}
+
+/*
+ * Release an on-disk tuple previously obtained from brin_form_tuple.
+ * The tuple is a single palloc'd chunk, so one pfree suffices.
+ */
+void
+brin_free_tuple(BrinTuple *tuple)
+{
+    pfree(tuple);
+}
+
+/*
+ * Copy a brin tuple of size len and return the copy.
+ *
+ * When destsz points to a nonzero size, 'dest' is taken to be an existing
+ * buffer of that size and is used as the output; if it is too small it is
+ * enlarged with repalloc and *destsz is updated to the new size.  This avoids
+ * palloc/free cycles when many brin tuples are processed in a loop.
+ *
+ * When destsz is NULL or *destsz is zero, a fresh buffer of exactly len bytes
+ * is palloc'd and 'dest' is ignored; note that *destsz is left untouched in
+ * that case.
+ */
+BrinTuple *
+brin_copy_tuple(BrinTuple *tuple, Size len, BrinTuple *dest, Size *destsz)
+{
+    bool        reusable = (destsz != NULL && *destsz != 0);
+
+    if (!reusable)
+    {
+        /* no usable output buffer supplied: allocate one */
+        dest = palloc(len);
+    }
+    else if (*destsz < len)
+    {
+        /* supplied buffer is too small: grow it and record the new size */
+        dest = repalloc(dest, len);
+        *destsz = len;
+    }
+
+    memcpy(dest, tuple, len);
+
+    return dest;
+}
+
+/*
+ * Report whether two BrinTuples are byte-for-byte identical: same length and
+ * same contents.
+ */
+bool
+brin_tuples_equal(const BrinTuple *a, Size alen, const BrinTuple *b, Size blen)
+{
+    /* differing lengths can never match; otherwise compare the raw bytes */
+    return alen == blen && memcmp(a, b, alen) == 0;
+}
+
+/*
+ * Allocate a brand-new BrinMemTuple and put it into the empty state.
+ *
+ * The tuple header, the per-column BrinValues array and the Datum storage
+ * the columns point into are carved out of one zeroed allocation; the
+ * scratch arrays used when deforming and the tuple's private memory context
+ * are allocated separately.
+ *
+ * Note: we don't provide any means to free a deformed tuple, so make sure to
+ * use a temporary memory context.
+ */
+BrinMemTuple *
+brin_new_memtuple(BrinDesc *brdesc)
+{
+    int         natts = brdesc->bd_tupdesc->natts;
+    int         nstored = brdesc->bd_totalstored;
+    long        hdrsize;
+    BrinMemTuple *memtup;
+
+    /* aligned size of the struct plus its trailing BrinValues array */
+    hdrsize = MAXALIGN(sizeof(BrinMemTuple) + sizeof(BrinValues) * natts);
+
+    /* the per-column Datum storage follows directly after the header */
+    memtup = palloc0(hdrsize + sizeof(Datum) * nstored);
+
+    /* work arrays filled in by brin_deform_tuple */
+    memtup->bt_values = palloc(sizeof(Datum) * nstored);
+    memtup->bt_allnulls = palloc(sizeof(bool) * natts);
+    memtup->bt_hasnulls = palloc(sizeof(bool) * natts);
+
+    memtup->bt_context = AllocSetContextCreate(CurrentMemoryContext,
+                                               "brin dtuple",
+                                               ALLOCSET_DEFAULT_SIZES);
+
+    /* initialization hands back the very tuple it was given */
+    return brin_memtuple_initialize(memtup, brdesc);
+}
+
+/*
+ * Put a BrinMemTuple back into its initial (empty) state and return it, for
+ * notational convenience.
+ *
+ * The tuple's private memory context is reset, every column is marked
+ * allnulls with no hasnulls, and each column's bv_values pointer is aimed at
+ * its slice of the Datum area that follows the header.  Tuple-level fields
+ * such as bt_blkno and bt_placeholder are not touched here.
+ */
+BrinMemTuple *
+brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc)
+{
+    int         natts;
+    int         attidx;
+    char       *datumarea;
+
+    MemoryContextReset(dtuple->bt_context);
+
+    natts = brdesc->bd_tupdesc->natts;
+
+    /* the Datum storage starts right after the aligned header */
+    datumarea = (char *) dtuple +
+        MAXALIGN(sizeof(BrinMemTuple) + sizeof(BrinValues) * natts);
+
+    for (attidx = 0; attidx < natts; attidx++)
+    {
+        BrinValues *col = &dtuple->bt_columns[attidx];
+
+        col->bv_attno = attidx + 1;
+        col->bv_allnulls = true;
+        col->bv_hasnulls = false;
+        col->bv_values = (Datum *) datumarea;
+
+        col->bv_mem_value = PointerGetDatum(NULL);
+        col->bv_serialize = NULL;
+        col->bv_context = dtuple->bt_context;
+
+        /* step past the slots this column's opclass stores */
+        datumarea += sizeof(Datum) * brdesc->bd_info[attidx]->oi_nstored;
+    }
+
+    return dtuple;
+}
+
+/*
+ * Convert a BrinTuple back to a BrinMemTuple.  This is the reverse of
+ * brin_form_tuple.
+ *
+ * brdesc     BRIN descriptor of the index
+ * tuple      on-disk tuple to decode
+ * dMemtuple  optional previously allocated BrinMemTuple to reuse, or NULL
+ *
+ * As an optimization, the caller can pass a previously allocated 'dMemtuple'.
+ * This avoids having to allocate it here, which can be useful when this
+ * function is called many times in a loop.  It is caller's responsibility
+ * that the given BrinMemTuple matches what we need here.  A reused tuple is
+ * fully overwritten, including its placeholder flag.
+ *
+ * Returns the (new or reused) BrinMemTuple.  Datum copies are made in the
+ * tuple's own memory context.
+ *
+ * Note we don't need the "on disk tupdesc" here; we rely on our own routine to
+ * deconstruct the tuple from the on-disk format.
+ */
+BrinMemTuple *
+brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple, BrinMemTuple *dMemtuple)
+{
+    BrinMemTuple *dtup;
+    Datum      *values;
+    bool       *allnulls;
+    bool       *hasnulls;
+    char       *tp;
+    bits8      *nullbits;
+    int         keyno;
+    int         valueno;
+    MemoryContext oldcxt;
+
+    dtup = dMemtuple ? brin_memtuple_initialize(dMemtuple, brdesc) :
+        brin_new_memtuple(brdesc);
+
+    /*
+     * Assign the placeholder flag unconditionally.  brin_memtuple_initialize
+     * does not reset it, so merely setting it for placeholder tuples would
+     * let a reused dMemtuple keep a stale "true" from an earlier tuple.
+     */
+    dtup->bt_placeholder = BrinTupleIsPlaceholder(tuple) ? true : false;
+    dtup->bt_blkno = tuple->bt_blkno;
+
+    /* scratch arrays owned by the memtuple, filled by the decoder below */
+    values = dtup->bt_values;
+    allnulls = dtup->bt_allnulls;
+    hasnulls = dtup->bt_hasnulls;
+
+    /* start of the data area */
+    tp = (char *) tuple + BrinTupleDataOffset(tuple);
+
+    /* the null bitmap, when present, sits right after the fixed header */
+    if (BrinTupleHasNulls(tuple))
+        nullbits = (bits8 *) ((char *) tuple + SizeOfBrinTuple);
+    else
+        nullbits = NULL;
+    brin_deconstruct_tuple(brdesc,
+                           tp, nullbits, BrinTupleHasNulls(tuple),
+                           values, allnulls, hasnulls);
+
+    /*
+     * Iterate to assign each of the values to the corresponding item in the
+     * values array of each column.  The copies occur in the tuple's context.
+     */
+    oldcxt = MemoryContextSwitchTo(dtup->bt_context);
+    for (valueno = 0, keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++)
+    {
+        int         i;
+
+        /*
+         * An allnulls column stores no data; it keeps the state set up by
+         * initialization.  Just skip over its slots in the values array.
+         */
+        if (allnulls[keyno])
+        {
+            valueno += brdesc->bd_info[keyno]->oi_nstored;
+            continue;
+        }
+
+        /*
+         * We would like to skip datumCopy'ing the values datum in some cases,
+         * caller permitting ...
+         */
+        for (i = 0; i < brdesc->bd_info[keyno]->oi_nstored; i++)
+            dtup->bt_columns[keyno].bv_values[i] =
+                datumCopy(values[valueno++],
+                          brdesc->bd_info[keyno]->oi_typcache[i]->typbyval,
+                          brdesc->bd_info[keyno]->oi_typcache[i]->typlen);
+
+        dtup->bt_columns[keyno].bv_hasnulls = hasnulls[keyno];
+        dtup->bt_columns[keyno].bv_allnulls = false;
+
+        dtup->bt_columns[keyno].bv_mem_value = PointerGetDatum(NULL);
+        dtup->bt_columns[keyno].bv_serialize = NULL;
+        dtup->bt_columns[keyno].bv_context = dtup->bt_context;
+    }
+
+    MemoryContextSwitchTo(oldcxt);
+
+    return dtup;
+}
+
+/*
+ * brin_deconstruct_tuple
+ * Guts of attribute extraction from an on-disk BRIN tuple.
+ *
+ * Its arguments are:
+ * brdesc BRIN descriptor for the stored tuple
+ * tp pointer to the tuple data area
+ * nullbits pointer to the tuple nulls bitmask
+ * nulls "has nulls" bit in tuple infomask
+ * values output values, array of size brdesc->bd_totalstored
+ * allnulls output "allnulls", size brdesc->bd_tupdesc->natts
+ * hasnulls output "hasnulls", size brdesc->bd_tupdesc->natts
+ *
+ * Output arrays must have been allocated by caller.
+ *
+ * Entries of "values" belonging to an allnulls column are skipped, not
+ * written. The Datums stored in "values" are whatever fetchatt yields for
+ * the data at "tp"; no copies are made here.
+ */
+static inline void
+brin_deconstruct_tuple(BrinDesc *brdesc,
+ char *tp, bits8 *nullbits, bool nulls,
+ Datum *values, bool *allnulls, bool *hasnulls)
+{
+ int attnum;
+ int stored;
+ TupleDesc diskdsc;
+ long off;
+
+ /*
+ * First iterate to natts to obtain both null flags for each attribute.
+ * Note that we reverse the sense of the att_isnull test, because we store
+ * a 1 for a null value (rather than a 1 for a not-null value, as is the
+ * att_isnull convention used elsewhere). See brin_form_tuple.
+ */
+ for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++)
+ {
+ /*
+ * the "all nulls" bit means that all values in the page range for
+ * this column are nulls. Therefore there are no values in the tuple
+ * data area.
+ */
+ /* when "nulls" is false there is no bitmap and both flags come out false */
+ allnulls[attnum] = nulls && !att_isnull(attnum, nullbits);
+
+ /*
+ * the "has nulls" bit means that some tuples have nulls, but others
+ * have not-null values. Therefore we know the tuple contains data
+ * for this column.
+ *
+ * The hasnulls bits follow the allnulls bits in the same bitmask.
+ */
+ hasnulls[attnum] =
+ nulls && !att_isnull(brdesc->bd_tupdesc->natts + attnum, nullbits);
+ }
+
+ /*
+ * Iterate to obtain each attribute's stored values. Note that since we
+ * may reuse attribute entries for more than one column, we cannot cache
+ * offsets here.
+ */
+ diskdsc = brtuple_disk_tupdesc(brdesc);
+ /* "stored" indexes both the on-disk descriptor and the values array */
+ stored = 0;
+ /* "off" is the running byte offset into the data area at tp */
+ off = 0;
+ for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++)
+ {
+ int datumno;
+
+ /* nothing was stored for this column; skip its slots */
+ if (allnulls[attnum])
+ {
+ stored += brdesc->bd_info[attnum]->oi_nstored;
+ continue;
+ }
+
+ for (datumno = 0;
+ datumno < brdesc->bd_info[attnum]->oi_nstored;
+ datumno++)
+ {
+ Form_pg_attribute thisatt = TupleDescAttr(diskdsc, stored);
+
+ /* align the offset as required by this datum's type */
+ if (thisatt->attlen == -1)
+ {
+ off = att_align_pointer(off, thisatt->attalign, -1,
+ tp + off);
+ }
+ else
+ {
+ /* not varlena, so safe to use att_align_nominal */
+ off = att_align_nominal(off, thisatt->attalign);
+ }
+
+ values[stored++] = fetchatt(thisatt, tp + off);
+
+ /* advance past the datum just fetched */
+ off = att_addlength_pointer(off, thisatt->attlen, tp + off);
+ }
+ }
+}
diff --git a/src/backend/access/brin/brin_validate.c b/src/backend/access/brin/brin_validate.c
new file mode 100644
index 0000000..11835d8
--- /dev/null
+++ b/src/backend/access/brin/brin_validate.c
@@ -0,0 +1,281 @@
+/*-------------------------------------------------------------------------
+ *
+ * brin_validate.c
+ * Opclass validator for BRIN.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/brin/brin_validate.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/amvalidate.h"
+#include "access/brin_internal.h"
+#include "access/htup_details.h"
+#include "catalog/pg_amop.h"
+#include "catalog/pg_amproc.h"
+#include "catalog/pg_opclass.h"
+#include "catalog/pg_opfamily.h"
+#include "catalog/pg_type.h"
+#include "utils/builtins.h"
+#include "utils/regproc.h"
+#include "utils/syscache.h"
+
+/*
+ * Validator for a BRIN opclass.
+ *
+ * Checks the support functions and operators of the opclass's operator
+ * family, then checks that the named opclass itself is complete.  Every
+ * problem found is reported at INFO level; the function returns true only if
+ * no problem was found.  Failed catalog lookups raise an error instead.
+ *
+ * Some of the checks done here cover the whole opfamily, and therefore are
+ * redundant when checking each opclass in a family.  But they don't run long
+ * enough to be much of a problem, so we accept the duplication rather than
+ * complicate the amvalidate API.
+ */
+bool
+brinvalidate(Oid opclassoid)
+{
+    bool        result = true;
+    HeapTuple   classtup;
+    Form_pg_opclass classform;
+    Oid         opfamilyoid;
+    Oid         opcintype;
+    char       *opclassname;
+    HeapTuple   familytup;
+    Form_pg_opfamily familyform;
+    char       *opfamilyname;
+    CatCList   *proclist,
+               *oprlist;
+    uint64      allfuncs = 0;
+    uint64      allops = 0;
+    List       *grouplist;
+    OpFamilyOpFuncGroup *opclassgroup;
+    int         i;
+    ListCell   *lc;
+
+    /* Fetch opclass information */
+    classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid));
+    if (!HeapTupleIsValid(classtup))
+        elog(ERROR, "cache lookup failed for operator class %u", opclassoid);
+    classform = (Form_pg_opclass) GETSTRUCT(classtup);
+
+    opfamilyoid = classform->opcfamily;
+    opcintype = classform->opcintype;
+    opclassname = NameStr(classform->opcname);
+
+    /* Fetch opfamily information */
+    familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid));
+    if (!HeapTupleIsValid(familytup))
+        elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid);
+    familyform = (Form_pg_opfamily) GETSTRUCT(familytup);
+
+    opfamilyname = NameStr(familyform->opfname);
+
+    /* Fetch all operators and support functions of the opfamily */
+    oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid));
+    proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid));
+
+    /* Check individual support functions */
+    for (i = 0; i < proclist->n_members; i++)
+    {
+        HeapTuple   proctup = &proclist->members[i]->tuple;
+        Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup);
+        bool        ok;
+
+        /* Check procedure numbers and function signatures */
+        switch (procform->amprocnum)
+        {
+            case BRIN_PROCNUM_OPCINFO:
+                ok = check_amproc_signature(procform->amproc, INTERNALOID, true,
+                                            1, 1, INTERNALOID);
+                break;
+            case BRIN_PROCNUM_ADDVALUE:
+                ok = check_amproc_signature(procform->amproc, BOOLOID, true,
+                                            4, 4, INTERNALOID, INTERNALOID,
+                                            INTERNALOID, INTERNALOID);
+                break;
+            case BRIN_PROCNUM_CONSISTENT:
+                ok = check_amproc_signature(procform->amproc, BOOLOID, true,
+                                            3, 4, INTERNALOID, INTERNALOID,
+                                            INTERNALOID, INT4OID);
+                break;
+            case BRIN_PROCNUM_UNION:
+                ok = check_amproc_signature(procform->amproc, BOOLOID, true,
+                                            3, 3, INTERNALOID, INTERNALOID,
+                                            INTERNALOID);
+                break;
+            case BRIN_PROCNUM_OPTIONS:
+                ok = check_amoptsproc_signature(procform->amproc);
+                break;
+            default:
+                /* Complain if it's not a valid optional proc number */
+                if (procform->amprocnum < BRIN_FIRST_OPTIONAL_PROCNUM ||
+                    procform->amprocnum > BRIN_LAST_OPTIONAL_PROCNUM)
+                {
+                    ereport(INFO,
+                            (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                             errmsg("operator family \"%s\" of access method %s contains function %s with invalid support number %d",
+                                    opfamilyname, "brin",
+                                    format_procedure(procform->amproc),
+                                    procform->amprocnum)));
+                    result = false;
+                    continue;   /* omit bad proc numbers from allfuncs */
+                }
+                /* Can't check signatures of optional procs, so assume OK */
+                ok = true;
+                break;
+        }
+
+        if (!ok)
+        {
+            ereport(INFO,
+                    (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                     errmsg("operator family \"%s\" of access method %s contains function %s with wrong signature for support number %d",
+                            opfamilyname, "brin",
+                            format_procedure(procform->amproc),
+                            procform->amprocnum)));
+            result = false;
+        }
+
+        /* Track all valid procedure numbers seen in opfamily */
+        allfuncs |= ((uint64) 1) << procform->amprocnum;
+    }
+
+    /* Check individual operators */
+    for (i = 0; i < oprlist->n_members; i++)
+    {
+        HeapTuple   oprtup = &oprlist->members[i]->tuple;
+        Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup);
+
+        /*
+         * Check that only allowed strategy numbers exist.  The upper bound
+         * keeps the strategy usable as a bit position in the 64-bit allops
+         * mask below.
+         */
+        if (oprform->amopstrategy < 1 || oprform->amopstrategy > 63)
+        {
+            ereport(INFO,
+                    (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                     errmsg("operator family \"%s\" of access method %s contains operator %s with invalid strategy number %d",
+                            opfamilyname, "brin",
+                            format_operator(oprform->amopopr),
+                            oprform->amopstrategy)));
+            result = false;
+        }
+        else
+        {
+            /*
+             * The set of operators supplied varies across BRIN opfamilies.
+             * Our plan is to identify all operator strategy numbers used in
+             * the opfamily and then complain about datatype combinations that
+             * are missing any operator(s).  However, consider only numbers
+             * that appear in some non-cross-type case, since cross-type
+             * operators may have unique strategies.  (This is not a great
+             * heuristic, in particular an erroneous number used in a
+             * cross-type operator will not get noticed; but the core BRIN
+             * opfamilies are messy enough to make it necessary.)
+             */
+            if (oprform->amoplefttype == oprform->amoprighttype)
+                allops |= ((uint64) 1) << oprform->amopstrategy;
+        }
+
+        /* brin doesn't support ORDER BY operators */
+        if (oprform->amoppurpose != AMOP_SEARCH ||
+            OidIsValid(oprform->amopsortfamily))
+        {
+            ereport(INFO,
+                    (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                     errmsg("operator family \"%s\" of access method %s contains invalid ORDER BY specification for operator %s",
+                            opfamilyname, "brin",
+                            format_operator(oprform->amopopr))));
+            result = false;
+        }
+
+        /* Check operator signature --- same for all brin strategies */
+        if (!check_amop_signature(oprform->amopopr, BOOLOID,
+                                  oprform->amoplefttype,
+                                  oprform->amoprighttype))
+        {
+            ereport(INFO,
+                    (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                     errmsg("operator family \"%s\" of access method %s contains operator %s with wrong signature",
+                            opfamilyname, "brin",
+                            format_operator(oprform->amopopr))));
+            result = false;
+        }
+    }
+
+    /* Now check for inconsistent groups of operators/functions */
+    grouplist = identify_opfamily_groups(oprlist, proclist);
+    opclassgroup = NULL;
+    foreach(lc, grouplist)
+    {
+        OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc);
+
+        /* Remember the group exactly matching the test opclass */
+        if (thisgroup->lefttype == opcintype &&
+            thisgroup->righttype == opcintype)
+            opclassgroup = thisgroup;
+
+        /*
+         * Some BRIN opfamilies expect cross-type support functions to exist,
+         * and some don't.  We don't know exactly which are which, so if we
+         * find a cross-type operator for which there are no support functions
+         * at all, let it pass.  (Don't expect that all operators exist for
+         * such cross-type cases, either.)
+         */
+        if (thisgroup->functionset == 0 &&
+            thisgroup->lefttype != thisgroup->righttype)
+            continue;
+
+        /*
+         * Else complain if there seems to be an incomplete set of either
+         * operators or support functions for this datatype pair.
+         */
+        if (thisgroup->operatorset != allops)
+        {
+            ereport(INFO,
+                    (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                     errmsg("operator family \"%s\" of access method %s is missing operator(s) for types %s and %s",
+                            opfamilyname, "brin",
+                            format_type_be(thisgroup->lefttype),
+                            format_type_be(thisgroup->righttype))));
+            result = false;
+        }
+        if (thisgroup->functionset != allfuncs)
+        {
+            ereport(INFO,
+                    (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                     errmsg("operator family \"%s\" of access method %s is missing support function(s) for types %s and %s",
+                            opfamilyname, "brin",
+                            format_type_be(thisgroup->lefttype),
+                            format_type_be(thisgroup->righttype))));
+            result = false;
+        }
+    }
+
+    /* Check that the originally-named opclass is complete */
+    if (!opclassgroup || opclassgroup->operatorset != allops)
+    {
+        ereport(INFO,
+                (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                 errmsg("operator class \"%s\" of access method %s is missing operator(s)",
+                        opclassname, "brin")));
+        result = false;
+    }
+    for (i = 1; i <= BRIN_MANDATORY_NPROCS; i++)
+    {
+        /*
+         * Build the probe bit as uint64, the same way the function-number
+         * masks are built above, rather than mixing in a signed shift.
+         */
+        if (opclassgroup &&
+            (opclassgroup->functionset & (((uint64) 1) << i)) != 0)
+            continue;           /* got it */
+        ereport(INFO,
+                (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                 errmsg("operator class \"%s\" of access method %s is missing support function %d",
+                        opclassname, "brin", i)));
+        result = false;
+    }
+
+    /* Release the catalog cache references taken above */
+    ReleaseCatCacheList(proclist);
+    ReleaseCatCacheList(oprlist);
+    ReleaseSysCache(familytup);
+    ReleaseSysCache(classtup);
+
+    return result;
+}
diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c
new file mode 100644
index 0000000..3519038
--- /dev/null
+++ b/src/backend/access/brin/brin_xlog.c
@@ -0,0 +1,367 @@
+/*
+ * brin_xlog.c
+ * XLog replay routines for BRIN indexes
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/brin/brin_xlog.c
+ */
+#include "postgres.h"
+
+#include "access/brin_page.h"
+#include "access/brin_pageops.h"
+#include "access/brin_xlog.h"
+#include "access/bufmask.h"
+#include "access/xlogutils.h"
+
+
+/*
+ * xlog replay routines
+ */
+/*
+ * Replay creation of a BRIN index by rebuilding its metapage.
+ *
+ * Block reference 0 of the record is initialized (not read) as the metapage,
+ * using the pages-per-range and version values carried in the record, and is
+ * then stamped with the record's end LSN.
+ */
+static void
+brin_xlog_createidx(XLogReaderState *record)
+{
+	xl_brin_createidx *xlrec = (xl_brin_createidx *) XLogRecGetData(record);
+	XLogRecPtr	endlsn = record->EndRecPtr;
+	Buffer		metabuf = XLogInitBufferForRedo(record, 0);
+	Page		metapage;
+
+	Assert(BufferIsValid(metabuf));
+	metapage = BufferGetPage(metabuf);
+
+	/* lay out a fresh metapage and mark it as modified by this record */
+	brin_metapage_init(metapage, xlrec->pagesPerRange, xlrec->version);
+	PageSetLSN(metapage, endlsn);
+	MarkBufferDirty(metabuf);
+
+	UnlockReleaseBuffer(metabuf);
+}
+
+/*
+ * Common part of an insert or update.  Inserts the new tuple and updates the
+ * revmap.
+ *
+ * 'record' is the WAL record being replayed: block reference 0 is the regular
+ * page receiving the tuple (the tuple itself is that block's data), and block
+ * reference 1 is the revmap page to update.  'xlrec' carries the target
+ * offset, the heap block the tuple summarizes, and the pages-per-range
+ * setting.
+ *
+ * PANICs if the tuple cannot be placed at the recorded offset.
+ */
+static void
+brin_xlog_insert_update(XLogReaderState *record,
+						xl_brin_insert *xlrec)
+{
+	XLogRecPtr	lsn = record->EndRecPtr;
+	Buffer		buffer;
+	BlockNumber regpgno;
+	Page		page;
+	XLogRedoAction action;
+
+	/*
+	 * If we inserted the first and only tuple on the page, re-initialize the
+	 * page from scratch.
+	 */
+	if (XLogRecGetInfo(record) & XLOG_BRIN_INIT_PAGE)
+	{
+		buffer = XLogInitBufferForRedo(record, 0);
+		page = BufferGetPage(buffer);
+		brin_page_init(page, BRIN_PAGETYPE_REGULAR);
+		action = BLK_NEEDS_REDO;
+	}
+	else
+	{
+		action = XLogReadBufferForRedo(record, 0, &buffer);
+	}
+
+	/*
+	 * Need this page's blkno to store in revmap.  Take it from the record's
+	 * block reference rather than from the buffer: XLogReadBufferForRedo may
+	 * leave us without a valid buffer (hence the BufferIsValid guards
+	 * below), and an invalid buffer must not be asked for its block number.
+	 */
+	XLogRecGetBlockTag(record, 0, NULL, NULL, &regpgno);
+
+	/* insert the index item into the page */
+	if (action == BLK_NEEDS_REDO)
+	{
+		OffsetNumber offnum;
+		BrinTuple  *tuple;
+		Size		tuplen;
+
+		tuple = (BrinTuple *) XLogRecGetBlockData(record, 0, &tuplen);
+
+		Assert(tuple->bt_blkno == xlrec->heapBlk);
+
+		page = (Page) BufferGetPage(buffer);
+		offnum = xlrec->offnum;
+
+		/* the recorded offset may at most extend the line pointer array by one */
+		if (PageGetMaxOffsetNumber(page) + 1 < offnum)
+			elog(PANIC, "brin_xlog_insert_update: invalid max offset number");
+
+		offnum = PageAddItem(page, (Item) tuple, tuplen, offnum, true, false);
+		if (offnum == InvalidOffsetNumber)
+			elog(PANIC, "brin_xlog_insert_update: failed to add tuple");
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+	}
+	if (BufferIsValid(buffer))
+		UnlockReleaseBuffer(buffer);
+
+	/* update the revmap */
+	action = XLogReadBufferForRedo(record, 1, &buffer);
+	if (action == BLK_NEEDS_REDO)
+	{
+		ItemPointerData tid;
+
+		/* the revmap entry points at (regular page, recorded offset) */
+		ItemPointerSet(&tid, regpgno, xlrec->offnum);
+		page = (Page) BufferGetPage(buffer);
+
+		brinSetHeapBlockItemptr(buffer, xlrec->pagesPerRange, xlrec->heapBlk,
+								tid);
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+	}
+	if (BufferIsValid(buffer))
+		UnlockReleaseBuffer(buffer);
+
+	/* XXX no FSM updates here ... */
+}
+
+/*
+ * Replay a BRIN index insertion.
+ *
+ * The record's main data is an xl_brin_insert; all the work is done by the
+ * code shared with update replay.
+ */
+static void
+brin_xlog_insert(XLogReaderState *record)
+{
+	brin_xlog_insert_update(record,
+							(xl_brin_insert *) XLogRecGetData(record));
+}
+
+/*
+ * Replay a BRIN index update.
+ *
+ * Block reference 2 is the page holding the old tuple, which is deleted at
+ * xlrec->oldOffnum.  The new tuple and the revmap entry are then handled
+ * exactly as for an insertion.  The old page's buffer is released only after
+ * the insertion part has been replayed.
+ */
+static void
+brin_xlog_update(XLogReaderState *record)
+{
+	xl_brin_update *xlrec = (xl_brin_update *) XLogRecGetData(record);
+	XLogRecPtr	lsn = record->EndRecPtr;
+	Buffer		oldbuf;
+
+	/* Step 1: get rid of the old tuple */
+	if (XLogReadBufferForRedo(record, 2, &oldbuf) == BLK_NEEDS_REDO)
+	{
+		Page		oldpage = BufferGetPage(oldbuf);
+
+		PageIndexTupleDeleteNoCompact(oldpage, xlrec->oldOffnum);
+		PageSetLSN(oldpage, lsn);
+		MarkBufferDirty(oldbuf);
+	}
+
+	/* Step 2: place the new tuple and repoint the revmap */
+	brin_xlog_insert_update(record, &xlrec->insert);
+
+	/* Step 3: now the old page can be let go */
+	if (BufferIsValid(oldbuf))
+		UnlockReleaseBuffer(oldbuf);
+}
+
+/*
+ * Replay an update that replaces a tuple in place on a single page.
+ *
+ * Block reference 0 is the page; its block data is the replacement tuple,
+ * which overwrites the item at xlrec->offnum.  PANICs if the overwrite
+ * fails.
+ */
+static void
+brin_xlog_samepage_update(XLogReaderState *record)
+{
+	xl_brin_samepage_update *xlrec;
+	XLogRecPtr	lsn = record->EndRecPtr;
+	Buffer		buffer;
+
+	xlrec = (xl_brin_samepage_update *) XLogRecGetData(record);
+
+	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+	{
+		Size		newlen;
+		Item		newtup = (Item) XLogRecGetBlockData(record, 0, &newlen);
+		Page		page = BufferGetPage(buffer);
+
+		if (!PageIndexTupleOverwrite(page, xlrec->offnum, newtup, newlen))
+			elog(PANIC, "brin_xlog_samepage_update: failed to replace tuple");
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+	}
+
+	if (BufferIsValid(buffer))
+		UnlockReleaseBuffer(buffer);
+
+	/* XXX no FSM updates here ... */
+}
+
+/*
+ * Replay a revmap page extension.
+ *
+ * Block reference 0 is the metapage, whose lastRevmapPage is advanced to the
+ * new revmap block.  Block reference 1 is that new block, which is always
+ * initialized from scratch as a revmap page.  The metapage buffer is kept
+ * until the new revmap page has been set up.
+ */
+static void
+brin_xlog_revmap_extend(XLogReaderState *record)
+{
+	xl_brin_revmap_extend *xlrec;
+	XLogRecPtr	lsn = record->EndRecPtr;
+	BlockNumber targetBlk;
+	Buffer		metabuf;
+	Buffer		revmapbuf;
+	Page		revmappg;
+
+	xlrec = (xl_brin_revmap_extend *) XLogRecGetData(record);
+
+	/* the record payload and the block reference must agree on the target */
+	XLogRecGetBlockTag(record, 1, NULL, NULL, &targetBlk);
+	Assert(xlrec->targetBlk == targetBlk);
+
+	/* Advance the metapage's notion of the last revmap page */
+	if (XLogReadBufferForRedo(record, 0, &metabuf) == BLK_NEEDS_REDO)
+	{
+		Page		metapg = BufferGetPage(metabuf);
+		BrinMetaPageData *metadata;
+
+		metadata = (BrinMetaPageData *) PageGetContents(metapg);
+
+		Assert(metadata->lastRevmapPage == xlrec->targetBlk - 1);
+		metadata->lastRevmapPage = xlrec->targetBlk;
+
+		PageSetLSN(metapg, lsn);
+
+		/*
+		 * Set pd_lower just past the end of the metadata.  This is essential,
+		 * because without doing so, metadata will be lost if xlog.c
+		 * compresses the page.  (We must do this here because pre-v11
+		 * versions of PG did not set the metapage's pd_lower correctly, so a
+		 * pg_upgraded index might contain the wrong value.)
+		 */
+		((PageHeader) metapg)->pd_lower =
+			((char *) metadata + sizeof(BrinMetaPageData)) - (char *) metapg;
+
+		MarkBufferDirty(metabuf);
+	}
+
+	/*
+	 * Re-init the target block as a revmap page.  There's never a full-page
+	 * image here.
+	 */
+	revmapbuf = XLogInitBufferForRedo(record, 1);
+	revmappg = BufferGetPage(revmapbuf);
+	brin_page_init(revmappg, BRIN_PAGETYPE_REVMAP);
+
+	PageSetLSN(revmappg, lsn);
+	MarkBufferDirty(revmapbuf);
+
+	/* release the new revmap page first, then the metapage */
+	UnlockReleaseBuffer(revmapbuf);
+	if (BufferIsValid(metabuf))
+		UnlockReleaseBuffer(metabuf);
+}
+
+/*
+ * Replay desummarization of a page range.
+ *
+ * Block reference 0 is the revmap page, where the entry for heap block
+ * xlrec->heapBlk is reset to an invalid item pointer.  Block reference 1 is
+ * the regular page, from which the tuple at xlrec->regOffset is removed.
+ * The revmap page is released before the regular page is touched.
+ */
+static void
+brin_xlog_desummarize_page(XLogReaderState *record)
+{
+	xl_brin_desummarize *xlrec;
+	XLogRecPtr	lsn = record->EndRecPtr;
+	Buffer		revmapbuf;
+	Buffer		regbuf;
+
+	xlrec = (xl_brin_desummarize *) XLogRecGetData(record);
+
+	/* Update the revmap */
+	if (XLogReadBufferForRedo(record, 0, &revmapbuf) == BLK_NEEDS_REDO)
+	{
+		ItemPointerData unset;
+
+		ItemPointerSetInvalid(&unset);
+		brinSetHeapBlockItemptr(revmapbuf, xlrec->pagesPerRange,
+								xlrec->heapBlk, unset);
+
+		PageSetLSN(BufferGetPage(revmapbuf), lsn);
+		MarkBufferDirty(revmapbuf);
+	}
+	if (BufferIsValid(revmapbuf))
+		UnlockReleaseBuffer(revmapbuf);
+
+	/* remove the leftover entry from the regular page */
+	if (XLogReadBufferForRedo(record, 1, &regbuf) == BLK_NEEDS_REDO)
+	{
+		Page		regpage = BufferGetPage(regbuf);
+
+		PageIndexTupleDeleteNoCompact(regpage, xlrec->regOffset);
+
+		PageSetLSN(regpage, lsn);
+		MarkBufferDirty(regbuf);
+	}
+	if (BufferIsValid(regbuf))
+		UnlockReleaseBuffer(regbuf);
+}
+
+/*
+ * Entry point for replaying a BRIN WAL record.
+ *
+ * Strips the generic xlog bits from the record's info byte, isolates the
+ * BRIN operation code, and hands the record to the matching replay routine.
+ * An unrecognized operation code is a PANIC.
+ */
+void
+brin_redo(XLogReaderState *record)
+{
+	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+	uint8		opcode = info & XLOG_BRIN_OPMASK;
+
+	switch (opcode)
+	{
+		case XLOG_BRIN_CREATE_INDEX:
+			brin_xlog_createidx(record);
+			break;
+
+		case XLOG_BRIN_INSERT:
+			brin_xlog_insert(record);
+			break;
+
+		case XLOG_BRIN_UPDATE:
+			brin_xlog_update(record);
+			break;
+
+		case XLOG_BRIN_SAMEPAGE_UPDATE:
+			brin_xlog_samepage_update(record);
+			break;
+
+		case XLOG_BRIN_REVMAP_EXTEND:
+			brin_xlog_revmap_extend(record);
+			break;
+
+		case XLOG_BRIN_DESUMMARIZE:
+			brin_xlog_desummarize_page(record);
+			break;
+
+		default:
+			/* report the whole rmgr info byte, not just the masked opcode */
+			elog(PANIC, "brin_redo: unknown op code %u", info);
+	}
+}
+
+/*
+ * Mask a BRIN page before doing consistency checks.
+ *
+ * 'pagedata' is the page image to mask in place.  'blkno' is not used.
+ */
+void
+brin_mask(char *pagedata, BlockNumber blkno)
+{
+	Page		pg = (Page) pagedata;
+	bool		mask_free_space;
+
+	mask_page_lsn_and_checksum(pg);
+	mask_page_hint_bits(pg);
+
+	/*
+	 * Regular brin pages contain unused space which needs to be masked.
+	 * Meta pages do too, but there we only trust the free-space boundaries
+	 * when pd_lower appears to have been set correctly.
+	 */
+	if (BRIN_IS_REGULAR_PAGE(pg))
+		mask_free_space = true;
+	else if (BRIN_IS_META_PAGE(pg))
+		mask_free_space = (((PageHeader) pg)->pd_lower > SizeOfPageHeaderData);
+	else
+		mask_free_space = false;
+
+	if (mask_free_space)
+		mask_unused_space(pg);
+
+	/*
+	 * BRIN_EVACUATE_PAGE is not WAL-logged, since it's of no use in recovery.
+	 * Mask it.  See brin_start_evacuating_page() for details.
+	 */
+	BrinPageFlags(pg) &= ~BRIN_EVACUATE_PAGE;
+}