Diffstat (limited to 'src/backend/access/heap/heapam_handler.c')
-rw-r--r--   src/backend/access/heap/heapam_handler.c   2608
1 file changed, 2608 insertions, 0 deletions
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
new file mode 100644
index 0000000..2168259
--- /dev/null
+++ b/src/backend/access/heap/heapam_handler.c
@@ -0,0 +1,2608 @@
+/*-------------------------------------------------------------------------
+ *
+ * heapam_handler.c
+ * heap table access method code
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/heap/heapam_handler.c
+ *
+ *
+ * NOTES
+ * This file wires up the lower-level heapam.c et al. routines with the
+ * tableam abstraction.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/heapam.h"
+#include "access/heaptoast.h"
+#include "access/multixact.h"
+#include "access/rewriteheap.h"
+#include "access/syncscan.h"
+#include "access/tableam.h"
+#include "access/tsmapi.h"
+#include "access/xact.h"
+#include "catalog/catalog.h"
+#include "catalog/index.h"
+#include "catalog/storage.h"
+#include "catalog/storage_xlog.h"
+#include "commands/progress.h"
+#include "executor/executor.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+#include "storage/lmgr.h"
+#include "storage/predicate.h"
+#include "storage/procarray.h"
+#include "storage/smgr.h"
+#include "utils/builtins.h"
+#include "utils/rel.h"
+
+static void reform_and_rewrite_tuple(HeapTuple tuple,
+ Relation OldHeap, Relation NewHeap,
+ Datum *values, bool *isnull, RewriteState rwstate);
+
+static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
+ HeapTuple tuple,
+ OffsetNumber tupoffset);
+
+static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan);
+
+static const TableAmRoutine heapam_methods;
+
+
+/* ------------------------------------------------------------------------
+ * Slot related callbacks for heap AM
+ * ------------------------------------------------------------------------
+ */
+
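+/*
+ * The heap AM keeps tuples in shared buffers, so it uses buffer heap tuple
+ * slots, which can reference a tuple on a pinned buffer page without
+ * copying it.
+ */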
+static const TupleTableSlotOps *
+heapam_slot_callbacks(Relation relation)
+{
+ return &TTSOpsBufferHeapTuple;
+}
+
+
+/* ------------------------------------------------------------------------
+ * Index Scan Callbacks for heap AM
+ * ------------------------------------------------------------------------
+ */
+
+static IndexFetchTableData *
+heapam_index_fetch_begin(Relation rel)
+{
+ IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData));
+
+ hscan->xs_base.rel = rel;
+ hscan->xs_cbuf = InvalidBuffer;
+
+ return &hscan->xs_base;
+}
+
+static void
+heapam_index_fetch_reset(IndexFetchTableData *scan)
+{
+ IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
+
+ if (BufferIsValid(hscan->xs_cbuf))
+ {
+ ReleaseBuffer(hscan->xs_cbuf);
+ hscan->xs_cbuf = InvalidBuffer;
+ }
+}
+
+static void
+heapam_index_fetch_end(IndexFetchTableData *scan)
+{
+ IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
+
+ heapam_index_fetch_reset(scan);
+
+ pfree(hscan);
+}
+
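+/*
+ * Fetch the tuple identified by tid (or, on repeated calls, the next
+ * visible member of its HOT chain) into the slot.  *call_again is set when
+ * another call might return a further visible chain member (only possible
+ * with non-MVCC snapshots); *all_dead is set when every member of the
+ * chain is dead, so the caller may mark the index entry killed.
+ */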
+static bool
+heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
+ ItemPointer tid,
+ Snapshot snapshot,
+ TupleTableSlot *slot,
+ bool *call_again, bool *all_dead)
+{
+ IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
+ BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
+ bool got_heap_tuple;
+
+ Assert(TTS_IS_BUFFERTUPLE(slot));
+
+ /* We can skip the buffer-switching logic if we're in mid-HOT chain. */
+ if (!*call_again)
+ {
+ /* Switch to correct buffer if we don't have it already */
+ Buffer prev_buf = hscan->xs_cbuf;
+
+ hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf,
+ hscan->xs_base.rel,
+ ItemPointerGetBlockNumber(tid));
+
+ /*
+ * Prune page, but only if we weren't already on this page
+ */
+ if (prev_buf != hscan->xs_cbuf)
+ heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf);
+ }
+
+ /* Obtain share-lock on the buffer so we can examine visibility */
+ LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_SHARE);
+ got_heap_tuple = heap_hot_search_buffer(tid,
+ hscan->xs_base.rel,
+ hscan->xs_cbuf,
+ snapshot,
+ &bslot->base.tupdata,
+ all_dead,
+ !*call_again);
+ bslot->base.tupdata.t_self = *tid;
+ LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_UNLOCK);
+
+ if (got_heap_tuple)
+ {
+ /*
+ * Only in a non-MVCC snapshot can more than one member of the HOT
+ * chain be visible.
+ */
+ *call_again = !IsMVCCSnapshot(snapshot);
+
+ slot->tts_tableOid = RelationGetRelid(scan->rel);
+ ExecStoreBufferHeapTuple(&bslot->base.tupdata, slot, hscan->xs_cbuf);
+ }
+ else
+ {
+ /* We've reached the end of the HOT chain. */
+ *call_again = false;
+ }
+
+ return got_heap_tuple;
+}
+
+
+/* ------------------------------------------------------------------------
+ * Callbacks for non-modifying operations on individual tuples for heap AM
+ * ------------------------------------------------------------------------
+ */
+
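+/*
+ * Fetch the row version identified by tid, if it is visible to the given
+ * snapshot, and store it in the slot (transferring the buffer pin to the
+ * slot).
+ */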
+static bool
+heapam_fetch_row_version(Relation relation,
+ ItemPointer tid,
+ Snapshot snapshot,
+ TupleTableSlot *slot)
+{
+ BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
+ Buffer buffer;
+
+ Assert(TTS_IS_BUFFERTUPLE(slot));
+
+ bslot->base.tupdata.t_self = *tid;
+ if (heap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer, false))
+ {
+ /* store in slot, transferring existing pin */
+ ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, slot, buffer);
+ slot->tts_tableOid = RelationGetRelid(relation);
+
+ return true;
+ }
+
+ return false;
+}
+
+static bool
+heapam_tuple_tid_valid(TableScanDesc scan, ItemPointer tid)
+{
+ HeapScanDesc hscan = (HeapScanDesc) scan;
+
+ return ItemPointerIsValid(tid) &&
+ ItemPointerGetBlockNumber(tid) < hscan->rs_nblocks;
+}
+
+static bool
+heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot,
+ Snapshot snapshot)
+{
+ BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
+ bool res;
+
+ Assert(TTS_IS_BUFFERTUPLE(slot));
+ Assert(BufferIsValid(bslot->buffer));
+
+ /*
+ * We need buffer pin and lock to call HeapTupleSatisfiesVisibility.
+ * Caller should be holding pin, but not lock.
+ */
+ LockBuffer(bslot->buffer, BUFFER_LOCK_SHARE);
+ res = HeapTupleSatisfiesVisibility(bslot->base.tuple, snapshot,
+ bslot->buffer);
+ LockBuffer(bslot->buffer, BUFFER_LOCK_UNLOCK);
+
+ return res;
+}
+
+
+/* ----------------------------------------------------------------------------
+ * Functions for manipulations of physical tuples for heap AM.
+ * ----------------------------------------------------------------------------
+ */
+
+static void
+heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid,
+ int options, BulkInsertState bistate)
+{
+ bool shouldFree = true;
+ HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
+
+ /* Update the tuple with table oid */
+ slot->tts_tableOid = RelationGetRelid(relation);
+ tuple->t_tableOid = slot->tts_tableOid;
+
+ /* Perform the insertion, and copy the resulting ItemPointer */
+ heap_insert(relation, tuple, cid, options, bistate);
+ ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
+
+ if (shouldFree)
+ pfree(tuple);
+}
+
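+/*
+ * Insert a tuple carrying a speculative-insertion token, as used by
+ * INSERT ... ON CONFLICT.  The insertion is later confirmed or aborted via
+ * heapam_tuple_complete_speculative().
+ */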
+static void
+heapam_tuple_insert_speculative(Relation relation, TupleTableSlot *slot,
+ CommandId cid, int options,
+ BulkInsertState bistate, uint32 specToken)
+{
+ bool shouldFree = true;
+ HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
+
+ /* Update the tuple with table oid */
+ slot->tts_tableOid = RelationGetRelid(relation);
+ tuple->t_tableOid = slot->tts_tableOid;
+
+ HeapTupleHeaderSetSpeculativeToken(tuple->t_data, specToken);
+ options |= HEAP_INSERT_SPECULATIVE;
+
+ /* Perform the insertion, and copy the resulting ItemPointer */
+ heap_insert(relation, tuple, cid, options, bistate);
+ ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
+
+ if (shouldFree)
+ pfree(tuple);
+}
+
+static void
+heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot,
+ uint32 specToken, bool succeeded)
+{
+ bool shouldFree = true;
+ HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
+
+ /* adjust the tuple's state accordingly */
+ if (succeeded)
+ heap_finish_speculative(relation, &slot->tts_tid);
+ else
+ heap_abort_speculative(relation, &slot->tts_tid);
+
+ if (shouldFree)
+ pfree(tuple);
+}
+
+static TM_Result
+heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid,
+ Snapshot snapshot, Snapshot crosscheck, bool wait,
+ TM_FailureData *tmfd, bool changingPart)
+{
+ /*
+ * Currently, deleting index tuples is handled at VACUUM time.  If the
+ * storage were to clean up dead tuples by itself, that would also be the
+ * time to remove the corresponding index entries.
+ */
+ return heap_delete(relation, tid, cid, crosscheck, wait, tmfd, changingPart);
+}
+
+
+static TM_Result
+heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot,
+ CommandId cid, Snapshot snapshot, Snapshot crosscheck,
+ bool wait, TM_FailureData *tmfd,
+ LockTupleMode *lockmode, bool *update_indexes)
+{
+ bool shouldFree = true;
+ HeapTuple tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
+ TM_Result result;
+
+ /* Update the tuple with table oid */
+ slot->tts_tableOid = RelationGetRelid(relation);
+ tuple->t_tableOid = slot->tts_tableOid;
+
+ result = heap_update(relation, otid, tuple, cid, crosscheck, wait,
+ tmfd, lockmode);
+ ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
+
+ /*
+ * Decide whether new index entries are needed for the tuple
+ *
+ * Note: heap_update returns the tid (location) of the new tuple in the
+ * t_self field.
+ *
+ * If it's a HOT update, we mustn't insert new index entries.
+ */
+ *update_indexes = result == TM_Ok && !HeapTupleIsHeapOnly(tuple);
+
+ if (shouldFree)
+ pfree(tuple);
+
+ return result;
+}
+
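+/*
+ * Lock the tuple identified by tid.  If TUPLE_LOCK_FLAG_FIND_LAST_VERSION
+ * is set and the tuple has meanwhile been updated, follow the update chain
+ * and lock the latest version instead, setting tmfd->traversed to tell the
+ * caller that a later version was locked.
+ */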
+static TM_Result
+heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot,
+ TupleTableSlot *slot, CommandId cid, LockTupleMode mode,
+ LockWaitPolicy wait_policy, uint8 flags,
+ TM_FailureData *tmfd)
+{
+ BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
+ TM_Result result;
+ Buffer buffer;
+ HeapTuple tuple = &bslot->base.tupdata;
+ bool follow_updates;
+
+ follow_updates = (flags & TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS) != 0;
+ tmfd->traversed = false;
+
+ Assert(TTS_IS_BUFFERTUPLE(slot));
+
+tuple_lock_retry:
+ tuple->t_self = *tid;
+ result = heap_lock_tuple(relation, tuple, cid, mode, wait_policy,
+ follow_updates, &buffer, tmfd);
+
+ if (result == TM_Updated &&
+ (flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION))
+ {
+ /* Should not encounter speculative tuple on recheck */
+ Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data));
+
+ ReleaseBuffer(buffer);
+
+ if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self))
+ {
+ SnapshotData SnapshotDirty;
+ TransactionId priorXmax;
+
+ /* it was updated, so look at the updated version */
+ *tid = tmfd->ctid;
+ /* updated row should have xmin matching this xmax */
+ priorXmax = tmfd->xmax;
+
+ /* signal that a tuple later in the chain is getting locked */
+ tmfd->traversed = true;
+
+ /*
+ * fetch target tuple
+ *
+ * Loop here to deal with updated or busy tuples
+ */
+ InitDirtySnapshot(SnapshotDirty);
+ for (;;)
+ {
+ if (ItemPointerIndicatesMovedPartitions(tid))
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("tuple to be locked was already moved to another partition due to concurrent update")));
+
+ tuple->t_self = *tid;
+ if (heap_fetch(relation, &SnapshotDirty, tuple, &buffer, true))
+ {
+ /*
+ * If xmin isn't what we're expecting, the slot must have
+ * been recycled and reused for an unrelated tuple. This
+ * implies that the latest version of the row was deleted,
+ * so we need do nothing. (Should be safe to examine xmin
+ * without getting buffer's content lock. We assume
+ * reading a TransactionId to be atomic, and Xmin never
+ * changes in an existing tuple, except to invalid or
+ * frozen, and neither of those can match priorXmax.)
+ */
+ if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data),
+ priorXmax))
+ {
+ ReleaseBuffer(buffer);
+ return TM_Deleted;
+ }
+
+ /* otherwise xmin should not be dirty... */
+ if (TransactionIdIsValid(SnapshotDirty.xmin))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg_internal("t_xmin %u is uncommitted in tuple (%u,%u) to be updated in table \"%s\"",
+ SnapshotDirty.xmin,
+ ItemPointerGetBlockNumber(&tuple->t_self),
+ ItemPointerGetOffsetNumber(&tuple->t_self),
+ RelationGetRelationName(relation))));
+
+ /*
+ * If tuple is being updated by other transaction then we
+ * have to wait for its commit/abort, or die trying.
+ */
+ if (TransactionIdIsValid(SnapshotDirty.xmax))
+ {
+ ReleaseBuffer(buffer);
+ switch (wait_policy)
+ {
+ case LockWaitBlock:
+ XactLockTableWait(SnapshotDirty.xmax,
+ relation, &tuple->t_self,
+ XLTW_FetchUpdated);
+ break;
+ case LockWaitSkip:
+ if (!ConditionalXactLockTableWait(SnapshotDirty.xmax))
+ /* skip instead of waiting */
+ return TM_WouldBlock;
+ break;
+ case LockWaitError:
+ if (!ConditionalXactLockTableWait(SnapshotDirty.xmax))
+ ereport(ERROR,
+ (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
+ errmsg("could not obtain lock on row in relation \"%s\"",
+ RelationGetRelationName(relation))));
+ break;
+ }
+ continue; /* loop back to repeat heap_fetch */
+ }
+
+ /*
+ * If tuple was inserted by our own transaction, we have
+ * to check cmin against cid: cmin >= current CID means
+ * our command cannot see the tuple, so we should ignore
+ * it. Otherwise heap_lock_tuple() will throw an error,
+ * and so would any later attempt to update or delete the
+ * tuple. (We need not check cmax because
+ * HeapTupleSatisfiesDirty will consider a tuple deleted
+ * by our transaction dead, regardless of cmax.) We just
+ * checked that priorXmax == xmin, so we can test that
+ * variable instead of doing HeapTupleHeaderGetXmin again.
+ */
+ if (TransactionIdIsCurrentTransactionId(priorXmax) &&
+ HeapTupleHeaderGetCmin(tuple->t_data) >= cid)
+ {
+ tmfd->xmax = priorXmax;
+
+ /*
+ * Cmin is the problematic value, so store that. See
+ * above.
+ */
+ tmfd->cmax = HeapTupleHeaderGetCmin(tuple->t_data);
+ ReleaseBuffer(buffer);
+ return TM_SelfModified;
+ }
+
+ /*
+ * This is a live tuple, so try to lock it again.
+ */
+ ReleaseBuffer(buffer);
+ goto tuple_lock_retry;
+ }
+
+ /*
+ * If the referenced slot was actually empty, the latest
+ * version of the row must have been deleted, so we need do
+ * nothing.
+ */
+ if (tuple->t_data == NULL)
+ {
+ Assert(!BufferIsValid(buffer));
+ return TM_Deleted;
+ }
+
+ /*
+ * As above, if xmin isn't what we're expecting, do nothing.
+ */
+ if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data),
+ priorXmax))
+ {
+ ReleaseBuffer(buffer);
+ return TM_Deleted;
+ }
+
+ /*
+ * If we get here, the tuple was found but failed
+ * SnapshotDirty. Assuming the xmin is either a committed xact
+ * or our own xact (as it certainly should be if we're trying
+ * to modify the tuple), this must mean that the row was
+ * updated or deleted by either a committed xact or our own
+ * xact. If it was deleted, we can ignore it; if it was
+ * updated then chain up to the next version and repeat the
+ * whole process.
+ *
+ * As above, it should be safe to examine xmax and t_ctid
+ * without the buffer content lock, because they can't be
+ * changing. We'd better hold a buffer pin though.
+ */
+ if (ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
+ {
+ /* deleted, so forget about it */
+ ReleaseBuffer(buffer);
+ return TM_Deleted;
+ }
+
+ /* updated, so look at the updated row */
+ *tid = tuple->t_data->t_ctid;
+ /* updated row should have xmin matching this xmax */
+ priorXmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
+ ReleaseBuffer(buffer);
+ /* loop back to fetch next in chain */
+ }
+ }
+ else
+ {
+ /* tuple was deleted, so give up */
+ return TM_Deleted;
+ }
+ }
+
+ slot->tts_tableOid = RelationGetRelid(relation);
+ tuple->t_tableOid = slot->tts_tableOid;
+
+ /* store in slot, transferring existing pin */
+ ExecStorePinnedBufferHeapTuple(tuple, slot, buffer);
+
+ return result;
+}
+
+
+/* ------------------------------------------------------------------------
+ * DDL related callbacks for heap AM.
+ * ------------------------------------------------------------------------
+ */
+
+static void
+heapam_relation_set_new_filenode(Relation rel,
+ const RelFileNode *newrnode,
+ char persistence,
+ TransactionId *freezeXid,
+ MultiXactId *minmulti)
+{
+ SMgrRelation srel;
+
+ /*
+ * Initialize to the minimum XID that could put tuples in the table. We
+ * know that no xacts older than RecentXmin are still running, so that
+ * will do.
+ */
+ *freezeXid = RecentXmin;
+
+ /*
+ * Similarly, initialize the minimum Multixact to the first value that
+ * could possibly be stored in tuples in the table. Running transactions
+ * could reuse values from their local cache, so we are careful to
+ * consider all currently running multis.
+ *
+ * XXX this could be refined further, but is it worth the hassle?
+ */
+ *minmulti = GetOldestMultiXactId();
+
+ srel = RelationCreateStorage(*newrnode, persistence, true);
+
+ /*
+ * If required, set up an init fork for an unlogged table so that it can
+ * be correctly reinitialized on restart. An immediate sync is required
+ * even if the page has been logged, because the write did not go through
+ * shared_buffers and therefore a concurrent checkpoint may have moved the
+ * redo pointer past our xlog record.  Recovery might also remove the init
+ * fork while replaying, for example, an XLOG_DBASE_CREATE* or
+ * XLOG_TBLSPC_CREATE record, so logging is necessary even if
+ * wal_level=minimal.
+ */
+ if (persistence == RELPERSISTENCE_UNLOGGED)
+ {
+ Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
+ rel->rd_rel->relkind == RELKIND_MATVIEW ||
+ rel->rd_rel->relkind == RELKIND_TOASTVALUE);
+ smgrcreate(srel, INIT_FORKNUM, false);
+ log_smgrcreate(newrnode, INIT_FORKNUM);
+ smgrimmedsync(srel, INIT_FORKNUM);
+ }
+
+ smgrclose(srel);
+}
+
+static void
+heapam_relation_nontransactional_truncate(Relation rel)
+{
+ RelationTruncate(rel, 0);
+}
+
+static void
+heapam_relation_copy_data(Relation rel, const RelFileNode *newrnode)
+{
+ SMgrRelation dstrel;
+
+ dstrel = smgropen(*newrnode, rel->rd_backend);
+
+ /*
+ * Since we copy the file directly without looking at the shared buffers,
+ * we'd better first flush out any pages of the source relation that are
+ * in shared buffers. We assume no new changes will be made while we are
+ * holding exclusive lock on the rel.
+ */
+ FlushRelationBuffers(rel);
+
+ /*
+ * Create and copy all forks of the relation, and schedule unlinking of
+ * old physical files.
+ *
+ * NOTE: any conflict in relfilenode value will be caught in
+ * RelationCreateStorage().
+ */
+ RelationCreateStorage(*newrnode, rel->rd_rel->relpersistence, true);
+
+ /* copy main fork */
+ RelationCopyStorage(RelationGetSmgr(rel), dstrel, MAIN_FORKNUM,
+ rel->rd_rel->relpersistence);
+
+ /* copy those extra forks that exist */
+ for (ForkNumber forkNum = MAIN_FORKNUM + 1;
+ forkNum <= MAX_FORKNUM; forkNum++)
+ {
+ if (smgrexists(RelationGetSmgr(rel), forkNum))
+ {
+ smgrcreate(dstrel, forkNum, false);
+
+ /*
+ * WAL log creation if the relation is persistent, or this is the
+ * init fork of an unlogged relation.
+ */
+ if (RelationIsPermanent(rel) ||
+ (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
+ forkNum == INIT_FORKNUM))
+ log_smgrcreate(newrnode, forkNum);
+ RelationCopyStorage(RelationGetSmgr(rel), dstrel, forkNum,
+ rel->rd_rel->relpersistence);
+ }
+ }
+
+
+ /* drop old relation, and close new one */
+ RelationDropStorage(rel);
+ smgrclose(dstrel);
+}
+
+static void
+heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
+ Relation OldIndex, bool use_sort,
+ TransactionId OldestXmin,
+ TransactionId *xid_cutoff,
+ MultiXactId *multi_cutoff,
+ double *num_tuples,
+ double *tups_vacuumed,
+ double *tups_recently_dead)
+{
+ RewriteState rwstate;
+ IndexScanDesc indexScan;
+ TableScanDesc tableScan;
+ HeapScanDesc heapScan;
+ bool is_system_catalog;
+ Tuplesortstate *tuplesort;
+ TupleDesc oldTupDesc = RelationGetDescr(OldHeap);
+ TupleDesc newTupDesc = RelationGetDescr(NewHeap);
+ TupleTableSlot *slot;
+ int natts;
+ Datum *values;
+ bool *isnull;
+ BufferHeapTupleTableSlot *hslot;
+ BlockNumber prev_cblock = InvalidBlockNumber;
+
+ /* Remember if it's a system catalog */
+ is_system_catalog = IsSystemRelation(OldHeap);
+
+ /*
+ * Valid smgr_targblock implies something already wrote to the relation.
+ * This may be harmless, but this function hasn't planned for it.
+ */
+ Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
+
+ /* Preallocate values/isnull arrays */
+ natts = newTupDesc->natts;
+ values = (Datum *) palloc(natts * sizeof(Datum));
+ isnull = (bool *) palloc(natts * sizeof(bool));
+
+ /* Initialize the rewrite operation */
+ rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, *xid_cutoff,
+ *multi_cutoff);
+
+
+ /* Set up sorting if wanted */
+ if (use_sort)
+ tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
+ maintenance_work_mem,
+ NULL, TUPLESORT_NONE);
+ else
+ tuplesort = NULL;
+
+ /*
+ * Prepare to scan the OldHeap. To ensure we see recently-dead tuples
+ * that still need to be copied, we scan with SnapshotAny and use
+ * HeapTupleSatisfiesVacuum for the visibility test.
+ */
+ if (OldIndex != NULL && !use_sort)
+ {
+ const int ci_index[] = {
+ PROGRESS_CLUSTER_PHASE,
+ PROGRESS_CLUSTER_INDEX_RELID
+ };
+ int64 ci_val[2];
+
+ /* Report the phase and the old index's OID via the progress columns */
+ ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP;
+ ci_val[1] = RelationGetRelid(OldIndex);
+ pgstat_progress_update_multi_param(2, ci_index, ci_val);
+
+ tableScan = NULL;
+ heapScan = NULL;
+ indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
+ index_rescan(indexScan, NULL, 0, NULL, 0);
+ }
+ else
+ {
+ /* In scan-and-sort mode and also VACUUM FULL, set phase */
+ pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
+ PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP);
+
+ tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
+ heapScan = (HeapScanDesc) tableScan;
+ indexScan = NULL;
+
+ /* Set total heap blocks */
+ pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS,
+ heapScan->rs_nblocks);
+ }
+
+ slot = table_slot_create(OldHeap, NULL);
+ hslot = (BufferHeapTupleTableSlot *) slot;
+
+ /*
+ * Scan through the OldHeap, either in OldIndex order or sequentially;
+ * copy each tuple into the NewHeap, or transiently to the tuplesort
+ * module. Note that we don't bother sorting dead tuples (they won't get
+ * to the new table anyway).
+ */
+ for (;;)
+ {
+ HeapTuple tuple;
+ Buffer buf;
+ bool isdead;
+
+ CHECK_FOR_INTERRUPTS();
+
+ if (indexScan != NULL)
+ {
+ if (!index_getnext_slot(indexScan, ForwardScanDirection, slot))
+ break;
+
+ /* Since we used no scan keys, should never need to recheck */
+ if (indexScan->xs_recheck)
+ elog(ERROR, "CLUSTER does not support lossy index conditions");
+ }
+ else
+ {
+ if (!table_scan_getnextslot(tableScan, ForwardScanDirection, slot))
+ {
+ /*
+ * If the last pages of the scan were empty, we would go to
+ * the next phase while heap_blks_scanned != heap_blks_total.
+ * Instead, to ensure that heap_blks_scanned is equivalent to
+ * total_heap_blks after the table scan phase, this parameter
+ * is manually updated to the correct value when the table
+ * scan finishes.
+ */
+ pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED,
+ heapScan->rs_nblocks);
+ break;
+ }
+
+ /*
+ * In scan-and-sort mode and also VACUUM FULL, set heap blocks
+ * scanned
+ *
+ * Note that heapScan may start at an offset and wrap around, i.e.
+ * rs_startblock may be >0, and rs_cblock may end with a number
+ * below rs_startblock. To prevent showing this wraparound to the
+ * user, we offset rs_cblock by rs_startblock (modulo rs_nblocks).
+ */
+ if (prev_cblock != heapScan->rs_cblock)
+ {
+ pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED,
+ (heapScan->rs_cblock +
+ heapScan->rs_nblocks -
+ heapScan->rs_startblock
+ ) % heapScan->rs_nblocks + 1);
+ prev_cblock = heapScan->rs_cblock;
+ }
+ }
+
+ tuple = ExecFetchSlotHeapTuple(slot, false, NULL);
+ buf = hslot->buffer;
+
+ LockBuffer(buf, BUFFER_LOCK_SHARE);
+
+ switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))
+ {
+ case HEAPTUPLE_DEAD:
+ /* Definitely dead */
+ isdead = true;
+ break;
+ case HEAPTUPLE_RECENTLY_DEAD:
+ *tups_recently_dead += 1;
+ /* fall through */
+ case HEAPTUPLE_LIVE:
+ /* Live or recently dead, must copy it */
+ isdead = false;
+ break;
+ case HEAPTUPLE_INSERT_IN_PROGRESS:
+
+ /*
+ * Since we hold exclusive lock on the relation, normally the
+ * only way to see this is if it was inserted earlier in our
+ * own transaction. However, it can happen in system
+ * catalogs, since we tend to release write lock before commit
+ * there. Give a warning if neither case applies; but in any
+ * case we had better copy it.
+ */
+ if (!is_system_catalog &&
+ !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
+ elog(WARNING, "concurrent insert in progress within table \"%s\"",
+ RelationGetRelationName(OldHeap));
+ /* treat as live */
+ isdead = false;
+ break;
+ case HEAPTUPLE_DELETE_IN_PROGRESS:
+
+ /*
+ * Similar situation to INSERT_IN_PROGRESS case.
+ */
+ if (!is_system_catalog &&
+ !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
+ elog(WARNING, "concurrent delete in progress within table \"%s\"",
+ RelationGetRelationName(OldHeap));
+ /* treat as recently dead */
+ *tups_recently_dead += 1;
+ isdead = false;
+ break;
+ default:
+ elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
+ isdead = false; /* keep compiler quiet */
+ break;
+ }
+
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+ if (isdead)
+ {
+ *tups_vacuumed += 1;
+ /* heap rewrite module still needs to see it... */
+ if (rewrite_heap_dead_tuple(rwstate, tuple))
+ {
+ /* A previous recently-dead tuple is now known dead */
+ *tups_vacuumed += 1;
+ *tups_recently_dead -= 1;
+ }
+ continue;
+ }
+
+ *num_tuples += 1;
+ if (tuplesort != NULL)
+ {
+ tuplesort_putheaptuple(tuplesort, tuple);
+
+ /*
+ * In scan-and-sort mode, report increase in number of tuples
+ * scanned
+ */
+ pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
+ *num_tuples);
+ }
+ else
+ {
+ const int ct_index[] = {
+ PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
+ PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN
+ };
+ int64 ct_val[2];
+
+ reform_and_rewrite_tuple(tuple, OldHeap, NewHeap,
+ values, isnull, rwstate);
+
+ /*
+ * In indexscan mode and also VACUUM FULL, report increase in
+ * number of tuples scanned and written
+ */
+ ct_val[0] = *num_tuples;
+ ct_val[1] = *num_tuples;
+ pgstat_progress_update_multi_param(2, ct_index, ct_val);
+ }
+ }
+
+ if (indexScan != NULL)
+ index_endscan(indexScan);
+ if (tableScan != NULL)
+ table_endscan(tableScan);
+ if (slot)
+ ExecDropSingleTupleTableSlot(slot);
+
+ /*
+ * In scan-and-sort mode, complete the sort, then read out all live tuples
+ * from the tuplestore and write them to the new relation.
+ */
+ if (tuplesort != NULL)
+ {
+ double n_tuples = 0;
+
+ /* Report that we are now sorting tuples */
+ pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
+ PROGRESS_CLUSTER_PHASE_SORT_TUPLES);
+
+ tuplesort_performsort(tuplesort);
+
+ /* Report that we are now writing new heap */
+ pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
+ PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP);
+
+ for (;;)
+ {
+ HeapTuple tuple;
+
+ CHECK_FOR_INTERRUPTS();
+
+ tuple = tuplesort_getheaptuple(tuplesort, true);
+ if (tuple == NULL)
+ break;
+
+ n_tuples += 1;
+ reform_and_rewrite_tuple(tuple,
+ OldHeap, NewHeap,
+ values, isnull,
+ rwstate);
+ /* Report n_tuples */
+ pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN,
+ n_tuples);
+ }
+
+ tuplesort_end(tuplesort);
+ }
+
+ /* Write out any remaining tuples, and fsync if needed */
+ end_heap_rewrite(rwstate);
+
+ /* Clean up */
+ pfree(values);
+ pfree(isnull);
+}
+
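+/*
+ * Prepare the given block for ANALYZE sampling: read it (using the caller's
+ * buffer access strategy) and share-lock it, leaving the lock and pin held
+ * for heapam_scan_analyze_next_tuple() to iterate over its tuples.
+ */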
+static bool
+heapam_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
+ BufferAccessStrategy bstrategy)
+{
+ HeapScanDesc hscan = (HeapScanDesc) scan;
+
+ /*
+ * We must maintain a pin on the target page's buffer to ensure that
+ * concurrent activity - e.g. HOT pruning - doesn't delete tuples out from
+ * under us. Hence, pin the page until we are done looking at it. We
+ * also choose to hold sharelock on the buffer throughout --- we could
+ * release and re-acquire sharelock for each tuple, but since we aren't
+ * doing much work per tuple, the extra lock traffic is probably better
+ * avoided.
+ */
+ hscan->rs_cblock = blockno;
+ hscan->rs_cindex = FirstOffsetNumber;
+ hscan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM,
+ blockno, RBM_NORMAL, bstrategy);
+ LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
+
+ /* in heap all blocks can contain tuples, so always return true */
+ return true;
+}
+
+static bool
+heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
+ double *liverows, double *deadrows,
+ TupleTableSlot *slot)
+{
+ HeapScanDesc hscan = (HeapScanDesc) scan;
+ Page targpage;
+ OffsetNumber maxoffset;
+ BufferHeapTupleTableSlot *hslot;
+
+ Assert(TTS_IS_BUFFERTUPLE(slot));
+
+ hslot = (BufferHeapTupleTableSlot *) slot;
+ targpage = BufferGetPage(hscan->rs_cbuf);
+ maxoffset = PageGetMaxOffsetNumber(targpage);
+
+ /* Inner loop over all tuples on the selected page */
+ for (; hscan->rs_cindex <= maxoffset; hscan->rs_cindex++)
+ {
+ ItemId itemid;
+ HeapTuple targtuple = &hslot->base.tupdata;
+ bool sample_it = false;
+
+ itemid = PageGetItemId(targpage, hscan->rs_cindex);
+
+ /*
+ * We ignore unused and redirect line pointers. DEAD line pointers
+ * should be counted as dead, because we need vacuum to run to get rid
+ * of them. Note that this rule agrees with the way that
+ * heap_page_prune() counts things.
+ */
+ if (!ItemIdIsNormal(itemid))
+ {
+ if (ItemIdIsDead(itemid))
+ *deadrows += 1;
+ continue;
+ }
+
+ ItemPointerSet(&targtuple->t_self, hscan->rs_cblock, hscan->rs_cindex);
+
+ targtuple->t_tableOid = RelationGetRelid(scan->rs_rd);
+ targtuple->t_data = (HeapTupleHeader) PageGetItem(targpage, itemid);
+ targtuple->t_len = ItemIdGetLength(itemid);
+
+ switch (HeapTupleSatisfiesVacuum(targtuple, OldestXmin,
+ hscan->rs_cbuf))
+ {
+ case HEAPTUPLE_LIVE:
+ sample_it = true;
+ *liverows += 1;
+ break;
+
+ case HEAPTUPLE_DEAD:
+ case HEAPTUPLE_RECENTLY_DEAD:
+ /* Count dead and recently-dead rows */
+ *deadrows += 1;
+ break;
+
+ case HEAPTUPLE_INSERT_IN_PROGRESS:
+
+ /*
+ * Insert-in-progress rows are not counted. We assume that
+ * when the inserting transaction commits or aborts, it will
+ * send a stats message to increment the proper count. This
+ * works right only if that transaction ends after we finish
+ * analyzing the table; if things happen in the other order,
+ * its stats update will be overwritten by ours. However, the
+ * error will be large only if the other transaction runs long
+ * enough to insert many tuples, so assuming it will finish
+ * after us is the safer option.
+ *
+ * A special case is that the inserting transaction might be
+ * our own. In this case we should count and sample the row,
+ * to accommodate users who load a table and analyze it in one
+ * transaction. (pgstat_report_analyze has to adjust the
+ * numbers we report to the cumulative stats system to make
+ * this come out right.)
+ */
+ if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data)))
+ {
+ sample_it = true;
+ *liverows += 1;
+ }
+ break;
+
+ case HEAPTUPLE_DELETE_IN_PROGRESS:
+
+ /*
+ * We count and sample delete-in-progress rows the same as
+ * live ones, so that the stats counters come out right if the
+ * deleting transaction commits after us, per the same
+ * reasoning given above.
+ *
+ * If the delete was done by our own transaction, however, we
+ * must count the row as dead to make pgstat_report_analyze's
+ * stats adjustments come out right. (Note: this works out
+ * properly when the row was both inserted and deleted in our
+ * xact.)
+ *
+ * The net effect of these choices is that we act as though an
+ * IN_PROGRESS transaction hasn't happened yet, except if it
+ * is our own transaction, which we assume has happened.
+ *
+ * This approach ensures that we behave sanely if we see both
+ * the pre-image and post-image rows for a row being updated
+ * by a concurrent transaction: we will sample the pre-image
+ * but not the post-image. We also get sane results if the
+ * concurrent transaction never commits.
+ */
+ if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data)))
+ *deadrows += 1;
+ else
+ {
+ sample_it = true;
+ *liverows += 1;
+ }
+ break;
+
+ default:
+ elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
+ break;
+ }
+
+ if (sample_it)
+ {
+ ExecStoreBufferHeapTuple(targtuple, slot, hscan->rs_cbuf);
+ hscan->rs_cindex++;
+
+ /* note that we leave the buffer locked here! */
+ return true;
+ }
+ }
+
+ /* Now release the lock and pin on the page */
+ UnlockReleaseBuffer(hscan->rs_cbuf);
+ hscan->rs_cbuf = InvalidBuffer;
+
+ /* also prevent old slot contents from having pin on page */
+ ExecClearTuple(slot);
+
+ return false;
+}
+
+static double
+heapam_index_build_range_scan(Relation heapRelation,
+ Relation indexRelation,
+ IndexInfo *indexInfo,
+ bool allow_sync,
+ bool anyvisible,
+ bool progress,
+ BlockNumber start_blockno,
+ BlockNumber numblocks,
+ IndexBuildCallback callback,
+ void *callback_state,
+ TableScanDesc scan)
+{
+ HeapScanDesc hscan;
+ bool is_system_catalog;
+ bool checking_uniqueness;
+ HeapTuple heapTuple;
+ Datum values[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+ double reltuples;
+ ExprState *predicate;
+ TupleTableSlot *slot;
+ EState *estate;
+ ExprContext *econtext;
+ Snapshot snapshot;
+ bool need_unregister_snapshot = false;
+ TransactionId OldestXmin;
+ BlockNumber previous_blkno = InvalidBlockNumber;
+ BlockNumber root_blkno = InvalidBlockNumber;
+ OffsetNumber root_offsets[MaxHeapTuplesPerPage];
+
+ /*
+ * sanity checks
+ */
+ Assert(OidIsValid(indexRelation->rd_rel->relam));
+
+ /* Remember if it's a system catalog */
+ is_system_catalog = IsSystemRelation(heapRelation);
+
+ /* See whether we're verifying uniqueness/exclusion properties */
+ checking_uniqueness = (indexInfo->ii_Unique ||
+ indexInfo->ii_ExclusionOps != NULL);
+
+ /*
+ * "Any visible" mode is not compatible with uniqueness checks; make sure
+ * only one of those is requested.
+ */
+ Assert(!(anyvisible && checking_uniqueness));
+
+ /*
+ * Need an EState for evaluation of index expressions and partial-index
+ * predicates. Also a slot to hold the current tuple.
+ */
+ estate = CreateExecutorState();
+ econtext = GetPerTupleExprContext(estate);
+ slot = table_slot_create(heapRelation, NULL);
+
+ /* Arrange for econtext's scan tuple to be the tuple under test */
+ econtext->ecxt_scantuple = slot;
+
+ /* Set up execution state for predicate, if any. */
+ predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
+
+ /*
+ * Prepare for scan of the base relation. In a normal index build, we use
+ * SnapshotAny because we must retrieve all tuples and do our own time
+ * qual checks (because we have to index RECENTLY_DEAD tuples). In a
+ * concurrent build, or during bootstrap, we take a regular MVCC snapshot
+ * and index whatever's live according to that.
+ */
+ OldestXmin = InvalidTransactionId;
+
+ /* okay to ignore lazy VACUUMs here */
+ if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent)
+ OldestXmin = GetOldestNonRemovableTransactionId(heapRelation);
+
+ if (!scan)
+ {
+ /*
+ * Serial index build.
+ *
+ * Must begin our own heap scan in this case. We may also need to
+ * register a snapshot whose lifetime is under our direct control.
+ */
+ if (!TransactionIdIsValid(OldestXmin))
+ {
+ snapshot = RegisterSnapshot(GetTransactionSnapshot());
+ need_unregister_snapshot = true;
+ }
+ else
+ snapshot = SnapshotAny;
+
+ scan = table_beginscan_strat(heapRelation, /* relation */
+ snapshot, /* snapshot */
+ 0, /* number of keys */
+ NULL, /* scan key */
+ true, /* buffer access strategy OK */
+ allow_sync); /* syncscan OK? */
+ }
+ else
+ {
+ /*
+ * Parallel index build.
+ *
+ * Parallel case never registers/unregisters own snapshot. Snapshot
+ * is taken from parallel heap scan, and is SnapshotAny or an MVCC
+ * snapshot, based on same criteria as serial case.
+ */
+ Assert(!IsBootstrapProcessingMode());
+ Assert(allow_sync);
+ snapshot = scan->rs_snapshot;
+ }
+
+ hscan = (HeapScanDesc) scan;
+
+ /*
+ * Must have called GetOldestNonRemovableTransactionId() if using
+ * SnapshotAny. Shouldn't have for an MVCC snapshot. (It's especially
+ * worth checking this for parallel builds, since ambuild routines that
+ * support parallel builds must work these details out for themselves.)
+ */
+ Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot));
+ Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) :
+ !TransactionIdIsValid(OldestXmin));
+ Assert(snapshot == SnapshotAny || !anyvisible);
+
+ /* Publish number of blocks to scan */
+ if (progress)
+ {
+ BlockNumber nblocks;
+
+ if (hscan->rs_base.rs_parallel != NULL)
+ {
+ ParallelBlockTableScanDesc pbscan;
+
+ pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
+ nblocks = pbscan->phs_nblocks;
+ }
+ else
+ nblocks = hscan->rs_nblocks;
+
+ pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
+ nblocks);
+ }
+
+ /* set our scan endpoints */
+ if (!allow_sync)
+ heap_setscanlimits(scan, start_blockno, numblocks);
+ else
+ {
+ /* syncscan can only be requested on whole relation */
+ Assert(start_blockno == 0);
+ Assert(numblocks == InvalidBlockNumber);
+ }
+
+ reltuples = 0;
+
+ /*
+ * Scan all tuples in the base relation.
+ */
+ while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+ {
+ bool tupleIsAlive;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* Report scan progress, if asked to. */
+ if (progress)
+ {
+ BlockNumber blocks_done = heapam_scan_get_blocks_done(hscan);
+
+ if (blocks_done != previous_blkno)
+ {
+ pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
+ blocks_done);
+ previous_blkno = blocks_done;
+ }
+ }
+
+ /*
+ * When dealing with a HOT-chain of updated tuples, we want to index
+ * the values of the live tuple (if any), but index it under the TID
+ * of the chain's root tuple. This approach is necessary to preserve
+ * the HOT-chain structure in the heap. So we need to be able to find
+ * the root item offset for every tuple that's in a HOT-chain. When
+ * first reaching a new page of the relation, call
+ * heap_get_root_tuples() to build a map of root item offsets on the
+ * page.
+ *
+ * It might look unsafe to use this information across buffer
+ * lock/unlock. However, we hold ShareLock on the table so no
+ * ordinary insert/update/delete should occur; and we hold pin on the
+ * buffer continuously while visiting the page, so no pruning
+ * operation can occur either.
+ *
+ * In cases with only ShareUpdateExclusiveLock on the table, it's
+ * possible for some HOT tuples to appear that we didn't know about
+ * when we first read the page. To handle that case, we re-obtain the
+ * list of root offsets when a HOT tuple points to a root item that we
+ * don't know about.
+ *
+ * Also, although our opinions about tuple liveness could change while
+ * we scan the page (due to concurrent transaction commits/aborts),
+ * the chain root locations won't, so this info doesn't need to be
+ * rebuilt after waiting for another transaction.
+ *
+ * Note the implied assumption that there is no more than one live
+ * tuple per HOT-chain --- else we could create more than one index
+ * entry pointing to the same root tuple.
+ */
+ if (hscan->rs_cblock != root_blkno)
+ {
+ Page page = BufferGetPage(hscan->rs_cbuf);
+
+ LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
+ heap_get_root_tuples(page, root_offsets);
+ LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+
+ root_blkno = hscan->rs_cblock;
+ }
+
+ if (snapshot == SnapshotAny)
+ {
+ /* do our own time qual check */
+ bool indexIt;
+ TransactionId xwait;
+
+ recheck:
+
+ /*
+ * We could possibly get away with not locking the buffer here,
+ * since caller should hold ShareLock on the relation, but let's
+ * be conservative about it. (This remark is still correct even
+ * with HOT-pruning: our pin on the buffer prevents pruning.)
+ */
+ LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
+
+ /*
+ * The criteria for counting a tuple as live in this block need to
+ * match what analyze.c's heapam_scan_analyze_next_tuple() does,
+ * otherwise CREATE INDEX and ANALYZE may produce wildly different
+ * reltuples values, e.g. when there are many recently-dead
+ * tuples.
+ */
+ switch (HeapTupleSatisfiesVacuum(heapTuple, OldestXmin,
+ hscan->rs_cbuf))
+ {
+ case HEAPTUPLE_DEAD:
+ /* Definitely dead, we can ignore it */
+ indexIt = false;
+ tupleIsAlive = false;
+ break;
+ case HEAPTUPLE_LIVE:
+ /* Normal case, index and unique-check it */
+ indexIt = true;
+ tupleIsAlive = true;
+ /* Count it as live, too */
+ reltuples += 1;
+ break;
+ case HEAPTUPLE_RECENTLY_DEAD:
+
+ /*
+ * If tuple is recently deleted then we must index it
+ * anyway to preserve MVCC semantics. (Pre-existing
+ * transactions could try to use the index after we finish
+ * building it, and may need to see such tuples.)
+ *
+ * However, if it was HOT-updated then we must only index
+ * the live tuple at the end of the HOT-chain. Since this
+ * breaks semantics for pre-existing snapshots, mark the
+ * index as unusable for them.
+ *
+ * We don't count recently-dead tuples in reltuples, even
+ * if we index them; see heapam_scan_analyze_next_tuple().
+ */
+ if (HeapTupleIsHotUpdated(heapTuple))
+ {
+ indexIt = false;
+ /* mark the index as unsafe for old snapshots */
+ indexInfo->ii_BrokenHotChain = true;
+ }
+ else
+ indexIt = true;
+ /* In any case, exclude the tuple from unique-checking */
+ tupleIsAlive = false;
+ break;
+ case HEAPTUPLE_INSERT_IN_PROGRESS:
+
+ /*
+ * In "anyvisible" mode, this tuple is visible and we
+ * don't need any further checks.
+ */
+ if (anyvisible)
+ {
+ indexIt = true;
+ tupleIsAlive = true;
+ reltuples += 1;
+ break;
+ }
+
+ /*
+ * Since caller should hold ShareLock or better, normally
+ * the only way to see this is if it was inserted earlier
+ * in our own transaction. However, it can happen in
+ * system catalogs, since we tend to release write lock
+ * before commit there. Give a warning if neither case
+ * applies.
+ */
+ xwait = HeapTupleHeaderGetXmin(heapTuple->t_data);
+ if (!TransactionIdIsCurrentTransactionId(xwait))
+ {
+ if (!is_system_catalog)
+ elog(WARNING, "concurrent insert in progress within table \"%s\"",
+ RelationGetRelationName(heapRelation));
+
+ /*
+ * If we are performing uniqueness checks, indexing
+ * such a tuple could lead to a bogus uniqueness
+ * failure. In that case we wait for the inserting
+ * transaction to finish and check again.
+ */
+ if (checking_uniqueness)
+ {
+ /*
+ * Must drop the lock on the buffer before we wait
+ */
+ LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+ XactLockTableWait(xwait, heapRelation,
+ &heapTuple->t_self,
+ XLTW_InsertIndexUnique);
+ CHECK_FOR_INTERRUPTS();
+ goto recheck;
+ }
+ }
+ else
+ {
+ /*
+ * For consistency with
+ * heapam_scan_analyze_next_tuple(), count
+ * HEAPTUPLE_INSERT_IN_PROGRESS tuples as live only
+ * when inserted by our own transaction.
+ */
+ reltuples += 1;
+ }
+
+ /*
+ * We must index such tuples, since if the index build
+ * commits then they're good.
+ */
+ indexIt = true;
+ tupleIsAlive = true;
+ break;
+ case HEAPTUPLE_DELETE_IN_PROGRESS:
+
+ /*
+ * As with INSERT_IN_PROGRESS case, this is unexpected
+ * unless it's our own deletion or a system catalog; but
+ * in anyvisible mode, this tuple is visible.
+ */
+ if (anyvisible)
+ {
+ indexIt = true;
+ tupleIsAlive = false;
+ reltuples += 1;
+ break;
+ }
+
+ xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
+ if (!TransactionIdIsCurrentTransactionId(xwait))
+ {
+ if (!is_system_catalog)
+ elog(WARNING, "concurrent delete in progress within table \"%s\"",
+ RelationGetRelationName(heapRelation));
+
+ /*
+ * If we are performing uniqueness checks, assuming
+ * the tuple is dead could lead to missing a
+ * uniqueness violation. In that case we wait for the
+ * deleting transaction to finish and check again.
+ *
+ * Also, if it's a HOT-updated tuple, we should not
+ * index it but rather the live tuple at the end of
+ * the HOT-chain. However, the deleting transaction
+ * could abort, possibly leaving this tuple as live
+ * after all, in which case it has to be indexed. The
+ * only way to know what to do is to wait for the
+ * deleting transaction to finish and check again.
+ */
+ if (checking_uniqueness ||
+ HeapTupleIsHotUpdated(heapTuple))
+ {
+ /*
+ * Must drop the lock on the buffer before we wait
+ */
+ LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+ XactLockTableWait(xwait, heapRelation,
+ &heapTuple->t_self,
+ XLTW_InsertIndexUnique);
+ CHECK_FOR_INTERRUPTS();
+ goto recheck;
+ }
+
+ /*
+ * Otherwise index it but don't check for uniqueness,
+ * the same as a RECENTLY_DEAD tuple.
+ */
+ indexIt = true;
+
+ /*
+ * Count HEAPTUPLE_DELETE_IN_PROGRESS tuples as live,
+ * if they were not deleted by the current
+ * transaction. That's what
+ * heapam_scan_analyze_next_tuple() does, and we want
+ * the behavior to be consistent.
+ */
+ reltuples += 1;
+ }
+ else if (HeapTupleIsHotUpdated(heapTuple))
+ {
+ /*
+ * It's a HOT-updated tuple deleted by our own xact.
+ * We can assume the deletion will commit (else the
+ * index contents don't matter), so treat the same as
+ * RECENTLY_DEAD HOT-updated tuples.
+ */
+ indexIt = false;
+ /* mark the index as unsafe for old snapshots */
+ indexInfo->ii_BrokenHotChain = true;
+ }
+ else
+ {
+ /*
+ * It's a regular tuple deleted by our own xact. Index
+ * it, but don't check for uniqueness nor count in
+ * reltuples, the same as a RECENTLY_DEAD tuple.
+ */
+ indexIt = true;
+ }
+ /* In any case, exclude the tuple from unique-checking */
+ tupleIsAlive = false;
+ break;
+ default:
+ elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
+ indexIt = tupleIsAlive = false; /* keep compiler quiet */
+ break;
+ }
+
+ LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+
+ if (!indexIt)
+ continue;
+ }
+ else
+ {
+ /* heap_getnext did the time qual check */
+ tupleIsAlive = true;
+ reltuples += 1;
+ }
+
+ MemoryContextReset(econtext->ecxt_per_tuple_memory);
+
+ /* Set up for predicate or expression evaluation */
+ ExecStoreBufferHeapTuple(heapTuple, slot, hscan->rs_cbuf);
+
+ /*
+ * In a partial index, discard tuples that don't satisfy the
+ * predicate.
+ */
+ if (predicate != NULL)
+ {
+ if (!ExecQual(predicate, econtext))
+ continue;
+ }
+
+ /*
+ * For the current heap tuple, extract all the attributes we use in
+ * this index, and note which are null. This also performs evaluation
+ * of any expressions needed.
+ */
+ FormIndexDatum(indexInfo,
+ slot,
+ estate,
+ values,
+ isnull);
+
+ /*
+ * You'd think we should go ahead and build the index tuple here, but
+ * some index AMs want to do further processing on the data first. So
+ * pass the values[] and isnull[] arrays, instead.
+ */
+
+ if (HeapTupleIsHeapOnly(heapTuple))
+ {
+ /*
+ * For a heap-only tuple, pretend its TID is that of the root. See
+ * src/backend/access/heap/README.HOT for discussion.
+ */
+ ItemPointerData tid;
+ OffsetNumber offnum;
+
+ offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
+
+ /*
+ * If a HOT tuple points to a root that we don't know about,
+ * obtain root items afresh. If that still fails, report it as
+ * corruption.
+ */
+ if (root_offsets[offnum - 1] == InvalidOffsetNumber)
+ {
+ Page page = BufferGetPage(hscan->rs_cbuf);
+
+ LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
+ heap_get_root_tuples(page, root_offsets);
+ LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+ }
+
+ if (!OffsetNumberIsValid(root_offsets[offnum - 1]))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
+ ItemPointerGetBlockNumber(&heapTuple->t_self),
+ offnum,
+ RelationGetRelationName(heapRelation))));
+
+ ItemPointerSet(&tid, ItemPointerGetBlockNumber(&heapTuple->t_self),
+ root_offsets[offnum - 1]);
+
+ /* Call the AM's callback routine to process the tuple */
+ callback(indexRelation, &tid, values, isnull, tupleIsAlive,
+ callback_state);
+ }
+ else
+ {
+ /* Call the AM's callback routine to process the tuple */
+ callback(indexRelation, &heapTuple->t_self, values, isnull,
+ tupleIsAlive, callback_state);
+ }
+ }
+
+ /* Report scan progress one last time. */
+ if (progress)
+ {
+ BlockNumber blks_done;
+
+ if (hscan->rs_base.rs_parallel != NULL)
+ {
+ ParallelBlockTableScanDesc pbscan;
+
+ pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
+ blks_done = pbscan->phs_nblocks;
+ }
+ else
+ blks_done = hscan->rs_nblocks;
+
+ pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
+ blks_done);
+ }
+
+ table_endscan(scan);
+
+ /* we can now forget our snapshot, if set and registered by us */
+ if (need_unregister_snapshot)
+ UnregisterSnapshot(snapshot);
+
+ ExecDropSingleTupleTableSlot(slot);
+
+ FreeExecutorState(estate);
+
+ /* These may have been pointing to the now-gone estate */
+ indexInfo->ii_ExpressionsState = NIL;
+ indexInfo->ii_PredicateState = NULL;
+
+ return reltuples;
+}
+
+static void
+heapam_index_validate_scan(Relation heapRelation,
+ Relation indexRelation,
+ IndexInfo *indexInfo,
+ Snapshot snapshot,
+ ValidateIndexState *state)
+{
+ TableScanDesc scan;
+ HeapScanDesc hscan;
+ HeapTuple heapTuple;
+ Datum values[INDEX_MAX_KEYS];
+ bool isnull[INDEX_MAX_KEYS];
+ ExprState *predicate;
+ TupleTableSlot *slot;
+ EState *estate;
+ ExprContext *econtext;
+ BlockNumber root_blkno = InvalidBlockNumber;
+ OffsetNumber root_offsets[MaxHeapTuplesPerPage];
+ bool in_index[MaxHeapTuplesPerPage];
+ BlockNumber previous_blkno = InvalidBlockNumber;
+
+ /* state variables for the merge */
+ ItemPointer indexcursor = NULL;
+ ItemPointerData decoded;
+ bool tuplesort_empty = false;
+
+ /*
+ * sanity checks
+ */
+ Assert(OidIsValid(indexRelation->rd_rel->relam));
+
+ /*
+ * Need an EState for evaluation of index expressions and partial-index
+ * predicates. Also a slot to hold the current tuple.
+ */
+ estate = CreateExecutorState();
+ econtext = GetPerTupleExprContext(estate);
+ slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation),
+ &TTSOpsHeapTuple);
+
+ /* Arrange for econtext's scan tuple to be the tuple under test */
+ econtext->ecxt_scantuple = slot;
+
+ /* Set up execution state for predicate, if any. */
+ predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
+
+ /*
+ * Prepare for scan of the base relation. We need just those tuples
+ * satisfying the passed-in reference snapshot. We must disable syncscan
+ * here, because it's critical that we read from block zero forward to
+ * match the sorted TIDs.
+ */
+ scan = table_beginscan_strat(heapRelation, /* relation */
+ snapshot, /* snapshot */
+ 0, /* number of keys */
+ NULL, /* scan key */
+ true, /* buffer access strategy OK */
+ false); /* syncscan not OK */
+ hscan = (HeapScanDesc) scan;
+
+ pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
+ hscan->rs_nblocks);
+
+ /*
+ * Scan all tuples matching the snapshot.
+ */
+ while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+ {
+ ItemPointer heapcursor = &heapTuple->t_self;
+ ItemPointerData rootTuple;
+ OffsetNumber root_offnum;
+
+ CHECK_FOR_INTERRUPTS();
+
+ state->htups += 1;
+
+ if ((previous_blkno == InvalidBlockNumber) ||
+ (hscan->rs_cblock != previous_blkno))
+ {
+ pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
+ hscan->rs_cblock);
+ previous_blkno = hscan->rs_cblock;
+ }
+
+ /*
+ * As commented in table_index_build_scan, we should index heap-only
+ * tuples under the TIDs of their root tuples; so when we advance onto
+ * a new heap page, build a map of root item offsets on the page.
+ *
+ * This complicates merging against the tuplesort output: we will
+ * visit the live tuples in order by their offsets, but the root
+ * offsets that we need to compare against the index contents might be
+ * ordered differently. So we might have to "look back" within the
+ * tuplesort output, but only within the current page. We handle that
+ * by keeping a bool array in_index[] showing all the
+ * already-passed-over tuplesort output TIDs of the current page. We
+ * clear that array here, when advancing onto a new heap page.
+ */
+ if (hscan->rs_cblock != root_blkno)
+ {
+ Page page = BufferGetPage(hscan->rs_cbuf);
+
+ LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
+ heap_get_root_tuples(page, root_offsets);
+ LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+
+ memset(in_index, 0, sizeof(in_index));
+
+ root_blkno = hscan->rs_cblock;
+ }
+
+ /* Convert actual tuple TID to root TID */
+ rootTuple = *heapcursor;
+ root_offnum = ItemPointerGetOffsetNumber(heapcursor);
+
+ if (HeapTupleIsHeapOnly(heapTuple))
+ {
+ root_offnum = root_offsets[root_offnum - 1];
+ if (!OffsetNumberIsValid(root_offnum))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
+ ItemPointerGetBlockNumber(heapcursor),
+ ItemPointerGetOffsetNumber(heapcursor),
+ RelationGetRelationName(heapRelation))));
+ ItemPointerSetOffsetNumber(&rootTuple, root_offnum);
+ }
+
+ /*
+ * "merge" by skipping through the index tuples until we find or pass
+ * the current root tuple.
+ */
+ while (!tuplesort_empty &&
+ (!indexcursor ||
+ ItemPointerCompare(indexcursor, &rootTuple) < 0))
+ {
+ Datum ts_val;
+ bool ts_isnull;
+
+ if (indexcursor)
+ {
+ /*
+ * Remember index items seen earlier on the current heap page
+ */
+ if (ItemPointerGetBlockNumber(indexcursor) == root_blkno)
+ in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true;
+ }
+
+ tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true,
+ &ts_val, &ts_isnull, NULL);
+ Assert(tuplesort_empty || !ts_isnull);
+ if (!tuplesort_empty)
+ {
+ itemptr_decode(&decoded, DatumGetInt64(ts_val));
+ indexcursor = &decoded;
+
+ /* If int8 is pass-by-ref, free (encoded) TID Datum memory */
+#ifndef USE_FLOAT8_BYVAL
+ pfree(DatumGetPointer(ts_val));
+#endif
+ }
+ else
+ {
+ /* Be tidy */
+ indexcursor = NULL;
+ }
+ }
+
+ /*
+ * If the tuplesort has overshot *and* we didn't see a match earlier,
+ * then this tuple is missing from the index, so insert it.
+ */
+ if ((tuplesort_empty ||
+ ItemPointerCompare(indexcursor, &rootTuple) > 0) &&
+ !in_index[root_offnum - 1])
+ {
+ MemoryContextReset(econtext->ecxt_per_tuple_memory);
+
+ /* Set up for predicate or expression evaluation */
+ ExecStoreHeapTuple(heapTuple, slot, false);
+
+ /*
+ * In a partial index, discard tuples that don't satisfy the
+ * predicate.
+ */
+ if (predicate != NULL)
+ {
+ if (!ExecQual(predicate, econtext))
+ continue;
+ }
+
+ /*
+ * For the current heap tuple, extract all the attributes we use
+ * in this index, and note which are null. This also performs
+ * evaluation of any expressions needed.
+ */
+ FormIndexDatum(indexInfo,
+ slot,
+ estate,
+ values,
+ isnull);
+
+ /*
+ * You'd think we should go ahead and build the index tuple here,
+ * but some index AMs want to do further processing on the data
+ * first. So pass the values[] and isnull[] arrays, instead.
+ */
+
+ /*
+ * If the tuple is already committed dead, you might think we
+ * could suppress uniqueness checking, but this is no longer true
+ * in the presence of HOT, because the insert is actually a proxy
+ * for a uniqueness check on the whole HOT-chain. That is, the
+ * tuple we have here could be dead because it was already
+ * HOT-updated, and if so the updating transaction will not have
+ * thought it should insert index entries. The index AM will
+ * check the whole HOT-chain and correctly detect a conflict if
+ * there is one.
+ */
+
+ index_insert(indexRelation,
+ values,
+ isnull,
+ &rootTuple,
+ heapRelation,
+ indexInfo->ii_Unique ?
+ UNIQUE_CHECK_YES : UNIQUE_CHECK_NO,
+ false,
+ indexInfo);
+
+ state->tups_inserted += 1;
+ }
+ }
+
+ table_endscan(scan);
+
+ ExecDropSingleTupleTableSlot(slot);
+
+ FreeExecutorState(estate);
+
+ /* These may have been pointing to the now-gone estate */
+ indexInfo->ii_ExpressionsState = NIL;
+ indexInfo->ii_PredicateState = NULL;
+}
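+
+/*
+ * A worked example of the merge loop above (illustrative numbers only):
+ * suppose block 0 holds root tuples at offsets 1, 3 and 5, while the sorted
+ * index TIDs are just (0,1) and (0,5).  For root (0,1) the loop pulls (0,1)
+ * from the tuplesort and stops on the exact match, so nothing is inserted.
+ * For root (0,3) it records (0,1) in in_index[], pulls (0,5) and overshoots;
+ * since in_index[] has no entry for offset 3, (0,3) is inserted into the
+ * index.  For root (0,5) the index cursor already matches, so again nothing
+ * is inserted.
+ */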
+
+/*
+ * Return the number of blocks that have been read by this scan since
+ * starting. This is meant for progress reporting rather than being fully
+ * accurate: in a parallel scan, workers can be concurrently reading blocks
+ * further ahead than what we report.
+ */
+static BlockNumber
+heapam_scan_get_blocks_done(HeapScanDesc hscan)
+{
+ ParallelBlockTableScanDesc bpscan = NULL;
+ BlockNumber startblock;
+ BlockNumber blocks_done;
+
+ if (hscan->rs_base.rs_parallel != NULL)
+ {
+ bpscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
+ startblock = bpscan->phs_startblock;
+ }
+ else
+ startblock = hscan->rs_startblock;
+
+ /*
+ * Might have wrapped around the end of the relation, if startblock was
+ * not zero.
+ */
+ if (hscan->rs_cblock > startblock)
+ blocks_done = hscan->rs_cblock - startblock;
+ else
+ {
+ BlockNumber nblocks;
+
+ nblocks = bpscan != NULL ? bpscan->phs_nblocks : hscan->rs_nblocks;
+ blocks_done = nblocks - startblock +
+ hscan->rs_cblock;
+ }
+
+ return blocks_done;
+}
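+
+/*
+ * For example (hypothetical numbers): a synchronized scan that started at
+ * block 90 of a 100-block relation and has now reached block 5 reports
+ * blocks_done = 100 - 90 + 5 = 15, using the wraparound branch above.
+ */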
+
+
+/* ------------------------------------------------------------------------
+ * Miscellaneous callbacks for the heap AM
+ * ------------------------------------------------------------------------
+ */
+
+/*
+ * Check to see whether the table needs a TOAST table. It does only if
+ * (1) there are any toastable attributes, and (2) the maximum length
+ * of a tuple could exceed TOAST_TUPLE_THRESHOLD. (We don't want to
+ * create a toast table for something like "f1 varchar(20)".)
+ */
+static bool
+heapam_relation_needs_toast_table(Relation rel)
+{
+ int32 data_length = 0;
+ bool maxlength_unknown = false;
+ bool has_toastable_attrs = false;
+ TupleDesc tupdesc = rel->rd_att;
+ int32 tuple_length;
+ int i;
+
+ for (i = 0; i < tupdesc->natts; i++)
+ {
+ Form_pg_attribute att = TupleDescAttr(tupdesc, i);
+
+ if (att->attisdropped)
+ continue;
+ data_length = att_align_nominal(data_length, att->attalign);
+ if (att->attlen > 0)
+ {
+ /* Fixed-length types are never toastable */
+ data_length += att->attlen;
+ }
+ else
+ {
+ int32 maxlen = type_maximum_size(att->atttypid,
+ att->atttypmod);
+
+ if (maxlen < 0)
+ maxlength_unknown = true;
+ else
+ data_length += maxlen;
+ if (att->attstorage != TYPSTORAGE_PLAIN)
+ has_toastable_attrs = true;
+ }
+ }
+ if (!has_toastable_attrs)
+ return false; /* no toastable attributes, nothing to do */
+ if (maxlength_unknown)
+ return true; /* some attr has unbounded length, assume the worst */
+ tuple_length = MAXALIGN(SizeofHeapTupleHeader +
+ BITMAPLEN(tupdesc->natts)) +
+ MAXALIGN(data_length);
+ return (tuple_length > TOAST_TUPLE_THRESHOLD);
+}
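+
+/*
+ * For example, assuming the default 8 kB block size (where
+ * TOAST_TUPLE_THRESHOLD works out to roughly 2 kB): a table with a single
+ * "f1 varchar(20)" column does have a toastable attribute, but its maximum
+ * data length is only a few dozen bytes (20 characters times the encoding's
+ * maximum character width, plus the varlena header), so the worst-case tuple
+ * stays far below the threshold and no TOAST table is created.
+ */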
+
+/*
+ * TOAST tables for heap relations are just heap relations.
+ */
+static Oid
+heapam_relation_toast_am(Relation rel)
+{
+ return rel->rd_rel->relam;
+}
+
+
+/* ------------------------------------------------------------------------
+ * Planner related callbacks for the heap AM
+ * ------------------------------------------------------------------------
+ */
+
+#define HEAP_OVERHEAD_BYTES_PER_TUPLE \
+ (MAXALIGN(SizeofHeapTupleHeader) + sizeof(ItemIdData))
+#define HEAP_USABLE_BYTES_PER_PAGE \
+ (BLCKSZ - SizeOfPageHeaderData)
+
+static void
+heapam_estimate_rel_size(Relation rel, int32 *attr_widths,
+ BlockNumber *pages, double *tuples,
+ double *allvisfrac)
+{
+ table_block_relation_estimate_size(rel, attr_widths, pages,
+ tuples, allvisfrac,
+ HEAP_OVERHEAD_BYTES_PER_TUPLE,
+ HEAP_USABLE_BYTES_PER_PAGE);
+}
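+
+/*
+ * On a typical 64-bit build with 8 kB blocks, the constants above come to
+ * roughly 28 overhead bytes per tuple (the 23-byte tuple header MAXALIGN'd to
+ * 24, plus a 4-byte line pointer) and 8192 - 24 = 8168 usable bytes per page;
+ * table_block_relation_estimate_size() uses these to convert between page and
+ * tuple counts.
+ */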
+
+
+/* ------------------------------------------------------------------------
+ * Executor related callbacks for the heap AM
+ * ------------------------------------------------------------------------
+ */
+
+static bool
+heapam_scan_bitmap_next_block(TableScanDesc scan,
+ TBMIterateResult *tbmres)
+{
+ HeapScanDesc hscan = (HeapScanDesc) scan;
+ BlockNumber page = tbmres->blockno;
+ Buffer buffer;
+ Snapshot snapshot;
+ int ntup;
+
+ hscan->rs_cindex = 0;
+ hscan->rs_ntuples = 0;
+
+ /*
+ * Ignore any claimed entries past what we think is the end of the
+ * relation. It may have been extended after the start of our scan (we
+ * only hold an AccessShareLock, and it could be inserts from this
+ * backend). We don't take this optimization in SERIALIZABLE isolation
+ * though, as we need to examine all invisible tuples reachable by the
+ * index.
+ */
+ if (!IsolationIsSerializable() && page >= hscan->rs_nblocks)
+ return false;
+
+ /*
+ * Acquire pin on the target heap page, trading in any pin we held before.
+ */
+ hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf,
+ scan->rs_rd,
+ page);
+ hscan->rs_cblock = page;
+ buffer = hscan->rs_cbuf;
+ snapshot = scan->rs_snapshot;
+
+ ntup = 0;
+
+ /*
+ * Prune and repair fragmentation for the whole page, if possible.
+ */
+ heap_page_prune_opt(scan->rs_rd, buffer);
+
+ /*
+ * We must hold share lock on the buffer content while examining tuple
+ * visibility. Afterwards, however, the tuples we have found to be
+ * visible are guaranteed good as long as we hold the buffer pin.
+ */
+ LockBuffer(buffer, BUFFER_LOCK_SHARE);
+
+ /*
+ * We need two separate strategies for lossy and non-lossy cases.
+ */
+ if (tbmres->ntuples >= 0)
+ {
+ /*
+ * Bitmap is non-lossy, so we just look through the offsets listed in
+ * tbmres; but we have to follow any HOT chain starting at each such
+ * offset.
+ */
+ int curslot;
+
+ for (curslot = 0; curslot < tbmres->ntuples; curslot++)
+ {
+ OffsetNumber offnum = tbmres->offsets[curslot];
+ ItemPointerData tid;
+ HeapTupleData heapTuple;
+
+ ItemPointerSet(&tid, page, offnum);
+ if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
+ &heapTuple, NULL, true))
+ hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
+ }
+ }
+ else
+ {
+ /*
+ * Bitmap is lossy, so we must examine each line pointer on the page.
+ * But we can ignore HOT chains, since we'll check each tuple anyway.
+ */
+ Page dp = (Page) BufferGetPage(buffer);
+ OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
+ OffsetNumber offnum;
+
+ for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
+ {
+ ItemId lp;
+ HeapTupleData loctup;
+ bool valid;
+
+ lp = PageGetItemId(dp, offnum);
+ if (!ItemIdIsNormal(lp))
+ continue;
+ loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
+ loctup.t_len = ItemIdGetLength(lp);
+ loctup.t_tableOid = scan->rs_rd->rd_id;
+ ItemPointerSet(&loctup.t_self, page, offnum);
+ valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
+ if (valid)
+ {
+ hscan->rs_vistuples[ntup++] = offnum;
+ PredicateLockTID(scan->rs_rd, &loctup.t_self, snapshot,
+ HeapTupleHeaderGetXmin(loctup.t_data));
+ }
+ HeapCheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
+ buffer, snapshot);
+ }
+ }
+
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+
+ Assert(ntup <= MaxHeapTuplesPerPage);
+ hscan->rs_ntuples = ntup;
+
+ return ntup > 0;
+}
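+
+/*
+ * As a concrete illustration of the two cases above: an exact bitmap entry
+ * for this block that lists offsets 1, 4 and 9 probes only those line
+ * pointers (following any HOT chains rooted there), whereas a lossy entry
+ * (tbmres->ntuples == -1) has to examine every line pointer on the page.
+ */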
+
+static bool
+heapam_scan_bitmap_next_tuple(TableScanDesc scan,
+ TBMIterateResult *tbmres,
+ TupleTableSlot *slot)
+{
+ HeapScanDesc hscan = (HeapScanDesc) scan;
+ OffsetNumber targoffset;
+ Page dp;
+ ItemId lp;
+
+ /*
+ * Out of range? If so, nothing more to look at on this page
+ */
+ if (hscan->rs_cindex < 0 || hscan->rs_cindex >= hscan->rs_ntuples)
+ return false;
+
+ targoffset = hscan->rs_vistuples[hscan->rs_cindex];
+ dp = (Page) BufferGetPage(hscan->rs_cbuf);
+ lp = PageGetItemId(dp, targoffset);
+ Assert(ItemIdIsNormal(lp));
+
+ hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
+ hscan->rs_ctup.t_len = ItemIdGetLength(lp);
+ hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
+ ItemPointerSet(&hscan->rs_ctup.t_self, hscan->rs_cblock, targoffset);
+
+ pgstat_count_heap_fetch(scan->rs_rd);
+
+ /*
+ * Set up the result slot to point to this tuple. Note that the slot
+ * acquires a pin on the buffer.
+ */
+ ExecStoreBufferHeapTuple(&hscan->rs_ctup,
+ slot,
+ hscan->rs_cbuf);
+
+ hscan->rs_cindex++;
+
+ return true;
+}
+
+static bool
+heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate)
+{
+ HeapScanDesc hscan = (HeapScanDesc) scan;
+ TsmRoutine *tsm = scanstate->tsmroutine;
+ BlockNumber blockno;
+
+ /* return false immediately if relation is empty */
+ if (hscan->rs_nblocks == 0)
+ return false;
+
+ if (tsm->NextSampleBlock)
+ {
+ blockno = tsm->NextSampleBlock(scanstate, hscan->rs_nblocks);
+ hscan->rs_cblock = blockno;
+ }
+ else
+ {
+ /* scanning table sequentially */
+
+ if (hscan->rs_cblock == InvalidBlockNumber)
+ {
+ Assert(!hscan->rs_inited);
+ blockno = hscan->rs_startblock;
+ }
+ else
+ {
+ Assert(hscan->rs_inited);
+
+ blockno = hscan->rs_cblock + 1;
+
+ if (blockno >= hscan->rs_nblocks)
+ {
+ /* wrap to beginning of rel, might not have started at 0 */
+ blockno = 0;
+ }
+
+ /*
+ * Report our new scan position for synchronization purposes.
+ *
+ * Note: we do this before checking for end of scan so that the
+ * final state of the position hint is back at the start of the
+ * rel. That's not strictly necessary, but otherwise when you run
+ * the same query multiple times the starting position would shift
+ * a little bit backwards on every invocation, which is confusing.
+ * We don't guarantee any specific ordering in general, though.
+ */
+ if (scan->rs_flags & SO_ALLOW_SYNC)
+ ss_report_location(scan->rs_rd, blockno);
+
+ if (blockno == hscan->rs_startblock)
+ {
+ blockno = InvalidBlockNumber;
+ }
+ }
+ }
+
+ if (!BlockNumberIsValid(blockno))
+ {
+ if (BufferIsValid(hscan->rs_cbuf))
+ ReleaseBuffer(hscan->rs_cbuf);
+ hscan->rs_cbuf = InvalidBuffer;
+ hscan->rs_cblock = InvalidBlockNumber;
+ hscan->rs_inited = false;
+
+ return false;
+ }
+
+ heapgetpage(scan, blockno);
+ hscan->rs_inited = true;
+
+ return true;
+}
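+
+/*
+ * Example of the sequential path above (hypothetical sizes): for a 10-block
+ * relation with rs_startblock = 7, the sample scan visits blocks 7, 8, 9, 0,
+ * ... 6; once the position would wrap back around to block 7, blockno is set
+ * to InvalidBlockNumber and the scan ends.
+ */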
+
+static bool
+heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate,
+ TupleTableSlot *slot)
+{
+ HeapScanDesc hscan = (HeapScanDesc) scan;
+ TsmRoutine *tsm = scanstate->tsmroutine;
+ BlockNumber blockno = hscan->rs_cblock;
+ bool pagemode = (scan->rs_flags & SO_ALLOW_PAGEMODE) != 0;
+
+ Page page;
+ bool all_visible;
+ OffsetNumber maxoffset;
+
+ /*
+ * When not using pagemode, we must lock the buffer during tuple
+ * visibility checks.
+ */
+ if (!pagemode)
+ LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
+
+ page = (Page) BufferGetPage(hscan->rs_cbuf);
+ all_visible = PageIsAllVisible(page) &&
+ !scan->rs_snapshot->takenDuringRecovery;
+ maxoffset = PageGetMaxOffsetNumber(page);
+
+ for (;;)
+ {
+ OffsetNumber tupoffset;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* Ask the tablesample method which tuples to check on this page. */
+ tupoffset = tsm->NextSampleTuple(scanstate,
+ blockno,
+ maxoffset);
+
+ if (OffsetNumberIsValid(tupoffset))
+ {
+ ItemId itemid;
+ bool visible;
+ HeapTuple tuple = &(hscan->rs_ctup);
+
+ /* Skip invalid tuple pointers. */
+ itemid = PageGetItemId(page, tupoffset);
+ if (!ItemIdIsNormal(itemid))
+ continue;
+
+ tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
+ tuple->t_len = ItemIdGetLength(itemid);
+ ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
+
+ if (all_visible)
+ visible = true;
+ else
+ visible = SampleHeapTupleVisible(scan, hscan->rs_cbuf,
+ tuple, tupoffset);
+
+ /* in pagemode, heapgetpage did this for us */
+ if (!pagemode)
+ HeapCheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
+ hscan->rs_cbuf, scan->rs_snapshot);
+
+ /* Try next tuple from same page. */
+ if (!visible)
+ continue;
+
+ /* Found visible tuple, return it. */
+ if (!pagemode)
+ LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+
+ ExecStoreBufferHeapTuple(tuple, slot, hscan->rs_cbuf);
+
+ /* Count successfully-fetched tuples as heap fetches */
+ pgstat_count_heap_getnext(scan->rs_rd);
+
+ return true;
+ }
+ else
+ {
+ /*
+ * If we get here, it means we've exhausted the items on this page
+ * and it's time to move to the next.
+ */
+ if (!pagemode)
+ LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
+
+ ExecClearTuple(slot);
+ return false;
+ }
+ }
+
+ /* not reached: the loop above only exits via return */
+ Assert(0);
+}
+
+
+/* ----------------------------------------------------------------------------
+ * Helper functions for the above.
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * Reconstruct and rewrite the given tuple
+ *
+ * We cannot simply copy the tuple as-is, for several reasons:
+ *
+ * 1. We'd like to squeeze out the values of any dropped columns, both
+ * to save space and to ensure we have no corner-case failures. (It's
+ * possible for example that the new table hasn't got a TOAST table
+ * and so is unable to store any large values of dropped cols.)
+ *
+ * 2. The tuple might not even be legal for the new table; this is
+ * currently only known to happen as an after-effect of ALTER TABLE
+ * SET WITHOUT OIDS.
+ *
+ * So, we must reconstruct the tuple from component Datums.
+ */
+static void
+reform_and_rewrite_tuple(HeapTuple tuple,
+ Relation OldHeap, Relation NewHeap,
+ Datum *values, bool *isnull, RewriteState rwstate)
+{
+ TupleDesc oldTupDesc = RelationGetDescr(OldHeap);
+ TupleDesc newTupDesc = RelationGetDescr(NewHeap);
+ HeapTuple copiedTuple;
+ int i;
+
+ heap_deform_tuple(tuple, oldTupDesc, values, isnull);
+
+ /* Be sure to null out any dropped columns */
+ for (i = 0; i < newTupDesc->natts; i++)
+ {
+ if (TupleDescAttr(newTupDesc, i)->attisdropped)
+ isnull[i] = true;
+ }
+
+ copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
+
+ /* The heap rewrite module does the rest */
+ rewrite_heap_tuple(rwstate, tuple, copiedTuple);
+
+ heap_freetuple(copiedTuple);
+}
+
+/*
+ * Check visibility of the tuple.
+ */
+static bool
+SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
+ HeapTuple tuple,
+ OffsetNumber tupoffset)
+{
+ HeapScanDesc hscan = (HeapScanDesc) scan;
+
+ if (scan->rs_flags & SO_ALLOW_PAGEMODE)
+ {
+ /*
+ * In page-at-a-time mode, heapgetpage() already did visibility checks,
+ * so just look at the info it left in rs_vistuples[].
+ *
+ * We use a binary search over the known-sorted array. Note: we could
+ * save some effort if we insisted that NextSampleTuple select tuples
+ * in increasing order, but it's not clear that there would be enough
+ * gain to justify the restriction.
+ */
+ int start = 0,
+ end = hscan->rs_ntuples - 1;
+
+ while (start <= end)
+ {
+ int mid = (start + end) / 2;
+ OffsetNumber curoffset = hscan->rs_vistuples[mid];
+
+ if (tupoffset == curoffset)
+ return true;
+ else if (tupoffset < curoffset)
+ end = mid - 1;
+ else
+ start = mid + 1;
+ }
+
+ return false;
+ }
+ else
+ {
+ /* Otherwise, we have to check the tuple individually. */
+ return HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot,
+ buffer);
+ }
+}
+
+
+/* ------------------------------------------------------------------------
+ * Definition of the heap table access method.
+ * ------------------------------------------------------------------------
+ */
+
+static const TableAmRoutine heapam_methods = {
+ .type = T_TableAmRoutine,
+
+ .slot_callbacks = heapam_slot_callbacks,
+
+ .scan_begin = heap_beginscan,
+ .scan_end = heap_endscan,
+ .scan_rescan = heap_rescan,
+ .scan_getnextslot = heap_getnextslot,
+
+ .scan_set_tidrange = heap_set_tidrange,
+ .scan_getnextslot_tidrange = heap_getnextslot_tidrange,
+
+ .parallelscan_estimate = table_block_parallelscan_estimate,
+ .parallelscan_initialize = table_block_parallelscan_initialize,
+ .parallelscan_reinitialize = table_block_parallelscan_reinitialize,
+
+ .index_fetch_begin = heapam_index_fetch_begin,
+ .index_fetch_reset = heapam_index_fetch_reset,
+ .index_fetch_end = heapam_index_fetch_end,
+ .index_fetch_tuple = heapam_index_fetch_tuple,
+
+ .tuple_insert = heapam_tuple_insert,
+ .tuple_insert_speculative = heapam_tuple_insert_speculative,
+ .tuple_complete_speculative = heapam_tuple_complete_speculative,
+ .multi_insert = heap_multi_insert,
+ .tuple_delete = heapam_tuple_delete,
+ .tuple_update = heapam_tuple_update,
+ .tuple_lock = heapam_tuple_lock,
+
+ .tuple_fetch_row_version = heapam_fetch_row_version,
+ .tuple_get_latest_tid = heap_get_latest_tid,
+ .tuple_tid_valid = heapam_tuple_tid_valid,
+ .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot,
+ .index_delete_tuples = heap_index_delete_tuples,
+
+ .relation_set_new_filenode = heapam_relation_set_new_filenode,
+ .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate,
+ .relation_copy_data = heapam_relation_copy_data,
+ .relation_copy_for_cluster = heapam_relation_copy_for_cluster,
+ .relation_vacuum = heap_vacuum_rel,
+ .scan_analyze_next_block = heapam_scan_analyze_next_block,
+ .scan_analyze_next_tuple = heapam_scan_analyze_next_tuple,
+ .index_build_range_scan = heapam_index_build_range_scan,
+ .index_validate_scan = heapam_index_validate_scan,
+
+ .relation_size = table_block_relation_size,
+ .relation_needs_toast_table = heapam_relation_needs_toast_table,
+ .relation_toast_am = heapam_relation_toast_am,
+ .relation_fetch_toast_slice = heap_fetch_toast_slice,
+
+ .relation_estimate_size = heapam_estimate_rel_size,
+
+ .scan_bitmap_next_block = heapam_scan_bitmap_next_block,
+ .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple,
+ .scan_sample_next_block = heapam_scan_sample_next_block,
+ .scan_sample_next_tuple = heapam_scan_sample_next_tuple
+};
+
+
+const TableAmRoutine *
+GetHeapamTableAmRoutine(void)
+{
+ return &heapam_methods;
+}
+
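+/*
+ * At the SQL level, a table access method is wired to this handler with,
+ * for example (hypothetical AM name, as also exercised by the create_am
+ * regression test):
+ *
+ *   CREATE ACCESS METHOD heap2 TYPE TABLE HANDLER heap_tableam_handler;
+ *   CREATE TABLE t (a int) USING heap2;
+ *
+ * The built-in "heap" row in pg_am references this same function.
+ */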
+Datum
+heap_tableam_handler(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_POINTER(&heapam_methods);
+}