summaryrefslogtreecommitdiffstats
path: root/src/backend/access/table
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/backend/access/table/Makefile21
-rw-r--r--src/backend/access/table/table.c170
-rw-r--r--src/backend/access/table/tableam.c765
-rw-r--r--src/backend/access/table/tableamapi.c158
-rw-r--r--src/backend/access/table/toast_helper.c337
-rw-r--r--src/backend/access/tablesample/Makefile20
-rw-r--r--src/backend/access/tablesample/bernoulli.c229
-rw-r--r--src/backend/access/tablesample/system.c257
-rw-r--r--src/backend/access/tablesample/tablesample.c40
9 files changed, 1997 insertions, 0 deletions
diff --git a/src/backend/access/table/Makefile b/src/backend/access/table/Makefile
new file mode 100644
index 0000000..9aba3ff
--- /dev/null
+++ b/src/backend/access/table/Makefile
@@ -0,0 +1,21 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for access/table
+#
+# IDENTIFICATION
+# src/backend/access/table/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/table
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ table.o \
+ tableam.o \
+ tableamapi.o \
+ toast_helper.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/table/table.c b/src/backend/access/table/table.c
new file mode 100644
index 0000000..545007e
--- /dev/null
+++ b/src/backend/access/table/table.c
@@ -0,0 +1,170 @@
+/*-------------------------------------------------------------------------
+ *
+ * table.c
+ * Generic routines for table related code.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/table/table.c
+ *
+ *
+ * NOTES
+ * This file contains table_ routines that implement access to tables (in
+ * contrast to other relation types like indexes) that are independent of
+ * individual table access methods.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/relation.h"
+#include "access/table.h"
+#include "storage/lmgr.h"
+
+
+/* ----------------
+ * table_open - open a table relation by relation OID
+ *
+ * This is essentially relation_open plus check that the relation
+ * is not an index nor a composite type. (The caller should also
+ * check that it's not a view or foreign table before assuming it has
+ * storage.)
+ * ----------------
+ */
+Relation
+table_open(Oid relationId, LOCKMODE lockmode)
+{
+ Relation r;
+
+ r = relation_open(relationId, lockmode);
+
+ if (r->rd_rel->relkind == RELKIND_INDEX ||
+ r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("\"%s\" is an index",
+ RelationGetRelationName(r))));
+ else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("\"%s\" is a composite type",
+ RelationGetRelationName(r))));
+
+ return r;
+}
+
+
+/* ----------------
+ * try_table_open - open a table relation by relation OID
+ *
+ * Same as table_open, except return NULL instead of failing
+ * if the relation does not exist.
+ * ----------------
+ */
+Relation
+try_table_open(Oid relationId, LOCKMODE lockmode)
+{
+ Relation r;
+
+ r = try_relation_open(relationId, lockmode);
+
+ /* leave if table does not exist */
+ if (!r)
+ return NULL;
+
+ if (r->rd_rel->relkind == RELKIND_INDEX ||
+ r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("\"%s\" is an index",
+ RelationGetRelationName(r))));
+ else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("\"%s\" is a composite type",
+ RelationGetRelationName(r))));
+
+ return r;
+}
+
+/* ----------------
+ * table_openrv - open a table relation specified
+ * by a RangeVar node
+ *
+ * As above, but relation is specified by a RangeVar.
+ * ----------------
+ */
+Relation
+table_openrv(const RangeVar *relation, LOCKMODE lockmode)
+{
+ Relation r;
+
+ r = relation_openrv(relation, lockmode);
+
+ if (r->rd_rel->relkind == RELKIND_INDEX ||
+ r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("\"%s\" is an index",
+ RelationGetRelationName(r))));
+ else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("\"%s\" is a composite type",
+ RelationGetRelationName(r))));
+
+ return r;
+}
+
+/* ----------------
+ * table_openrv_extended - open a table relation specified
+ * by a RangeVar node
+ *
+ * As above, but optionally return NULL instead of failing for
+ * relation-not-found.
+ * ----------------
+ */
+Relation
+table_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
+ bool missing_ok)
+{
+ Relation r;
+
+ r = relation_openrv_extended(relation, lockmode, missing_ok);
+
+ if (r)
+ {
+ if (r->rd_rel->relkind == RELKIND_INDEX ||
+ r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("\"%s\" is an index",
+ RelationGetRelationName(r))));
+ else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("\"%s\" is a composite type",
+ RelationGetRelationName(r))));
+ }
+
+ return r;
+}
+
+/* ----------------
+ * table_close - close a table
+ *
+ * If lockmode is not "NoLock", we then release the specified lock.
+ *
+ * Note that it is often sensible to hold a lock beyond relation_close;
+ * in that case, the lock is released automatically at xact end.
+ * ----------------
+ */
+void
+table_close(Relation relation, LOCKMODE lockmode)
+{
+ relation_close(relation, lockmode);
+}
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
new file mode 100644
index 0000000..5ea5bdd
--- /dev/null
+++ b/src/backend/access/table/tableam.c
@@ -0,0 +1,765 @@
+/*----------------------------------------------------------------------
+ *
+ * tableam.c
+ * Table access method routines too big to be inline functions.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/table/tableam.c
+ *
+ * NOTES
+ * Note that most function in here are documented in tableam.h, rather than
+ * here. That's because there's a lot of inline functions in tableam.h and
+ * it'd be harder to understand if one constantly had to switch between files.
+ *
+ *----------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "access/syncscan.h"
+#include "access/tableam.h"
+#include "access/xact.h"
+#include "optimizer/plancat.h"
+#include "port/pg_bitutils.h"
+#include "storage/bufmgr.h"
+#include "storage/shmem.h"
+#include "storage/smgr.h"
+
+/*
+ * Constants to control the behavior of block allocation to parallel workers
+ * during a parallel seqscan. Technically these values do not need to be
+ * powers of 2, but having them as powers of 2 makes the math more optimal
+ * and makes the ramp-down stepping more even.
+ */
+
+/* The number of I/O chunks we try to break a parallel seqscan down into */
+#define PARALLEL_SEQSCAN_NCHUNKS 2048
+/* Ramp down size of allocations when we've only this number of chunks left */
+#define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS 64
+/* Cap the size of parallel I/O chunks to this number of blocks */
+#define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE 8192
+
+/* GUC variables */
+char *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD;
+bool synchronize_seqscans = true;
+
+
+/* ----------------------------------------------------------------------------
+ * Slot functions.
+ * ----------------------------------------------------------------------------
+ */
+
+const TupleTableSlotOps *
+table_slot_callbacks(Relation relation)
+{
+ const TupleTableSlotOps *tts_cb;
+
+ if (relation->rd_tableam)
+ tts_cb = relation->rd_tableam->slot_callbacks(relation);
+ else if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
+ {
+ /*
+ * Historically FDWs expect to store heap tuples in slots. Continue
+ * handing them one, to make it less painful to adapt FDWs to new
+ * versions. The cost of a heap slot over a virtual slot is pretty
+ * small.
+ */
+ tts_cb = &TTSOpsHeapTuple;
+ }
+ else
+ {
+ /*
+ * These need to be supported, as some parts of the code (like COPY)
+ * need to create slots for such relations too. It seems better to
+ * centralize the knowledge that a heap slot is the right thing in
+ * that case here.
+ */
+ Assert(relation->rd_rel->relkind == RELKIND_VIEW ||
+ relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
+ tts_cb = &TTSOpsVirtual;
+ }
+
+ return tts_cb;
+}
+
+TupleTableSlot *
+table_slot_create(Relation relation, List **reglist)
+{
+ const TupleTableSlotOps *tts_cb;
+ TupleTableSlot *slot;
+
+ tts_cb = table_slot_callbacks(relation);
+ slot = MakeSingleTupleTableSlot(RelationGetDescr(relation), tts_cb);
+
+ if (reglist)
+ *reglist = lappend(*reglist, slot);
+
+ return slot;
+}
+
+
+/* ----------------------------------------------------------------------------
+ * Table scan functions.
+ * ----------------------------------------------------------------------------
+ */
+
+TableScanDesc
+table_beginscan_catalog(Relation relation, int nkeys, struct ScanKeyData *key)
+{
+ uint32 flags = SO_TYPE_SEQSCAN |
+ SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | SO_TEMP_SNAPSHOT;
+ Oid relid = RelationGetRelid(relation);
+ Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
+
+ return relation->rd_tableam->scan_begin(relation, snapshot, nkeys, key,
+ NULL, flags);
+}
+
+void
+table_scan_update_snapshot(TableScanDesc scan, Snapshot snapshot)
+{
+ Assert(IsMVCCSnapshot(snapshot));
+
+ RegisterSnapshot(snapshot);
+ scan->rs_snapshot = snapshot;
+ scan->rs_flags |= SO_TEMP_SNAPSHOT;
+}
+
+
+/* ----------------------------------------------------------------------------
+ * Parallel table scan related functions.
+ * ----------------------------------------------------------------------------
+ */
+
+Size
+table_parallelscan_estimate(Relation rel, Snapshot snapshot)
+{
+ Size sz = 0;
+
+ if (IsMVCCSnapshot(snapshot))
+ sz = add_size(sz, EstimateSnapshotSpace(snapshot));
+ else
+ Assert(snapshot == SnapshotAny);
+
+ sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel));
+
+ return sz;
+}
+
+void
+table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan,
+ Snapshot snapshot)
+{
+ Size snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan);
+
+ pscan->phs_snapshot_off = snapshot_off;
+
+ if (IsMVCCSnapshot(snapshot))
+ {
+ SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off);
+ pscan->phs_snapshot_any = false;
+ }
+ else
+ {
+ Assert(snapshot == SnapshotAny);
+ pscan->phs_snapshot_any = true;
+ }
+}
+
+TableScanDesc
+table_beginscan_parallel(Relation relation, ParallelTableScanDesc parallel_scan)
+{
+ Snapshot snapshot;
+ uint32 flags = SO_TYPE_SEQSCAN |
+ SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;
+
+ Assert(RelationGetRelid(relation) == parallel_scan->phs_relid);
+
+ if (!parallel_scan->phs_snapshot_any)
+ {
+ /* Snapshot was serialized -- restore it */
+ snapshot = RestoreSnapshot((char *) parallel_scan +
+ parallel_scan->phs_snapshot_off);
+ RegisterSnapshot(snapshot);
+ flags |= SO_TEMP_SNAPSHOT;
+ }
+ else
+ {
+ /* SnapshotAny passed by caller (not serialized) */
+ snapshot = SnapshotAny;
+ }
+
+ return relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL,
+ parallel_scan, flags);
+}
+
+
+/* ----------------------------------------------------------------------------
+ * Index scan related functions.
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * To perform that check simply start an index scan, create the necessary
+ * slot, do the heap lookup, and shut everything down again. This could be
+ * optimized, but is unlikely to matter from a performance POV. If there
+ * frequently are live index pointers also matching a unique index key, the
+ * CPU overhead of this routine is unlikely to matter.
+ *
+ * Note that *tid may be modified when we return true if the AM supports
+ * storing multiple row versions reachable via a single index entry (like
+ * heap's HOT).
+ */
+bool
+table_index_fetch_tuple_check(Relation rel,
+ ItemPointer tid,
+ Snapshot snapshot,
+ bool *all_dead)
+{
+ IndexFetchTableData *scan;
+ TupleTableSlot *slot;
+ bool call_again = false;
+ bool found;
+
+ slot = table_slot_create(rel, NULL);
+ scan = table_index_fetch_begin(rel);
+ found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again,
+ all_dead);
+ table_index_fetch_end(scan);
+ ExecDropSingleTupleTableSlot(slot);
+
+ return found;
+}
+
+
+/* ------------------------------------------------------------------------
+ * Functions for non-modifying operations on individual tuples
+ * ------------------------------------------------------------------------
+ */
+
+void
+table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid)
+{
+ Relation rel = scan->rs_rd;
+ const TableAmRoutine *tableam = rel->rd_tableam;
+
+ /*
+ * We don't expect direct calls to table_tuple_get_latest_tid with valid
+ * CheckXidAlive for catalog or regular tables. See detailed comments in
+ * xact.c where these variables are declared.
+ */
+ if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
+ elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding");
+
+ /*
+ * Since this can be called with user-supplied TID, don't trust the input
+ * too much.
+ */
+ if (!tableam->tuple_tid_valid(scan, tid))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("tid (%u, %u) is not valid for relation \"%s\"",
+ ItemPointerGetBlockNumberNoCheck(tid),
+ ItemPointerGetOffsetNumberNoCheck(tid),
+ RelationGetRelationName(rel))));
+
+ tableam->tuple_get_latest_tid(scan, tid);
+}
+
+
+/* ----------------------------------------------------------------------------
+ * Functions to make modifications a bit simpler.
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * simple_table_tuple_insert - insert a tuple
+ *
+ * Currently, this routine differs from table_tuple_insert only in supplying a
+ * default command ID and not allowing access to the speedup options.
+ */
+void
+simple_table_tuple_insert(Relation rel, TupleTableSlot *slot)
+{
+ table_tuple_insert(rel, slot, GetCurrentCommandId(true), 0, NULL);
+}
+
+/*
+ * simple_table_tuple_delete - delete a tuple
+ *
+ * This routine may be used to delete a tuple when concurrent updates of
+ * the target tuple are not expected (for example, because we have a lock
+ * on the relation associated with the tuple). Any failure is reported
+ * via ereport().
+ */
+void
+simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot)
+{
+ TM_Result result;
+ TM_FailureData tmfd;
+
+ result = table_tuple_delete(rel, tid,
+ GetCurrentCommandId(true),
+ snapshot, InvalidSnapshot,
+ true /* wait for commit */ ,
+ &tmfd, false /* changingPart */ );
+
+ switch (result)
+ {
+ case TM_SelfModified:
+ /* Tuple was already updated in current command? */
+ elog(ERROR, "tuple already updated by self");
+ break;
+
+ case TM_Ok:
+ /* done successfully */
+ break;
+
+ case TM_Updated:
+ elog(ERROR, "tuple concurrently updated");
+ break;
+
+ case TM_Deleted:
+ elog(ERROR, "tuple concurrently deleted");
+ break;
+
+ default:
+ elog(ERROR, "unrecognized table_tuple_delete status: %u", result);
+ break;
+ }
+}
+
+/*
+ * simple_table_tuple_update - replace a tuple
+ *
+ * This routine may be used to update a tuple when concurrent updates of
+ * the target tuple are not expected (for example, because we have a lock
+ * on the relation associated with the tuple). Any failure is reported
+ * via ereport().
+ */
+void
+simple_table_tuple_update(Relation rel, ItemPointer otid,
+ TupleTableSlot *slot,
+ Snapshot snapshot,
+ bool *update_indexes)
+{
+ TM_Result result;
+ TM_FailureData tmfd;
+ LockTupleMode lockmode;
+
+ result = table_tuple_update(rel, otid, slot,
+ GetCurrentCommandId(true),
+ snapshot, InvalidSnapshot,
+ true /* wait for commit */ ,
+ &tmfd, &lockmode, update_indexes);
+
+ switch (result)
+ {
+ case TM_SelfModified:
+ /* Tuple was already updated in current command? */
+ elog(ERROR, "tuple already updated by self");
+ break;
+
+ case TM_Ok:
+ /* done successfully */
+ break;
+
+ case TM_Updated:
+ elog(ERROR, "tuple concurrently updated");
+ break;
+
+ case TM_Deleted:
+ elog(ERROR, "tuple concurrently deleted");
+ break;
+
+ default:
+ elog(ERROR, "unrecognized table_tuple_update status: %u", result);
+ break;
+ }
+
+}
+
+
+/* ----------------------------------------------------------------------------
+ * Helper functions to implement parallel scans for block oriented AMs.
+ * ----------------------------------------------------------------------------
+ */
+
+Size
+table_block_parallelscan_estimate(Relation rel)
+{
+ return sizeof(ParallelBlockTableScanDescData);
+}
+
+Size
+table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
+{
+ ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
+
+ bpscan->base.phs_relid = RelationGetRelid(rel);
+ bpscan->phs_nblocks = RelationGetNumberOfBlocks(rel);
+ /* compare phs_syncscan initialization to similar logic in initscan */
+ bpscan->base.phs_syncscan = synchronize_seqscans &&
+ !RelationUsesLocalBuffers(rel) &&
+ bpscan->phs_nblocks > NBuffers / 4;
+ SpinLockInit(&bpscan->phs_mutex);
+ bpscan->phs_startblock = InvalidBlockNumber;
+ pg_atomic_init_u64(&bpscan->phs_nallocated, 0);
+
+ return sizeof(ParallelBlockTableScanDescData);
+}
+
+void
+table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
+{
+ ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
+
+ pg_atomic_write_u64(&bpscan->phs_nallocated, 0);
+}
+
+/*
+ * find and set the scan's startblock
+ *
+ * Determine where the parallel seq scan should start. This function may be
+ * called many times, once by each parallel worker. We must be careful only
+ * to set the startblock once.
+ */
+void
+table_block_parallelscan_startblock_init(Relation rel,
+ ParallelBlockTableScanWorker pbscanwork,
+ ParallelBlockTableScanDesc pbscan)
+{
+ BlockNumber sync_startpage = InvalidBlockNumber;
+
+ /* Reset the state we use for controlling allocation size. */
+ memset(pbscanwork, 0, sizeof(*pbscanwork));
+
+ StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE,
+ "pg_nextpower2_32 may be too small for non-standard BlockNumber width");
+
+ /*
+ * We determine the chunk size based on the size of the relation. First we
+ * split the relation into PARALLEL_SEQSCAN_NCHUNKS chunks but we then
+ * take the next highest power of 2 number of the chunk size. This means
+ * we split the relation into somewhere between PARALLEL_SEQSCAN_NCHUNKS
+ * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
+ */
+ pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(pbscan->phs_nblocks /
+ PARALLEL_SEQSCAN_NCHUNKS, 1));
+
+ /*
+ * Ensure we don't go over the maximum chunk size with larger tables. This
+ * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
+ * tables. Too large a chunk size has been shown to be detrimental to
+ * synchronous scan performance.
+ */
+ pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
+ PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
+
+retry:
+ /* Grab the spinlock. */
+ SpinLockAcquire(&pbscan->phs_mutex);
+
+ /*
+ * If the scan's startblock has not yet been initialized, we must do so
+ * now. If this is not a synchronized scan, we just start at block 0, but
+ * if it is a synchronized scan, we must get the starting position from
+ * the synchronized scan machinery. We can't hold the spinlock while
+ * doing that, though, so release the spinlock, get the information we
+ * need, and retry. If nobody else has initialized the scan in the
+ * meantime, we'll fill in the value we fetched on the second time
+ * through.
+ */
+ if (pbscan->phs_startblock == InvalidBlockNumber)
+ {
+ if (!pbscan->base.phs_syncscan)
+ pbscan->phs_startblock = 0;
+ else if (sync_startpage != InvalidBlockNumber)
+ pbscan->phs_startblock = sync_startpage;
+ else
+ {
+ SpinLockRelease(&pbscan->phs_mutex);
+ sync_startpage = ss_get_location(rel, pbscan->phs_nblocks);
+ goto retry;
+ }
+ }
+ SpinLockRelease(&pbscan->phs_mutex);
+}
+
+/*
+ * get the next page to scan
+ *
+ * Get the next page to scan. Even if there are no pages left to scan,
+ * another backend could have grabbed a page to scan and not yet finished
+ * looking at it, so it doesn't follow that the scan is done when the first
+ * backend gets an InvalidBlockNumber return.
+ */
+BlockNumber
+table_block_parallelscan_nextpage(Relation rel,
+ ParallelBlockTableScanWorker pbscanwork,
+ ParallelBlockTableScanDesc pbscan)
+{
+ BlockNumber page;
+ uint64 nallocated;
+
+ /*
+ * The logic below allocates block numbers out to parallel workers in a
+ * way that each worker will receive a set of consecutive block numbers to
+ * scan. Earlier versions of this would allocate the next highest block
+ * number to the next worker to call this function. This would generally
+ * result in workers never receiving consecutive block numbers. Some
+ * operating systems would not detect the sequential I/O pattern due to
+ * each backend being a different process which could result in poor
+ * performance due to inefficient or no readahead. To work around this
+ * issue, we now allocate a range of block numbers for each worker and
+ * when they come back for another block, we give them the next one in
+ * that range until the range is complete. When the worker completes the
+ * range of blocks we then allocate another range for it and return the
+ * first block number from that range.
+ *
+ * Here we name these ranges of blocks "chunks". The initial size of
+ * these chunks is determined in table_block_parallelscan_startblock_init
+ * based on the size of the relation. Towards the end of the scan, we
+ * start making reductions in the size of the chunks in order to attempt
+ * to divide the remaining work over all the workers as evenly as
+ * possible.
+ *
+ * Here pbscanwork is local worker memory. phsw_chunk_remaining tracks
+ * the number of blocks remaining in the chunk. When that reaches 0 then
+ * we must allocate a new chunk for the worker.
+ *
+ * phs_nallocated tracks how many blocks have been allocated to workers
+ * already. When phs_nallocated >= rs_nblocks, all blocks have been
+ * allocated.
+ *
+ * Because we use an atomic fetch-and-add to fetch the current value, the
+ * phs_nallocated counter will exceed rs_nblocks, because workers will
+ * still increment the value, when they try to allocate the next block but
+ * all blocks have been allocated already. The counter must be 64 bits
+ * wide because of that, to avoid wrapping around when rs_nblocks is close
+ * to 2^32.
+ *
+ * The actual block to return is calculated by adding the counter to the
+ * starting block number, modulo nblocks.
+ */
+
+ /*
+ * First check if we have any remaining blocks in a previous chunk for
+ * this worker. We must consume all of the blocks from that before we
+ * allocate a new chunk to the worker.
+ */
+ if (pbscanwork->phsw_chunk_remaining > 0)
+ {
+ /*
+ * Give them the next block in the range and update the remaining
+ * number of blocks.
+ */
+ nallocated = ++pbscanwork->phsw_nallocated;
+ pbscanwork->phsw_chunk_remaining--;
+ }
+ else
+ {
+ /*
+ * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
+ * remaining in the scan, we half the chunk size. Since we reduce the
+ * chunk size here, we'll hit this again after doing
+ * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size. After a few
+ * iterations of this, we'll end up doing the last few blocks with the
+ * chunk size set to 1.
+ */
+ if (pbscanwork->phsw_chunk_size > 1 &&
+ pbscanwork->phsw_nallocated > pbscan->phs_nblocks -
+ (pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
+ pbscanwork->phsw_chunk_size >>= 1;
+
+ nallocated = pbscanwork->phsw_nallocated =
+ pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
+ pbscanwork->phsw_chunk_size);
+
+ /*
+ * Set the remaining number of blocks in this chunk so that subsequent
+ * calls from this worker continue on with this chunk until it's done.
+ */
+ pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1;
+ }
+
+ if (nallocated >= pbscan->phs_nblocks)
+ page = InvalidBlockNumber; /* all blocks have been allocated */
+ else
+ page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks;
+
+ /*
+ * Report scan location. Normally, we report the current page number.
+ * When we reach the end of the scan, though, we report the starting page,
+ * not the ending page, just so the starting positions for later scans
+ * doesn't slew backwards. We only report the position at the end of the
+ * scan once, though: subsequent callers will report nothing.
+ */
+ if (pbscan->base.phs_syncscan)
+ {
+ if (page != InvalidBlockNumber)
+ ss_report_location(rel, page);
+ else if (nallocated == pbscan->phs_nblocks)
+ ss_report_location(rel, pbscan->phs_startblock);
+ }
+
+ return page;
+}
+
+/* ----------------------------------------------------------------------------
+ * Helper functions to implement relation sizing for block oriented AMs.
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * table_block_relation_size
+ *
+ * If a table AM uses the various relation forks as the sole place where data
+ * is stored, and if it uses them in the expected manner (e.g. the actual data
+ * is in the main fork rather than some other), it can use this implementation
+ * of the relation_size callback rather than implementing its own.
+ */
+uint64
+table_block_relation_size(Relation rel, ForkNumber forkNumber)
+{
+ uint64 nblocks = 0;
+
+ /* Open it at the smgr level if not already done */
+ RelationOpenSmgr(rel);
+
+ /* InvalidForkNumber indicates returning the size for all forks */
+ if (forkNumber == InvalidForkNumber)
+ {
+ for (int i = 0; i < MAX_FORKNUM; i++)
+ nblocks += smgrnblocks(rel->rd_smgr, i);
+ }
+ else
+ nblocks = smgrnblocks(rel->rd_smgr, forkNumber);
+
+ return nblocks * BLCKSZ;
+}
+
+/*
+ * table_block_relation_estimate_size
+ *
+ * This function can't be directly used as the implementation of the
+ * relation_estimate_size callback, because it has a few additional parameters.
+ * Instead, it is intended to be used as a helper function; the caller can
+ * pass through the arguments to its relation_estimate_size function plus the
+ * additional values required here.
+ *
+ * overhead_bytes_per_tuple should contain the approximate number of bytes
+ * of storage required to store a tuple above and beyond what is required for
+ * the tuple data proper. Typically, this would include things like the
+ * size of the tuple header and item pointer. This is only used for query
+ * planning, so a table AM where the value is not constant could choose to
+ * pass a "best guess".
+ *
+ * usable_bytes_per_page should contain the approximate number of bytes per
+ * page usable for tuple data, excluding the page header and any anticipated
+ * special space.
+ */
+void
+table_block_relation_estimate_size(Relation rel, int32 *attr_widths,
+ BlockNumber *pages, double *tuples,
+ double *allvisfrac,
+ Size overhead_bytes_per_tuple,
+ Size usable_bytes_per_page)
+{
+ BlockNumber curpages;
+ BlockNumber relpages;
+ double reltuples;
+ BlockNumber relallvisible;
+ double density;
+
+ /* it should have storage, so we can call the smgr */
+ curpages = RelationGetNumberOfBlocks(rel);
+
+ /* coerce values in pg_class to more desirable types */
+ relpages = (BlockNumber) rel->rd_rel->relpages;
+ reltuples = (double) rel->rd_rel->reltuples;
+ relallvisible = (BlockNumber) rel->rd_rel->relallvisible;
+
+ /*
+ * HACK: if the relation has never yet been vacuumed, use a minimum size
+ * estimate of 10 pages. The idea here is to avoid assuming a
+ * newly-created table is really small, even if it currently is, because
+ * that may not be true once some data gets loaded into it. Once a vacuum
+ * or analyze cycle has been done on it, it's more reasonable to believe
+ * the size is somewhat stable.
+ *
+ * (Note that this is only an issue if the plan gets cached and used again
+ * after the table has been filled. What we're trying to avoid is using a
+ * nestloop-type plan on a table that has grown substantially since the
+ * plan was made. Normally, autovacuum/autoanalyze will occur once enough
+ * inserts have happened and cause cached-plan invalidation; but that
+ * doesn't happen instantaneously, and it won't happen at all for cases
+ * such as temporary tables.)
+ *
+ * We test "never vacuumed" by seeing whether reltuples < 0.
+ *
+ * If the table has inheritance children, we don't apply this heuristic.
+ * Totally empty parent tables are quite common, so we should be willing
+ * to believe that they are empty.
+ */
+ if (curpages < 10 &&
+ reltuples < 0 &&
+ !rel->rd_rel->relhassubclass)
+ curpages = 10;
+
+ /* report estimated # pages */
+ *pages = curpages;
+ /* quick exit if rel is clearly empty */
+ if (curpages == 0)
+ {
+ *tuples = 0;
+ *allvisfrac = 0;
+ return;
+ }
+
+ /* estimate number of tuples from previous tuple density */
+ if (reltuples >= 0 && relpages > 0)
+ density = reltuples / (double) relpages;
+ else
+ {
+ /*
+ * When we have no data because the relation was never yet vacuumed,
+ * estimate tuple width from attribute datatypes. We assume here that
+ * the pages are completely full, which is OK for tables but is
+ * probably an overestimate for indexes. Fortunately
+ * get_relation_info() can clamp the overestimate to the parent
+ * table's size.
+ *
+ * Note: this code intentionally disregards alignment considerations,
+ * because (a) that would be gilding the lily considering how crude
+ * the estimate is, (b) it creates platform dependencies in the
+ * default plans which are kind of a headache for regression testing,
+ * and (c) different table AMs might use different padding schemes.
+ */
+ int32 tuple_width;
+
+ tuple_width = get_rel_data_width(rel, attr_widths);
+ tuple_width += overhead_bytes_per_tuple;
+ /* note: integer division is intentional here */
+ density = usable_bytes_per_page / tuple_width;
+ }
+ *tuples = rint(density * (double) curpages);
+
+ /*
+ * We use relallvisible as-is, rather than scaling it up like we do for
+ * the pages and tuples counts, on the theory that any pages added since
+ * the last VACUUM are most likely not marked all-visible. But costsize.c
+ * wants it converted to a fraction.
+ */
+ if (relallvisible == 0 || curpages <= 0)
+ *allvisfrac = 0;
+ else if ((double) relallvisible >= curpages)
+ *allvisfrac = 1;
+ else
+ *allvisfrac = (double) relallvisible / curpages;
+}
diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c
new file mode 100644
index 0000000..325ecdc
--- /dev/null
+++ b/src/backend/access/table/tableamapi.c
@@ -0,0 +1,158 @@
+/*----------------------------------------------------------------------
+ *
+ * tableamapi.c
+ * Support routines for API for Postgres table access methods
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/table/tableamapi.c
+ *----------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "access/tableam.h"
+#include "access/xact.h"
+#include "catalog/pg_am.h"
+#include "catalog/pg_proc.h"
+#include "commands/defrem.h"
+#include "miscadmin.h"
+#include "utils/fmgroids.h"
+#include "utils/memutils.h"
+#include "utils/syscache.h"
+
+
+/*
+ * GetTableAmRoutine
+ * Call the specified access method handler routine to get its
+ * TableAmRoutine struct, which will be palloc'd in the caller's
+ * memory context.
+ */
+const TableAmRoutine *
+GetTableAmRoutine(Oid amhandler)
+{
+ Datum datum;
+ const TableAmRoutine *routine;
+
+ datum = OidFunctionCall0(amhandler);
+ routine = (TableAmRoutine *) DatumGetPointer(datum);
+
+ if (routine == NULL || !IsA(routine, TableAmRoutine))
+ elog(ERROR, "table access method handler %u did not return a TableAmRoutine struct",
+ amhandler);
+
+ /*
+ * Assert that all required callbacks are present. That makes it a bit
+ * easier to keep AMs up to date, e.g. when forward porting them to a new
+ * major version.
+ */
+ Assert(routine->scan_begin != NULL);
+ Assert(routine->scan_end != NULL);
+ Assert(routine->scan_rescan != NULL);
+ Assert(routine->scan_getnextslot != NULL);
+
+ Assert(routine->parallelscan_estimate != NULL);
+ Assert(routine->parallelscan_initialize != NULL);
+ Assert(routine->parallelscan_reinitialize != NULL);
+
+ Assert(routine->index_fetch_begin != NULL);
+ Assert(routine->index_fetch_reset != NULL);
+ Assert(routine->index_fetch_end != NULL);
+ Assert(routine->index_fetch_tuple != NULL);
+
+ Assert(routine->tuple_fetch_row_version != NULL);
+ Assert(routine->tuple_tid_valid != NULL);
+ Assert(routine->tuple_get_latest_tid != NULL);
+ Assert(routine->tuple_satisfies_snapshot != NULL);
+ Assert(routine->index_delete_tuples != NULL);
+
+ Assert(routine->tuple_insert != NULL);
+
+ /*
+ * Could be made optional, but would require throwing error during
+ * parse-analysis.
+ */
+ Assert(routine->tuple_insert_speculative != NULL);
+ Assert(routine->tuple_complete_speculative != NULL);
+
+ Assert(routine->multi_insert != NULL);
+ Assert(routine->tuple_delete != NULL);
+ Assert(routine->tuple_update != NULL);
+ Assert(routine->tuple_lock != NULL);
+
+ Assert(routine->relation_set_new_filenode != NULL);
+ Assert(routine->relation_nontransactional_truncate != NULL);
+ Assert(routine->relation_copy_data != NULL);
+ Assert(routine->relation_copy_for_cluster != NULL);
+ Assert(routine->relation_vacuum != NULL);
+ Assert(routine->scan_analyze_next_block != NULL);
+ Assert(routine->scan_analyze_next_tuple != NULL);
+ Assert(routine->index_build_range_scan != NULL);
+ Assert(routine->index_validate_scan != NULL);
+
+ Assert(routine->relation_size != NULL);
+ Assert(routine->relation_needs_toast_table != NULL);
+
+ Assert(routine->relation_estimate_size != NULL);
+
+ /* optional, but one callback implies presence of the other */
+ Assert((routine->scan_bitmap_next_block == NULL) ==
+ (routine->scan_bitmap_next_tuple == NULL));
+ Assert(routine->scan_sample_next_block != NULL);
+ Assert(routine->scan_sample_next_tuple != NULL);
+
+ return routine;
+}
+
+/* check_hook: validate new default_table_access_method */
+bool
+check_default_table_access_method(char **newval, void **extra, GucSource source)
+{
+ if (**newval == '\0')
+ {
+ GUC_check_errdetail("%s cannot be empty.",
+ "default_table_access_method");
+ return false;
+ }
+
+ if (strlen(*newval) >= NAMEDATALEN)
+ {
+ GUC_check_errdetail("%s is too long (maximum %d characters).",
+ "default_table_access_method", NAMEDATALEN - 1);
+ return false;
+ }
+
+ /*
+ * If we aren't inside a transaction, or not connected to a database, we
+ * cannot do the catalog access necessary to verify the method. Must
+ * accept the value on faith.
+ */
+ if (IsTransactionState() && MyDatabaseId != InvalidOid)
+ {
+ if (!OidIsValid(get_table_am_oid(*newval, true)))
+ {
+ /*
+ * When source == PGC_S_TEST, don't throw a hard error for a
+ * nonexistent table access method, only a NOTICE. See comments in
+ * guc.h.
+ */
+ if (source == PGC_S_TEST)
+ {
+ ereport(NOTICE,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("table access method \"%s\" does not exist",
+ *newval)));
+ }
+ else
+ {
+ GUC_check_errdetail("Table access method \"%s\" does not exist.",
+ *newval);
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
diff --git a/src/backend/access/table/toast_helper.c b/src/backend/access/table/toast_helper.c
new file mode 100644
index 0000000..013236b
--- /dev/null
+++ b/src/backend/access/table/toast_helper.c
@@ -0,0 +1,337 @@
+/*-------------------------------------------------------------------------
+ *
+ * toast_helper.c
+ * Helper functions for table AMs implementing compressed or
+ * out-of-line storage of varlena attributes.
+ *
+ * Copyright (c) 2000-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/access/table/toast_helper.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/detoast.h"
+#include "access/table.h"
+#include "access/toast_helper.h"
+#include "access/toast_internals.h"
+#include "catalog/pg_type_d.h"
+
+
+/*
+ * Prepare to TOAST a tuple.
+ *
+ * tupleDesc, toast_values, and toast_isnull are required parameters; they
+ * provide the necessary details about the tuple to be toasted.
+ *
+ * toast_oldvalues and toast_oldisnull should be NULL for a newly-inserted
+ * tuple; for an update, they should describe the existing tuple.
+ *
+ * All of these arrays should have a length equal to tupleDesc->natts.
+ *
+ * On return, toast_flags and toast_attr will have been initialized.
+ * toast_flags is just a single uint8, but toast_attr is a caller-provided
+ * array with a length equal to tupleDesc->natts. The caller need not
+ * perform any initialization of the array before calling this function.
+ */
+void
+toast_tuple_init(ToastTupleContext *ttc)
+{
+ TupleDesc tupleDesc = ttc->ttc_rel->rd_att;
+ int numAttrs = tupleDesc->natts;
+ int i;
+
+ ttc->ttc_flags = 0;
+
+ for (i = 0; i < numAttrs; i++)
+ {
+ Form_pg_attribute att = TupleDescAttr(tupleDesc, i);
+ struct varlena *old_value;
+ struct varlena *new_value;
+
+ ttc->ttc_attr[i].tai_colflags = 0;
+ ttc->ttc_attr[i].tai_oldexternal = NULL;
+ ttc->ttc_attr[i].tai_compression = att->attcompression;
+
+ if (ttc->ttc_oldvalues != NULL)
+ {
+ /*
+ * For UPDATE get the old and new values of this attribute
+ */
+ old_value =
+ (struct varlena *) DatumGetPointer(ttc->ttc_oldvalues[i]);
+ new_value =
+ (struct varlena *) DatumGetPointer(ttc->ttc_values[i]);
+
+ /*
+ * If the old value is stored on disk, check if it has changed so
+ * we have to delete it later.
+ */
+ if (att->attlen == -1 && !ttc->ttc_oldisnull[i] &&
+ VARATT_IS_EXTERNAL_ONDISK(old_value))
+ {
+ if (ttc->ttc_isnull[i] ||
+ !VARATT_IS_EXTERNAL_ONDISK(new_value) ||
+ memcmp((char *) old_value, (char *) new_value,
+ VARSIZE_EXTERNAL(old_value)) != 0)
+ {
+ /*
+ * The old external stored value isn't needed any more
+ * after the update
+ */
+ ttc->ttc_attr[i].tai_colflags |= TOASTCOL_NEEDS_DELETE_OLD;
+ ttc->ttc_flags |= TOAST_NEEDS_DELETE_OLD;
+ }
+ else
+ {
+ /*
+ * This attribute isn't changed by this update so we reuse
+ * the original reference to the old value in the new
+ * tuple.
+ */
+ ttc->ttc_attr[i].tai_colflags |= TOASTCOL_IGNORE;
+ continue;
+ }
+ }
+ }
+ else
+ {
+ /*
+ * For INSERT simply get the new value
+ */
+ new_value = (struct varlena *) DatumGetPointer(ttc->ttc_values[i]);
+ }
+
+ /*
+ * Handle NULL attributes
+ */
+ if (ttc->ttc_isnull[i])
+ {
+ ttc->ttc_attr[i].tai_colflags |= TOASTCOL_IGNORE;
+ ttc->ttc_flags |= TOAST_HAS_NULLS;
+ continue;
+ }
+
+ /*
+ * Now look at varlena attributes
+ */
+ if (att->attlen == -1)
+ {
+ /*
+ * If the table's attribute says PLAIN always, force it so.
+ */
+ if (att->attstorage == TYPSTORAGE_PLAIN)
+ ttc->ttc_attr[i].tai_colflags |= TOASTCOL_IGNORE;
+
+ /*
+ * We took care of UPDATE above, so any external value we find
+ * still in the tuple must be someone else's that we cannot reuse
+ * (this includes the case of an out-of-line in-memory datum).
+ * Fetch it back (without decompression, unless we are forcing
+ * PLAIN storage). If necessary, we'll push it out as a new
+ * external value below.
+ */
+ if (VARATT_IS_EXTERNAL(new_value))
+ {
+ ttc->ttc_attr[i].tai_oldexternal = new_value;
+ if (att->attstorage == TYPSTORAGE_PLAIN)
+ new_value = detoast_attr(new_value);
+ else
+ new_value = detoast_external_attr(new_value);
+ ttc->ttc_values[i] = PointerGetDatum(new_value);
+ ttc->ttc_attr[i].tai_colflags |= TOASTCOL_NEEDS_FREE;
+ ttc->ttc_flags |= (TOAST_NEEDS_CHANGE | TOAST_NEEDS_FREE);
+ }
+
+ /*
+ * Remember the size of this attribute
+ */
+ ttc->ttc_attr[i].tai_size = VARSIZE_ANY(new_value);
+ }
+ else
+ {
+ /*
+ * Not a varlena attribute, plain storage always
+ */
+ ttc->ttc_attr[i].tai_colflags |= TOASTCOL_IGNORE;
+ }
+ }
+}
+
+/*
+ * Find the largest varlena attribute that satisfies certain criteria.
+ *
+ * The relevant column must not be marked TOASTCOL_IGNORE, and if the
+ * for_compression flag is passed as true, it must also not be marked
+ * TOASTCOL_INCOMPRESSIBLE.
+ *
+ * The column must have attstorage EXTERNAL or EXTENDED if check_main is
+ * false, and must have attstorage MAIN if check_main is true.
+ *
+ * The column must have a minimum size of MAXALIGN(TOAST_POINTER_SIZE);
+ * if not, no benefit is to be expected by compressing it.
+ *
+ * The return value is the index of the biggest suitable column, or
+ * -1 if there is none.
+ */
+int
+toast_tuple_find_biggest_attribute(ToastTupleContext *ttc,
+ bool for_compression, bool check_main)
+{
+ TupleDesc tupleDesc = ttc->ttc_rel->rd_att;
+ int numAttrs = tupleDesc->natts;
+ int biggest_attno = -1;
+ int32 biggest_size = MAXALIGN(TOAST_POINTER_SIZE);
+ int32 skip_colflags = TOASTCOL_IGNORE;
+ int i;
+
+ if (for_compression)
+ skip_colflags |= TOASTCOL_INCOMPRESSIBLE;
+
+ for (i = 0; i < numAttrs; i++)
+ {
+ Form_pg_attribute att = TupleDescAttr(tupleDesc, i);
+
+ if ((ttc->ttc_attr[i].tai_colflags & skip_colflags) != 0)
+ continue;
+ if (VARATT_IS_EXTERNAL(DatumGetPointer(ttc->ttc_values[i])))
+ continue; /* can't happen, toast_action would be PLAIN */
+ if (for_compression &&
+ VARATT_IS_COMPRESSED(DatumGetPointer(ttc->ttc_values[i])))
+ continue;
+ if (check_main && att->attstorage != TYPSTORAGE_MAIN)
+ continue;
+ if (!check_main && att->attstorage != TYPSTORAGE_EXTENDED &&
+ att->attstorage != TYPSTORAGE_EXTERNAL)
+ continue;
+
+ if (ttc->ttc_attr[i].tai_size > biggest_size)
+ {
+ biggest_attno = i;
+ biggest_size = ttc->ttc_attr[i].tai_size;
+ }
+ }
+
+ return biggest_attno;
+}
+
+/*
+ * Try compression for an attribute.
+ *
+ * If we find that the attribute is not compressible, mark it so.
+ */
+void
+toast_tuple_try_compression(ToastTupleContext *ttc, int attribute)
+{
+ Datum *value = &ttc->ttc_values[attribute];
+ Datum new_value;
+ ToastAttrInfo *attr = &ttc->ttc_attr[attribute];
+
+ new_value = toast_compress_datum(*value, attr->tai_compression);
+
+ if (DatumGetPointer(new_value) != NULL)
+ {
+ /* successful compression */
+ if ((attr->tai_colflags & TOASTCOL_NEEDS_FREE) != 0)
+ pfree(DatumGetPointer(*value));
+ *value = new_value;
+ attr->tai_colflags |= TOASTCOL_NEEDS_FREE;
+ attr->tai_size = VARSIZE(DatumGetPointer(*value));
+ ttc->ttc_flags |= (TOAST_NEEDS_CHANGE | TOAST_NEEDS_FREE);
+ }
+ else
+ {
+ /* incompressible, ignore on subsequent compression passes */
+ attr->tai_colflags |= TOASTCOL_INCOMPRESSIBLE;
+ }
+}
+
+/*
+ * Move an attribute to external storage.
+ */
+void
+toast_tuple_externalize(ToastTupleContext *ttc, int attribute, int options)
+{
+ Datum *value = &ttc->ttc_values[attribute];
+ Datum old_value = *value;
+ ToastAttrInfo *attr = &ttc->ttc_attr[attribute];
+
+ attr->tai_colflags |= TOASTCOL_IGNORE;
+ *value = toast_save_datum(ttc->ttc_rel, old_value, attr->tai_oldexternal,
+ options);
+ if ((attr->tai_colflags & TOASTCOL_NEEDS_FREE) != 0)
+ pfree(DatumGetPointer(old_value));
+ attr->tai_colflags |= TOASTCOL_NEEDS_FREE;
+ ttc->ttc_flags |= (TOAST_NEEDS_CHANGE | TOAST_NEEDS_FREE);
+}
+
+/*
+ * Perform appropriate cleanup after one tuple has been subjected to TOAST.
+ */
+void
+toast_tuple_cleanup(ToastTupleContext *ttc)
+{
+ TupleDesc tupleDesc = ttc->ttc_rel->rd_att;
+ int numAttrs = tupleDesc->natts;
+
+ /*
+ * Free allocated temp values
+ */
+ if ((ttc->ttc_flags & TOAST_NEEDS_FREE) != 0)
+ {
+ int i;
+
+ for (i = 0; i < numAttrs; i++)
+ {
+ ToastAttrInfo *attr = &ttc->ttc_attr[i];
+
+ if ((attr->tai_colflags & TOASTCOL_NEEDS_FREE) != 0)
+ pfree(DatumGetPointer(ttc->ttc_values[i]));
+ }
+ }
+
+ /*
+ * Delete external values from the old tuple
+ */
+ if ((ttc->ttc_flags & TOAST_NEEDS_DELETE_OLD) != 0)
+ {
+ int i;
+
+ for (i = 0; i < numAttrs; i++)
+ {
+ ToastAttrInfo *attr = &ttc->ttc_attr[i];
+
+ if ((attr->tai_colflags & TOASTCOL_NEEDS_DELETE_OLD) != 0)
+ toast_delete_datum(ttc->ttc_rel, ttc->ttc_oldvalues[i], false);
+ }
+ }
+}
+
+/*
+ * Check for external stored attributes and delete them from the secondary
+ * relation.
+ */
+void
+toast_delete_external(Relation rel, Datum *values, bool *isnull,
+ bool is_speculative)
+{
+ TupleDesc tupleDesc = rel->rd_att;
+ int numAttrs = tupleDesc->natts;
+ int i;
+
+ for (i = 0; i < numAttrs; i++)
+ {
+ if (TupleDescAttr(tupleDesc, i)->attlen == -1)
+ {
+ Datum value = values[i];
+
+ if (isnull[i])
+ continue;
+ else if (VARATT_IS_EXTERNAL_ONDISK(PointerGetDatum(value)))
+ toast_delete_datum(rel, value, is_speculative);
+ }
+ }
+}
diff --git a/src/backend/access/tablesample/Makefile b/src/backend/access/tablesample/Makefile
new file mode 100644
index 0000000..01641e5
--- /dev/null
+++ b/src/backend/access/tablesample/Makefile
@@ -0,0 +1,20 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for access/tablesample
+#
+# IDENTIFICATION
+# src/backend/access/tablesample/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/tablesample
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ bernoulli.o \
+ system.o \
+ tablesample.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/tablesample/bernoulli.c b/src/backend/access/tablesample/bernoulli.c
new file mode 100644
index 0000000..ae6e4f5
--- /dev/null
+++ b/src/backend/access/tablesample/bernoulli.c
@@ -0,0 +1,229 @@
+/*-------------------------------------------------------------------------
+ *
+ * bernoulli.c
+ * support routines for BERNOULLI tablesample method
+ *
+ * To ensure repeatability of samples, it is necessary that selection of a
+ * given tuple be history-independent; otherwise syncscanning would break
+ * repeatability, to say nothing of logically-irrelevant maintenance such
+ * as physical extension or shortening of the relation.
+ *
+ * To achieve that, we proceed by hashing each candidate TID together with
+ * the active seed, and then selecting it if the hash is less than the
+ * cutoff value computed from the selection probability by BeginSampleScan.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/tablesample/bernoulli.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <math.h>
+
+#include "access/tsmapi.h"
+#include "catalog/pg_type.h"
+#include "common/hashfn.h"
+#include "optimizer/optimizer.h"
+#include "utils/builtins.h"
+
+
+/* Private state */
+typedef struct
+{
+ uint64 cutoff; /* select tuples with hash less than this */
+ uint32 seed; /* random seed */
+ OffsetNumber lt; /* last tuple returned from current block */
+} BernoulliSamplerData;
+
+
+static void bernoulli_samplescangetsamplesize(PlannerInfo *root,
+ RelOptInfo *baserel,
+ List *paramexprs,
+ BlockNumber *pages,
+ double *tuples);
+static void bernoulli_initsamplescan(SampleScanState *node,
+ int eflags);
+static void bernoulli_beginsamplescan(SampleScanState *node,
+ Datum *params,
+ int nparams,
+ uint32 seed);
+static OffsetNumber bernoulli_nextsampletuple(SampleScanState *node,
+ BlockNumber blockno,
+ OffsetNumber maxoffset);
+
+
+/*
+ * Create a TsmRoutine descriptor for the BERNOULLI method.
+ */
+Datum
+tsm_bernoulli_handler(PG_FUNCTION_ARGS)
+{
+ TsmRoutine *tsm = makeNode(TsmRoutine);
+
+ tsm->parameterTypes = list_make1_oid(FLOAT4OID);
+ tsm->repeatable_across_queries = true;
+ tsm->repeatable_across_scans = true;
+ tsm->SampleScanGetSampleSize = bernoulli_samplescangetsamplesize;
+ tsm->InitSampleScan = bernoulli_initsamplescan;
+ tsm->BeginSampleScan = bernoulli_beginsamplescan;
+ tsm->NextSampleBlock = NULL;
+ tsm->NextSampleTuple = bernoulli_nextsampletuple;
+ tsm->EndSampleScan = NULL;
+
+ PG_RETURN_POINTER(tsm);
+}
+
+/*
+ * Sample size estimation.
+ */
+static void
+bernoulli_samplescangetsamplesize(PlannerInfo *root,
+ RelOptInfo *baserel,
+ List *paramexprs,
+ BlockNumber *pages,
+ double *tuples)
+{
+ Node *pctnode;
+ float4 samplefract;
+
+ /* Try to extract an estimate for the sample percentage */
+ pctnode = (Node *) linitial(paramexprs);
+ pctnode = estimate_expression_value(root, pctnode);
+
+ if (IsA(pctnode, Const) &&
+ !((Const *) pctnode)->constisnull)
+ {
+ samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue);
+ if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract))
+ samplefract /= 100.0f;
+ else
+ {
+ /* Default samplefract if the value is bogus */
+ samplefract = 0.1f;
+ }
+ }
+ else
+ {
+ /* Default samplefract if we didn't obtain a non-null Const */
+ samplefract = 0.1f;
+ }
+
+ /* We'll visit all pages of the baserel */
+ *pages = baserel->pages;
+
+ *tuples = clamp_row_est(baserel->tuples * samplefract);
+}
+
+/*
+ * Initialize during executor setup.
+ */
+static void
+bernoulli_initsamplescan(SampleScanState *node, int eflags)
+{
+ node->tsm_state = palloc0(sizeof(BernoulliSamplerData));
+}
+
+/*
+ * Examine parameters and prepare for a sample scan.
+ */
+static void
+bernoulli_beginsamplescan(SampleScanState *node,
+ Datum *params,
+ int nparams,
+ uint32 seed)
+{
+ BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state;
+ double percent = DatumGetFloat4(params[0]);
+ double dcutoff;
+
+ if (percent < 0 || percent > 100 || isnan(percent))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
+ errmsg("sample percentage must be between 0 and 100")));
+
+ /*
+ * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to
+ * store that as a uint64, of course. Note that this gives strictly
+ * correct behavior at the limits of zero or one probability.
+ */
+ dcutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100);
+ sampler->cutoff = (uint64) dcutoff;
+ sampler->seed = seed;
+ sampler->lt = InvalidOffsetNumber;
+
+ /*
+ * Use bulkread, since we're scanning all pages. But pagemode visibility
+ * checking is a win only at larger sampling fractions. The 25% cutoff
+ * here is based on very limited experimentation.
+ */
+ node->use_bulkread = true;
+ node->use_pagemode = (percent >= 25);
+}
+
+/*
+ * Select next sampled tuple in current block.
+ *
+ * It is OK here to return an offset without knowing if the tuple is visible
+ * (or even exists). The reason is that we do the coinflip for every tuple
+ * offset in the table. Since all tuples have the same probability of being
+ * returned, it doesn't matter if we do extra coinflips for invisible tuples.
+ *
+ * When we reach end of the block, return InvalidOffsetNumber which tells
+ * SampleScan to go to next block.
+ */
+static OffsetNumber
+bernoulli_nextsampletuple(SampleScanState *node,
+ BlockNumber blockno,
+ OffsetNumber maxoffset)
+{
+ BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state;
+ OffsetNumber tupoffset = sampler->lt;
+ uint32 hashinput[3];
+
+ /* Advance to first/next tuple in block */
+ if (tupoffset == InvalidOffsetNumber)
+ tupoffset = FirstOffsetNumber;
+ else
+ tupoffset++;
+
+ /*
+ * We compute the hash by applying hash_any to an array of 3 uint32's
+ * containing the block, offset, and seed. This is efficient to set up,
+ * and with the current implementation of hash_any, it gives
+ * machine-independent results, which is a nice property for regression
+ * testing.
+ *
+ * These words in the hash input are the same throughout the block:
+ */
+ hashinput[0] = blockno;
+ hashinput[2] = sampler->seed;
+
+ /*
+ * Loop over tuple offsets until finding suitable TID or reaching end of
+ * block.
+ */
+ for (; tupoffset <= maxoffset; tupoffset++)
+ {
+ uint32 hash;
+
+ hashinput[1] = tupoffset;
+
+ hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput,
+ (int) sizeof(hashinput)));
+ if (hash < sampler->cutoff)
+ break;
+ }
+
+ if (tupoffset > maxoffset)
+ tupoffset = InvalidOffsetNumber;
+
+ sampler->lt = tupoffset;
+
+ return tupoffset;
+}
diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c
new file mode 100644
index 0000000..b0869e5
--- /dev/null
+++ b/src/backend/access/tablesample/system.c
@@ -0,0 +1,257 @@
+/*-------------------------------------------------------------------------
+ *
+ * system.c
+ * support routines for SYSTEM tablesample method
+ *
+ * To ensure repeatability of samples, it is necessary that selection of a
+ * given tuple be history-independent; otherwise syncscanning would break
+ * repeatability, to say nothing of logically-irrelevant maintenance such
+ * as physical extension or shortening of the relation.
+ *
+ * To achieve that, we proceed by hashing each candidate block number together
+ * with the active seed, and then selecting it if the hash is less than the
+ * cutoff value computed from the selection probability by BeginSampleScan.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/access/tablesample/system.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <math.h>
+
+#include "access/relscan.h"
+#include "access/tsmapi.h"
+#include "catalog/pg_type.h"
+#include "common/hashfn.h"
+#include "optimizer/optimizer.h"
+#include "utils/builtins.h"
+
+
+/* Private state */
+typedef struct
+{
+ uint64 cutoff; /* select blocks with hash less than this */
+ uint32 seed; /* random seed */
+ BlockNumber nextblock; /* next block to consider sampling */
+ OffsetNumber lt; /* last tuple returned from current block */
+} SystemSamplerData;
+
+
+static void system_samplescangetsamplesize(PlannerInfo *root,
+ RelOptInfo *baserel,
+ List *paramexprs,
+ BlockNumber *pages,
+ double *tuples);
+static void system_initsamplescan(SampleScanState *node,
+ int eflags);
+static void system_beginsamplescan(SampleScanState *node,
+ Datum *params,
+ int nparams,
+ uint32 seed);
+static BlockNumber system_nextsampleblock(SampleScanState *node, BlockNumber nblocks);
+static OffsetNumber system_nextsampletuple(SampleScanState *node,
+ BlockNumber blockno,
+ OffsetNumber maxoffset);
+
+
+/*
+ * Create a TsmRoutine descriptor for the SYSTEM method.
+ */
+Datum
+tsm_system_handler(PG_FUNCTION_ARGS)
+{
+ TsmRoutine *tsm = makeNode(TsmRoutine);
+
+ tsm->parameterTypes = list_make1_oid(FLOAT4OID);
+ tsm->repeatable_across_queries = true;
+ tsm->repeatable_across_scans = true;
+ tsm->SampleScanGetSampleSize = system_samplescangetsamplesize;
+ tsm->InitSampleScan = system_initsamplescan;
+ tsm->BeginSampleScan = system_beginsamplescan;
+ tsm->NextSampleBlock = system_nextsampleblock;
+ tsm->NextSampleTuple = system_nextsampletuple;
+ tsm->EndSampleScan = NULL;
+
+ PG_RETURN_POINTER(tsm);
+}
+
+/*
+ * Sample size estimation.
+ */
+static void
+system_samplescangetsamplesize(PlannerInfo *root,
+ RelOptInfo *baserel,
+ List *paramexprs,
+ BlockNumber *pages,
+ double *tuples)
+{
+ Node *pctnode;
+ float4 samplefract;
+
+ /* Try to extract an estimate for the sample percentage */
+ pctnode = (Node *) linitial(paramexprs);
+ pctnode = estimate_expression_value(root, pctnode);
+
+ if (IsA(pctnode, Const) &&
+ !((Const *) pctnode)->constisnull)
+ {
+ samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue);
+ if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract))
+ samplefract /= 100.0f;
+ else
+ {
+ /* Default samplefract if the value is bogus */
+ samplefract = 0.1f;
+ }
+ }
+ else
+ {
+ /* Default samplefract if we didn't obtain a non-null Const */
+ samplefract = 0.1f;
+ }
+
+ /* We'll visit a sample of the pages ... */
+ *pages = clamp_row_est(baserel->pages * samplefract);
+
+ /* ... and hopefully get a representative number of tuples from them */
+ *tuples = clamp_row_est(baserel->tuples * samplefract);
+}
+
+/*
+ * Initialize during executor setup.
+ */
+static void
+system_initsamplescan(SampleScanState *node, int eflags)
+{
+ node->tsm_state = palloc0(sizeof(SystemSamplerData));
+}
+
+/*
+ * Examine parameters and prepare for a sample scan.
+ */
+static void
+system_beginsamplescan(SampleScanState *node,
+ Datum *params,
+ int nparams,
+ uint32 seed)
+{
+ SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
+ double percent = DatumGetFloat4(params[0]);
+ double dcutoff;
+
+ if (percent < 0 || percent > 100 || isnan(percent))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
+ errmsg("sample percentage must be between 0 and 100")));
+
+ /*
+ * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to
+ * store that as a uint64, of course. Note that this gives strictly
+ * correct behavior at the limits of zero or one probability.
+ */
+ dcutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100);
+ sampler->cutoff = (uint64) dcutoff;
+ sampler->seed = seed;
+ sampler->nextblock = 0;
+ sampler->lt = InvalidOffsetNumber;
+
+ /*
+ * Bulkread buffer access strategy probably makes sense unless we're
+ * scanning a very small fraction of the table. The 1% cutoff here is a
+ * guess. We should use pagemode visibility checking, since we scan all
+ * tuples on each selected page.
+ */
+ node->use_bulkread = (percent >= 1);
+ node->use_pagemode = true;
+}
+
+/*
+ * Select next block to sample.
+ */
+static BlockNumber
+system_nextsampleblock(SampleScanState *node, BlockNumber nblocks)
+{
+ SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
+ BlockNumber nextblock = sampler->nextblock;
+ uint32 hashinput[2];
+
+ /*
+ * We compute the hash by applying hash_any to an array of 2 uint32's
+ * containing the block number and seed. This is efficient to set up, and
+ * with the current implementation of hash_any, it gives
+ * machine-independent results, which is a nice property for regression
+ * testing.
+ *
+ * These words in the hash input are the same throughout the block:
+ */
+ hashinput[1] = sampler->seed;
+
+ /*
+ * Loop over block numbers until finding suitable block or reaching end of
+ * relation.
+ */
+ for (; nextblock < nblocks; nextblock++)
+ {
+ uint32 hash;
+
+ hashinput[0] = nextblock;
+
+ hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput,
+ (int) sizeof(hashinput)));
+ if (hash < sampler->cutoff)
+ break;
+ }
+
+ if (nextblock < nblocks)
+ {
+ /* Found a suitable block; remember where we should start next time */
+ sampler->nextblock = nextblock + 1;
+ return nextblock;
+ }
+
+ /* Done, but let's reset nextblock to 0 for safety. */
+ sampler->nextblock = 0;
+ return InvalidBlockNumber;
+}
+
+/*
+ * Select next sampled tuple in current block.
+ *
+ * In block sampling, we just want to sample all the tuples in each selected
+ * block.
+ *
+ * It is OK here to return an offset without knowing if the tuple is visible
+ * (or even exists); nodeSamplescan.c will deal with that.
+ *
+ * When we reach end of the block, return InvalidOffsetNumber which tells
+ * SampleScan to go to next block.
+ */
+static OffsetNumber
+system_nextsampletuple(SampleScanState *node,
+ BlockNumber blockno,
+ OffsetNumber maxoffset)
+{
+ SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
+ OffsetNumber tupoffset = sampler->lt;
+
+ /* Advance to next possible offset on page */
+ if (tupoffset == InvalidOffsetNumber)
+ tupoffset = FirstOffsetNumber;
+ else
+ tupoffset++;
+
+ /* Done? */
+ if (tupoffset > maxoffset)
+ tupoffset = InvalidOffsetNumber;
+
+ sampler->lt = tupoffset;
+
+ return tupoffset;
+}
diff --git a/src/backend/access/tablesample/tablesample.c b/src/backend/access/tablesample/tablesample.c
new file mode 100644
index 0000000..02f2a95
--- /dev/null
+++ b/src/backend/access/tablesample/tablesample.c
@@ -0,0 +1,40 @@
+/*-------------------------------------------------------------------------
+ *
+ * tablesample.c
+ * Support functions for TABLESAMPLE feature
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/tablesample/tablesample.c
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/tsmapi.h"
+
+
+/*
+ * GetTsmRoutine --- get a TsmRoutine struct by invoking the handler.
+ *
+ * This is a convenience routine that's just meant to check for errors.
+ */
+TsmRoutine *
+GetTsmRoutine(Oid tsmhandler)
+{
+ Datum datum;
+ TsmRoutine *routine;
+
+ datum = OidFunctionCall1(tsmhandler, PointerGetDatum(NULL));
+ routine = (TsmRoutine *) DatumGetPointer(datum);
+
+ if (routine == NULL || !IsA(routine, TsmRoutine))
+ elog(ERROR, "tablesample handler function %u did not return a TsmRoutine struct",
+ tsmhandler);
+
+ return routine;
+}