diff options
Diffstat (limited to '')
-rw-r--r-- | src/backend/access/table/Makefile | 21 | ||||
-rw-r--r-- | src/backend/access/table/table.c | 170 | ||||
-rw-r--r-- | src/backend/access/table/tableam.c | 765 | ||||
-rw-r--r-- | src/backend/access/table/tableamapi.c | 158 | ||||
-rw-r--r-- | src/backend/access/table/toast_helper.c | 337 | ||||
-rw-r--r-- | src/backend/access/tablesample/Makefile | 20 | ||||
-rw-r--r-- | src/backend/access/tablesample/bernoulli.c | 229 | ||||
-rw-r--r-- | src/backend/access/tablesample/system.c | 257 | ||||
-rw-r--r-- | src/backend/access/tablesample/tablesample.c | 40 |
9 files changed, 1997 insertions, 0 deletions
diff --git a/src/backend/access/table/Makefile b/src/backend/access/table/Makefile new file mode 100644 index 0000000..9aba3ff --- /dev/null +++ b/src/backend/access/table/Makefile @@ -0,0 +1,21 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/table +# +# IDENTIFICATION +# src/backend/access/table/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/table +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + table.o \ + tableam.o \ + tableamapi.o \ + toast_helper.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/table/table.c b/src/backend/access/table/table.c new file mode 100644 index 0000000..545007e --- /dev/null +++ b/src/backend/access/table/table.c @@ -0,0 +1,170 @@ +/*------------------------------------------------------------------------- + * + * table.c + * Generic routines for table related code. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/table/table.c + * + * + * NOTES + * This file contains table_ routines that implement access to tables (in + * contrast to other relation types like indexes) that are independent of + * individual table access methods. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/relation.h" +#include "access/table.h" +#include "storage/lmgr.h" + + +/* ---------------- + * table_open - open a table relation by relation OID + * + * This is essentially relation_open plus check that the relation + * is not an index nor a composite type. (The caller should also + * check that it's not a view or foreign table before assuming it has + * storage.) + * ---------------- + */ +Relation +table_open(Oid relationId, LOCKMODE lockmode) +{ + Relation r; + + r = relation_open(relationId, lockmode); + + if (r->rd_rel->relkind == RELKIND_INDEX || + r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is an index", + RelationGetRelationName(r)))); + else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is a composite type", + RelationGetRelationName(r)))); + + return r; +} + + +/* ---------------- + * try_table_open - open a table relation by relation OID + * + * Same as table_open, except return NULL instead of failing + * if the relation does not exist. + * ---------------- + */ +Relation +try_table_open(Oid relationId, LOCKMODE lockmode) +{ + Relation r; + + r = try_relation_open(relationId, lockmode); + + /* leave if table does not exist */ + if (!r) + return NULL; + + if (r->rd_rel->relkind == RELKIND_INDEX || + r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is an index", + RelationGetRelationName(r)))); + else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is a composite type", + RelationGetRelationName(r)))); + + return r; +} + +/* ---------------- + * table_openrv - open a table relation specified + * by a RangeVar node + * + * As above, but relation is specified by a RangeVar. + * ---------------- + */ +Relation +table_openrv(const RangeVar *relation, LOCKMODE lockmode) +{ + Relation r; + + r = relation_openrv(relation, lockmode); + + if (r->rd_rel->relkind == RELKIND_INDEX || + r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is an index", + RelationGetRelationName(r)))); + else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is a composite type", + RelationGetRelationName(r)))); + + return r; +} + +/* ---------------- + * table_openrv_extended - open a table relation specified + * by a RangeVar node + * + * As above, but optionally return NULL instead of failing for + * relation-not-found. + * ---------------- + */ +Relation +table_openrv_extended(const RangeVar *relation, LOCKMODE lockmode, + bool missing_ok) +{ + Relation r; + + r = relation_openrv_extended(relation, lockmode, missing_ok); + + if (r) + { + if (r->rd_rel->relkind == RELKIND_INDEX || + r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is an index", + RelationGetRelationName(r)))); + else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is a composite type", + RelationGetRelationName(r)))); + } + + return r; +} + +/* ---------------- + * table_close - close a table + * + * If lockmode is not "NoLock", we then release the specified lock. + * + * Note that it is often sensible to hold a lock beyond relation_close; + * in that case, the lock is released automatically at xact end. + * ---------------- + */ +void +table_close(Relation relation, LOCKMODE lockmode) +{ + relation_close(relation, lockmode); +} diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c new file mode 100644 index 0000000..5ea5bdd --- /dev/null +++ b/src/backend/access/table/tableam.c @@ -0,0 +1,765 @@ +/*---------------------------------------------------------------------- + * + * tableam.c + * Table access method routines too big to be inline functions. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/table/tableam.c + * + * NOTES + * Note that most function in here are documented in tableam.h, rather than + * here. That's because there's a lot of inline functions in tableam.h and + * it'd be harder to understand if one constantly had to switch between files. + * + *---------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <math.h> + +#include "access/syncscan.h" +#include "access/tableam.h" +#include "access/xact.h" +#include "optimizer/plancat.h" +#include "port/pg_bitutils.h" +#include "storage/bufmgr.h" +#include "storage/shmem.h" +#include "storage/smgr.h" + +/* + * Constants to control the behavior of block allocation to parallel workers + * during a parallel seqscan. Technically these values do not need to be + * powers of 2, but having them as powers of 2 makes the math more optimal + * and makes the ramp-down stepping more even. + */ + +/* The number of I/O chunks we try to break a parallel seqscan down into */ +#define PARALLEL_SEQSCAN_NCHUNKS 2048 +/* Ramp down size of allocations when we've only this number of chunks left */ +#define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS 64 +/* Cap the size of parallel I/O chunks to this number of blocks */ +#define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE 8192 + +/* GUC variables */ +char *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD; +bool synchronize_seqscans = true; + + +/* ---------------------------------------------------------------------------- + * Slot functions. + * ---------------------------------------------------------------------------- + */ + +const TupleTableSlotOps * +table_slot_callbacks(Relation relation) +{ + const TupleTableSlotOps *tts_cb; + + if (relation->rd_tableam) + tts_cb = relation->rd_tableam->slot_callbacks(relation); + else if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + { + /* + * Historically FDWs expect to store heap tuples in slots. Continue + * handing them one, to make it less painful to adapt FDWs to new + * versions. The cost of a heap slot over a virtual slot is pretty + * small. + */ + tts_cb = &TTSOpsHeapTuple; + } + else + { + /* + * These need to be supported, as some parts of the code (like COPY) + * need to create slots for such relations too. It seems better to + * centralize the knowledge that a heap slot is the right thing in + * that case here. + */ + Assert(relation->rd_rel->relkind == RELKIND_VIEW || + relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); + tts_cb = &TTSOpsVirtual; + } + + return tts_cb; +} + +TupleTableSlot * +table_slot_create(Relation relation, List **reglist) +{ + const TupleTableSlotOps *tts_cb; + TupleTableSlot *slot; + + tts_cb = table_slot_callbacks(relation); + slot = MakeSingleTupleTableSlot(RelationGetDescr(relation), tts_cb); + + if (reglist) + *reglist = lappend(*reglist, slot); + + return slot; +} + + +/* ---------------------------------------------------------------------------- + * Table scan functions. + * ---------------------------------------------------------------------------- + */ + +TableScanDesc +table_beginscan_catalog(Relation relation, int nkeys, struct ScanKeyData *key) +{ + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | SO_TEMP_SNAPSHOT; + Oid relid = RelationGetRelid(relation); + Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid)); + + return relation->rd_tableam->scan_begin(relation, snapshot, nkeys, key, + NULL, flags); +} + +void +table_scan_update_snapshot(TableScanDesc scan, Snapshot snapshot) +{ + Assert(IsMVCCSnapshot(snapshot)); + + RegisterSnapshot(snapshot); + scan->rs_snapshot = snapshot; + scan->rs_flags |= SO_TEMP_SNAPSHOT; +} + + +/* ---------------------------------------------------------------------------- + * Parallel table scan related functions. + * ---------------------------------------------------------------------------- + */ + +Size +table_parallelscan_estimate(Relation rel, Snapshot snapshot) +{ + Size sz = 0; + + if (IsMVCCSnapshot(snapshot)) + sz = add_size(sz, EstimateSnapshotSpace(snapshot)); + else + Assert(snapshot == SnapshotAny); + + sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel)); + + return sz; +} + +void +table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, + Snapshot snapshot) +{ + Size snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan); + + pscan->phs_snapshot_off = snapshot_off; + + if (IsMVCCSnapshot(snapshot)) + { + SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off); + pscan->phs_snapshot_any = false; + } + else + { + Assert(snapshot == SnapshotAny); + pscan->phs_snapshot_any = true; + } +} + +TableScanDesc +table_beginscan_parallel(Relation relation, ParallelTableScanDesc parallel_scan) +{ + Snapshot snapshot; + uint32 flags = SO_TYPE_SEQSCAN | + SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE; + + Assert(RelationGetRelid(relation) == parallel_scan->phs_relid); + + if (!parallel_scan->phs_snapshot_any) + { + /* Snapshot was serialized -- restore it */ + snapshot = RestoreSnapshot((char *) parallel_scan + + parallel_scan->phs_snapshot_off); + RegisterSnapshot(snapshot); + flags |= SO_TEMP_SNAPSHOT; + } + else + { + /* SnapshotAny passed by caller (not serialized) */ + snapshot = SnapshotAny; + } + + return relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL, + parallel_scan, flags); +} + + +/* ---------------------------------------------------------------------------- + * Index scan related functions. + * ---------------------------------------------------------------------------- + */ + +/* + * To perform that check simply start an index scan, create the necessary + * slot, do the heap lookup, and shut everything down again. This could be + * optimized, but is unlikely to matter from a performance POV. If there + * frequently are live index pointers also matching a unique index key, the + * CPU overhead of this routine is unlikely to matter. + * + * Note that *tid may be modified when we return true if the AM supports + * storing multiple row versions reachable via a single index entry (like + * heap's HOT). + */ +bool +table_index_fetch_tuple_check(Relation rel, + ItemPointer tid, + Snapshot snapshot, + bool *all_dead) +{ + IndexFetchTableData *scan; + TupleTableSlot *slot; + bool call_again = false; + bool found; + + slot = table_slot_create(rel, NULL); + scan = table_index_fetch_begin(rel); + found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again, + all_dead); + table_index_fetch_end(scan); + ExecDropSingleTupleTableSlot(slot); + + return found; +} + + +/* ------------------------------------------------------------------------ + * Functions for non-modifying operations on individual tuples + * ------------------------------------------------------------------------ + */ + +void +table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid) +{ + Relation rel = scan->rs_rd; + const TableAmRoutine *tableam = rel->rd_tableam; + + /* + * We don't expect direct calls to table_tuple_get_latest_tid with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding"); + + /* + * Since this can be called with user-supplied TID, don't trust the input + * too much. + */ + if (!tableam->tuple_tid_valid(scan, tid)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("tid (%u, %u) is not valid for relation \"%s\"", + ItemPointerGetBlockNumberNoCheck(tid), + ItemPointerGetOffsetNumberNoCheck(tid), + RelationGetRelationName(rel)))); + + tableam->tuple_get_latest_tid(scan, tid); +} + + +/* ---------------------------------------------------------------------------- + * Functions to make modifications a bit simpler. + * ---------------------------------------------------------------------------- + */ + +/* + * simple_table_tuple_insert - insert a tuple + * + * Currently, this routine differs from table_tuple_insert only in supplying a + * default command ID and not allowing access to the speedup options. + */ +void +simple_table_tuple_insert(Relation rel, TupleTableSlot *slot) +{ + table_tuple_insert(rel, slot, GetCurrentCommandId(true), 0, NULL); +} + +/* + * simple_table_tuple_delete - delete a tuple + * + * This routine may be used to delete a tuple when concurrent updates of + * the target tuple are not expected (for example, because we have a lock + * on the relation associated with the tuple). Any failure is reported + * via ereport(). + */ +void +simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot) +{ + TM_Result result; + TM_FailureData tmfd; + + result = table_tuple_delete(rel, tid, + GetCurrentCommandId(true), + snapshot, InvalidSnapshot, + true /* wait for commit */ , + &tmfd, false /* changingPart */ ); + + switch (result) + { + case TM_SelfModified: + /* Tuple was already updated in current command? */ + elog(ERROR, "tuple already updated by self"); + break; + + case TM_Ok: + /* done successfully */ + break; + + case TM_Updated: + elog(ERROR, "tuple concurrently updated"); + break; + + case TM_Deleted: + elog(ERROR, "tuple concurrently deleted"); + break; + + default: + elog(ERROR, "unrecognized table_tuple_delete status: %u", result); + break; + } +} + +/* + * simple_table_tuple_update - replace a tuple + * + * This routine may be used to update a tuple when concurrent updates of + * the target tuple are not expected (for example, because we have a lock + * on the relation associated with the tuple). Any failure is reported + * via ereport(). + */ +void +simple_table_tuple_update(Relation rel, ItemPointer otid, + TupleTableSlot *slot, + Snapshot snapshot, + bool *update_indexes) +{ + TM_Result result; + TM_FailureData tmfd; + LockTupleMode lockmode; + + result = table_tuple_update(rel, otid, slot, + GetCurrentCommandId(true), + snapshot, InvalidSnapshot, + true /* wait for commit */ , + &tmfd, &lockmode, update_indexes); + + switch (result) + { + case TM_SelfModified: + /* Tuple was already updated in current command? */ + elog(ERROR, "tuple already updated by self"); + break; + + case TM_Ok: + /* done successfully */ + break; + + case TM_Updated: + elog(ERROR, "tuple concurrently updated"); + break; + + case TM_Deleted: + elog(ERROR, "tuple concurrently deleted"); + break; + + default: + elog(ERROR, "unrecognized table_tuple_update status: %u", result); + break; + } + +} + + +/* ---------------------------------------------------------------------------- + * Helper functions to implement parallel scans for block oriented AMs. + * ---------------------------------------------------------------------------- + */ + +Size +table_block_parallelscan_estimate(Relation rel) +{ + return sizeof(ParallelBlockTableScanDescData); +} + +Size +table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan) +{ + ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan; + + bpscan->base.phs_relid = RelationGetRelid(rel); + bpscan->phs_nblocks = RelationGetNumberOfBlocks(rel); + /* compare phs_syncscan initialization to similar logic in initscan */ + bpscan->base.phs_syncscan = synchronize_seqscans && + !RelationUsesLocalBuffers(rel) && + bpscan->phs_nblocks > NBuffers / 4; + SpinLockInit(&bpscan->phs_mutex); + bpscan->phs_startblock = InvalidBlockNumber; + pg_atomic_init_u64(&bpscan->phs_nallocated, 0); + + return sizeof(ParallelBlockTableScanDescData); +} + +void +table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan) +{ + ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan; + + pg_atomic_write_u64(&bpscan->phs_nallocated, 0); +} + +/* + * find and set the scan's startblock + * + * Determine where the parallel seq scan should start. This function may be + * called many times, once by each parallel worker. We must be careful only + * to set the startblock once. + */ +void +table_block_parallelscan_startblock_init(Relation rel, + ParallelBlockTableScanWorker pbscanwork, + ParallelBlockTableScanDesc pbscan) +{ + BlockNumber sync_startpage = InvalidBlockNumber; + + /* Reset the state we use for controlling allocation size. */ + memset(pbscanwork, 0, sizeof(*pbscanwork)); + + StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE, + "pg_nextpower2_32 may be too small for non-standard BlockNumber width"); + + /* + * We determine the chunk size based on the size of the relation. First we + * split the relation into PARALLEL_SEQSCAN_NCHUNKS chunks but we then + * take the next highest power of 2 number of the chunk size. This means + * we split the relation into somewhere between PARALLEL_SEQSCAN_NCHUNKS + * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks. + */ + pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(pbscan->phs_nblocks / + PARALLEL_SEQSCAN_NCHUNKS, 1)); + + /* + * Ensure we don't go over the maximum chunk size with larger tables. This + * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger + * tables. Too large a chunk size has been shown to be detrimental to + * synchronous scan performance. + */ + pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size, + PARALLEL_SEQSCAN_MAX_CHUNK_SIZE); + +retry: + /* Grab the spinlock. */ + SpinLockAcquire(&pbscan->phs_mutex); + + /* + * If the scan's startblock has not yet been initialized, we must do so + * now. If this is not a synchronized scan, we just start at block 0, but + * if it is a synchronized scan, we must get the starting position from + * the synchronized scan machinery. We can't hold the spinlock while + * doing that, though, so release the spinlock, get the information we + * need, and retry. If nobody else has initialized the scan in the + * meantime, we'll fill in the value we fetched on the second time + * through. + */ + if (pbscan->phs_startblock == InvalidBlockNumber) + { + if (!pbscan->base.phs_syncscan) + pbscan->phs_startblock = 0; + else if (sync_startpage != InvalidBlockNumber) + pbscan->phs_startblock = sync_startpage; + else + { + SpinLockRelease(&pbscan->phs_mutex); + sync_startpage = ss_get_location(rel, pbscan->phs_nblocks); + goto retry; + } + } + SpinLockRelease(&pbscan->phs_mutex); +} + +/* + * get the next page to scan + * + * Get the next page to scan. Even if there are no pages left to scan, + * another backend could have grabbed a page to scan and not yet finished + * looking at it, so it doesn't follow that the scan is done when the first + * backend gets an InvalidBlockNumber return. + */ +BlockNumber +table_block_parallelscan_nextpage(Relation rel, + ParallelBlockTableScanWorker pbscanwork, + ParallelBlockTableScanDesc pbscan) +{ + BlockNumber page; + uint64 nallocated; + + /* + * The logic below allocates block numbers out to parallel workers in a + * way that each worker will receive a set of consecutive block numbers to + * scan. Earlier versions of this would allocate the next highest block + * number to the next worker to call this function. This would generally + * result in workers never receiving consecutive block numbers. Some + * operating systems would not detect the sequential I/O pattern due to + * each backend being a different process which could result in poor + * performance due to inefficient or no readahead. To work around this + * issue, we now allocate a range of block numbers for each worker and + * when they come back for another block, we give them the next one in + * that range until the range is complete. When the worker completes the + * range of blocks we then allocate another range for it and return the + * first block number from that range. + * + * Here we name these ranges of blocks "chunks". The initial size of + * these chunks is determined in table_block_parallelscan_startblock_init + * based on the size of the relation. Towards the end of the scan, we + * start making reductions in the size of the chunks in order to attempt + * to divide the remaining work over all the workers as evenly as + * possible. + * + * Here pbscanwork is local worker memory. phsw_chunk_remaining tracks + * the number of blocks remaining in the chunk. When that reaches 0 then + * we must allocate a new chunk for the worker. + * + * phs_nallocated tracks how many blocks have been allocated to workers + * already. When phs_nallocated >= rs_nblocks, all blocks have been + * allocated. + * + * Because we use an atomic fetch-and-add to fetch the current value, the + * phs_nallocated counter will exceed rs_nblocks, because workers will + * still increment the value, when they try to allocate the next block but + * all blocks have been allocated already. The counter must be 64 bits + * wide because of that, to avoid wrapping around when rs_nblocks is close + * to 2^32. + * + * The actual block to return is calculated by adding the counter to the + * starting block number, modulo nblocks. + */ + + /* + * First check if we have any remaining blocks in a previous chunk for + * this worker. We must consume all of the blocks from that before we + * allocate a new chunk to the worker. + */ + if (pbscanwork->phsw_chunk_remaining > 0) + { + /* + * Give them the next block in the range and update the remaining + * number of blocks. + */ + nallocated = ++pbscanwork->phsw_nallocated; + pbscanwork->phsw_chunk_remaining--; + } + else + { + /* + * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks + * remaining in the scan, we half the chunk size. Since we reduce the + * chunk size here, we'll hit this again after doing + * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size. After a few + * iterations of this, we'll end up doing the last few blocks with the + * chunk size set to 1. + */ + if (pbscanwork->phsw_chunk_size > 1 && + pbscanwork->phsw_nallocated > pbscan->phs_nblocks - + (pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS)) + pbscanwork->phsw_chunk_size >>= 1; + + nallocated = pbscanwork->phsw_nallocated = + pg_atomic_fetch_add_u64(&pbscan->phs_nallocated, + pbscanwork->phsw_chunk_size); + + /* + * Set the remaining number of blocks in this chunk so that subsequent + * calls from this worker continue on with this chunk until it's done. + */ + pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1; + } + + if (nallocated >= pbscan->phs_nblocks) + page = InvalidBlockNumber; /* all blocks have been allocated */ + else + page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks; + + /* + * Report scan location. Normally, we report the current page number. + * When we reach the end of the scan, though, we report the starting page, + * not the ending page, just so the starting positions for later scans + * doesn't slew backwards. We only report the position at the end of the + * scan once, though: subsequent callers will report nothing. + */ + if (pbscan->base.phs_syncscan) + { + if (page != InvalidBlockNumber) + ss_report_location(rel, page); + else if (nallocated == pbscan->phs_nblocks) + ss_report_location(rel, pbscan->phs_startblock); + } + + return page; +} + +/* ---------------------------------------------------------------------------- + * Helper functions to implement relation sizing for block oriented AMs. + * ---------------------------------------------------------------------------- + */ + +/* + * table_block_relation_size + * + * If a table AM uses the various relation forks as the sole place where data + * is stored, and if it uses them in the expected manner (e.g. the actual data + * is in the main fork rather than some other), it can use this implementation + * of the relation_size callback rather than implementing its own. + */ +uint64 +table_block_relation_size(Relation rel, ForkNumber forkNumber) +{ + uint64 nblocks = 0; + + /* Open it at the smgr level if not already done */ + RelationOpenSmgr(rel); + + /* InvalidForkNumber indicates returning the size for all forks */ + if (forkNumber == InvalidForkNumber) + { + for (int i = 0; i < MAX_FORKNUM; i++) + nblocks += smgrnblocks(rel->rd_smgr, i); + } + else + nblocks = smgrnblocks(rel->rd_smgr, forkNumber); + + return nblocks * BLCKSZ; +} + +/* + * table_block_relation_estimate_size + * + * This function can't be directly used as the implementation of the + * relation_estimate_size callback, because it has a few additional parameters. + * Instead, it is intended to be used as a helper function; the caller can + * pass through the arguments to its relation_estimate_size function plus the + * additional values required here. + * + * overhead_bytes_per_tuple should contain the approximate number of bytes + * of storage required to store a tuple above and beyond what is required for + * the tuple data proper. Typically, this would include things like the + * size of the tuple header and item pointer. This is only used for query + * planning, so a table AM where the value is not constant could choose to + * pass a "best guess". + * + * usable_bytes_per_page should contain the approximate number of bytes per + * page usable for tuple data, excluding the page header and any anticipated + * special space. + */ +void +table_block_relation_estimate_size(Relation rel, int32 *attr_widths, + BlockNumber *pages, double *tuples, + double *allvisfrac, + Size overhead_bytes_per_tuple, + Size usable_bytes_per_page) +{ + BlockNumber curpages; + BlockNumber relpages; + double reltuples; + BlockNumber relallvisible; + double density; + + /* it should have storage, so we can call the smgr */ + curpages = RelationGetNumberOfBlocks(rel); + + /* coerce values in pg_class to more desirable types */ + relpages = (BlockNumber) rel->rd_rel->relpages; + reltuples = (double) rel->rd_rel->reltuples; + relallvisible = (BlockNumber) rel->rd_rel->relallvisible; + + /* + * HACK: if the relation has never yet been vacuumed, use a minimum size + * estimate of 10 pages. The idea here is to avoid assuming a + * newly-created table is really small, even if it currently is, because + * that may not be true once some data gets loaded into it. Once a vacuum + * or analyze cycle has been done on it, it's more reasonable to believe + * the size is somewhat stable. + * + * (Note that this is only an issue if the plan gets cached and used again + * after the table has been filled. What we're trying to avoid is using a + * nestloop-type plan on a table that has grown substantially since the + * plan was made. Normally, autovacuum/autoanalyze will occur once enough + * inserts have happened and cause cached-plan invalidation; but that + * doesn't happen instantaneously, and it won't happen at all for cases + * such as temporary tables.) + * + * We test "never vacuumed" by seeing whether reltuples < 0. + * + * If the table has inheritance children, we don't apply this heuristic. + * Totally empty parent tables are quite common, so we should be willing + * to believe that they are empty. + */ + if (curpages < 10 && + reltuples < 0 && + !rel->rd_rel->relhassubclass) + curpages = 10; + + /* report estimated # pages */ + *pages = curpages; + /* quick exit if rel is clearly empty */ + if (curpages == 0) + { + *tuples = 0; + *allvisfrac = 0; + return; + } + + /* estimate number of tuples from previous tuple density */ + if (reltuples >= 0 && relpages > 0) + density = reltuples / (double) relpages; + else + { + /* + * When we have no data because the relation was never yet vacuumed, + * estimate tuple width from attribute datatypes. We assume here that + * the pages are completely full, which is OK for tables but is + * probably an overestimate for indexes. Fortunately + * get_relation_info() can clamp the overestimate to the parent + * table's size. + * + * Note: this code intentionally disregards alignment considerations, + * because (a) that would be gilding the lily considering how crude + * the estimate is, (b) it creates platform dependencies in the + * default plans which are kind of a headache for regression testing, + * and (c) different table AMs might use different padding schemes. + */ + int32 tuple_width; + + tuple_width = get_rel_data_width(rel, attr_widths); + tuple_width += overhead_bytes_per_tuple; + /* note: integer division is intentional here */ + density = usable_bytes_per_page / tuple_width; + } + *tuples = rint(density * (double) curpages); + + /* + * We use relallvisible as-is, rather than scaling it up like we do for + * the pages and tuples counts, on the theory that any pages added since + * the last VACUUM are most likely not marked all-visible. But costsize.c + * wants it converted to a fraction. + */ + if (relallvisible == 0 || curpages <= 0) + *allvisfrac = 0; + else if ((double) relallvisible >= curpages) + *allvisfrac = 1; + else + *allvisfrac = (double) relallvisible / curpages; +} diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c new file mode 100644 index 0000000..325ecdc --- /dev/null +++ b/src/backend/access/table/tableamapi.c @@ -0,0 +1,158 @@ +/*---------------------------------------------------------------------- + * + * tableamapi.c + * Support routines for API for Postgres table access methods + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/table/tableamapi.c + *---------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/tableam.h" +#include "access/xact.h" +#include "catalog/pg_am.h" +#include "catalog/pg_proc.h" +#include "commands/defrem.h" +#include "miscadmin.h" +#include "utils/fmgroids.h" +#include "utils/memutils.h" +#include "utils/syscache.h" + + +/* + * GetTableAmRoutine + * Call the specified access method handler routine to get its + * TableAmRoutine struct, which will be palloc'd in the caller's + * memory context. + */ +const TableAmRoutine * +GetTableAmRoutine(Oid amhandler) +{ + Datum datum; + const TableAmRoutine *routine; + + datum = OidFunctionCall0(amhandler); + routine = (TableAmRoutine *) DatumGetPointer(datum); + + if (routine == NULL || !IsA(routine, TableAmRoutine)) + elog(ERROR, "table access method handler %u did not return a TableAmRoutine struct", + amhandler); + + /* + * Assert that all required callbacks are present. That makes it a bit + * easier to keep AMs up to date, e.g. when forward porting them to a new + * major version. + */ + Assert(routine->scan_begin != NULL); + Assert(routine->scan_end != NULL); + Assert(routine->scan_rescan != NULL); + Assert(routine->scan_getnextslot != NULL); + + Assert(routine->parallelscan_estimate != NULL); + Assert(routine->parallelscan_initialize != NULL); + Assert(routine->parallelscan_reinitialize != NULL); + + Assert(routine->index_fetch_begin != NULL); + Assert(routine->index_fetch_reset != NULL); + Assert(routine->index_fetch_end != NULL); + Assert(routine->index_fetch_tuple != NULL); + + Assert(routine->tuple_fetch_row_version != NULL); + Assert(routine->tuple_tid_valid != NULL); + Assert(routine->tuple_get_latest_tid != NULL); + Assert(routine->tuple_satisfies_snapshot != NULL); + Assert(routine->index_delete_tuples != NULL); + + Assert(routine->tuple_insert != NULL); + + /* + * Could be made optional, but would require throwing error during + * parse-analysis. + */ + Assert(routine->tuple_insert_speculative != NULL); + Assert(routine->tuple_complete_speculative != NULL); + + Assert(routine->multi_insert != NULL); + Assert(routine->tuple_delete != NULL); + Assert(routine->tuple_update != NULL); + Assert(routine->tuple_lock != NULL); + + Assert(routine->relation_set_new_filenode != NULL); + Assert(routine->relation_nontransactional_truncate != NULL); + Assert(routine->relation_copy_data != NULL); + Assert(routine->relation_copy_for_cluster != NULL); + Assert(routine->relation_vacuum != NULL); + Assert(routine->scan_analyze_next_block != NULL); + Assert(routine->scan_analyze_next_tuple != NULL); + Assert(routine->index_build_range_scan != NULL); + Assert(routine->index_validate_scan != NULL); + + Assert(routine->relation_size != NULL); + Assert(routine->relation_needs_toast_table != NULL); + + Assert(routine->relation_estimate_size != NULL); + + /* optional, but one callback implies presence of the other */ + Assert((routine->scan_bitmap_next_block == NULL) == + (routine->scan_bitmap_next_tuple == NULL)); + Assert(routine->scan_sample_next_block != NULL); + Assert(routine->scan_sample_next_tuple != NULL); + + return routine; +} + +/* check_hook: validate new default_table_access_method */ +bool +check_default_table_access_method(char **newval, void **extra, GucSource source) +{ + if (**newval == '\0') + { + GUC_check_errdetail("%s cannot be empty.", + "default_table_access_method"); + return false; + } + + if (strlen(*newval) >= NAMEDATALEN) + { + GUC_check_errdetail("%s is too long (maximum %d characters).", + "default_table_access_method", NAMEDATALEN - 1); + return false; + } + + /* + * If we aren't inside a transaction, or not connected to a database, we + * cannot do the catalog access necessary to verify the method. Must + * accept the value on faith. + */ + if (IsTransactionState() && MyDatabaseId != InvalidOid) + { + if (!OidIsValid(get_table_am_oid(*newval, true))) + { + /* + * When source == PGC_S_TEST, don't throw a hard error for a + * nonexistent table access method, only a NOTICE. See comments in + * guc.h. + */ + if (source == PGC_S_TEST) + { + ereport(NOTICE, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("table access method \"%s\" does not exist", + *newval))); + } + else + { + GUC_check_errdetail("Table access method \"%s\" does not exist.", + *newval); + return false; + } + } + } + + return true; +} diff --git a/src/backend/access/table/toast_helper.c b/src/backend/access/table/toast_helper.c new file mode 100644 index 0000000..013236b --- /dev/null +++ b/src/backend/access/table/toast_helper.c @@ -0,0 +1,337 @@ +/*------------------------------------------------------------------------- + * + * toast_helper.c + * Helper functions for table AMs implementing compressed or + * out-of-line storage of varlena attributes. + * + * Copyright (c) 2000-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/access/table/toast_helper.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/detoast.h" +#include "access/table.h" +#include "access/toast_helper.h" +#include "access/toast_internals.h" +#include "catalog/pg_type_d.h" + + +/* + * Prepare to TOAST a tuple. + * + * tupleDesc, toast_values, and toast_isnull are required parameters; they + * provide the necessary details about the tuple to be toasted. + * + * toast_oldvalues and toast_oldisnull should be NULL for a newly-inserted + * tuple; for an update, they should describe the existing tuple. + * + * All of these arrays should have a length equal to tupleDesc->natts. + * + * On return, toast_flags and toast_attr will have been initialized. + * toast_flags is just a single uint8, but toast_attr is a caller-provided + * array with a length equal to tupleDesc->natts. The caller need not + * perform any initialization of the array before calling this function. + */ +void +toast_tuple_init(ToastTupleContext *ttc) +{ + TupleDesc tupleDesc = ttc->ttc_rel->rd_att; + int numAttrs = tupleDesc->natts; + int i; + + ttc->ttc_flags = 0; + + for (i = 0; i < numAttrs; i++) + { + Form_pg_attribute att = TupleDescAttr(tupleDesc, i); + struct varlena *old_value; + struct varlena *new_value; + + ttc->ttc_attr[i].tai_colflags = 0; + ttc->ttc_attr[i].tai_oldexternal = NULL; + ttc->ttc_attr[i].tai_compression = att->attcompression; + + if (ttc->ttc_oldvalues != NULL) + { + /* + * For UPDATE get the old and new values of this attribute + */ + old_value = + (struct varlena *) DatumGetPointer(ttc->ttc_oldvalues[i]); + new_value = + (struct varlena *) DatumGetPointer(ttc->ttc_values[i]); + + /* + * If the old value is stored on disk, check if it has changed so + * we have to delete it later. + */ + if (att->attlen == -1 && !ttc->ttc_oldisnull[i] && + VARATT_IS_EXTERNAL_ONDISK(old_value)) + { + if (ttc->ttc_isnull[i] || + !VARATT_IS_EXTERNAL_ONDISK(new_value) || + memcmp((char *) old_value, (char *) new_value, + VARSIZE_EXTERNAL(old_value)) != 0) + { + /* + * The old external stored value isn't needed any more + * after the update + */ + ttc->ttc_attr[i].tai_colflags |= TOASTCOL_NEEDS_DELETE_OLD; + ttc->ttc_flags |= TOAST_NEEDS_DELETE_OLD; + } + else + { + /* + * This attribute isn't changed by this update so we reuse + * the original reference to the old value in the new + * tuple. + */ + ttc->ttc_attr[i].tai_colflags |= TOASTCOL_IGNORE; + continue; + } + } + } + else + { + /* + * For INSERT simply get the new value + */ + new_value = (struct varlena *) DatumGetPointer(ttc->ttc_values[i]); + } + + /* + * Handle NULL attributes + */ + if (ttc->ttc_isnull[i]) + { + ttc->ttc_attr[i].tai_colflags |= TOASTCOL_IGNORE; + ttc->ttc_flags |= TOAST_HAS_NULLS; + continue; + } + + /* + * Now look at varlena attributes + */ + if (att->attlen == -1) + { + /* + * If the table's attribute says PLAIN always, force it so. + */ + if (att->attstorage == TYPSTORAGE_PLAIN) + ttc->ttc_attr[i].tai_colflags |= TOASTCOL_IGNORE; + + /* + * We took care of UPDATE above, so any external value we find + * still in the tuple must be someone else's that we cannot reuse + * (this includes the case of an out-of-line in-memory datum). + * Fetch it back (without decompression, unless we are forcing + * PLAIN storage). If necessary, we'll push it out as a new + * external value below. + */ + if (VARATT_IS_EXTERNAL(new_value)) + { + ttc->ttc_attr[i].tai_oldexternal = new_value; + if (att->attstorage == TYPSTORAGE_PLAIN) + new_value = detoast_attr(new_value); + else + new_value = detoast_external_attr(new_value); + ttc->ttc_values[i] = PointerGetDatum(new_value); + ttc->ttc_attr[i].tai_colflags |= TOASTCOL_NEEDS_FREE; + ttc->ttc_flags |= (TOAST_NEEDS_CHANGE | TOAST_NEEDS_FREE); + } + + /* + * Remember the size of this attribute + */ + ttc->ttc_attr[i].tai_size = VARSIZE_ANY(new_value); + } + else + { + /* + * Not a varlena attribute, plain storage always + */ + ttc->ttc_attr[i].tai_colflags |= TOASTCOL_IGNORE; + } + } +} + +/* + * Find the largest varlena attribute that satisfies certain criteria. + * + * The relevant column must not be marked TOASTCOL_IGNORE, and if the + * for_compression flag is passed as true, it must also not be marked + * TOASTCOL_INCOMPRESSIBLE. + * + * The column must have attstorage EXTERNAL or EXTENDED if check_main is + * false, and must have attstorage MAIN if check_main is true. + * + * The column must have a minimum size of MAXALIGN(TOAST_POINTER_SIZE); + * if not, no benefit is to be expected by compressing it. + * + * The return value is the index of the biggest suitable column, or + * -1 if there is none. + */ +int +toast_tuple_find_biggest_attribute(ToastTupleContext *ttc, + bool for_compression, bool check_main) +{ + TupleDesc tupleDesc = ttc->ttc_rel->rd_att; + int numAttrs = tupleDesc->natts; + int biggest_attno = -1; + int32 biggest_size = MAXALIGN(TOAST_POINTER_SIZE); + int32 skip_colflags = TOASTCOL_IGNORE; + int i; + + if (for_compression) + skip_colflags |= TOASTCOL_INCOMPRESSIBLE; + + for (i = 0; i < numAttrs; i++) + { + Form_pg_attribute att = TupleDescAttr(tupleDesc, i); + + if ((ttc->ttc_attr[i].tai_colflags & skip_colflags) != 0) + continue; + if (VARATT_IS_EXTERNAL(DatumGetPointer(ttc->ttc_values[i]))) + continue; /* can't happen, toast_action would be PLAIN */ + if (for_compression && + VARATT_IS_COMPRESSED(DatumGetPointer(ttc->ttc_values[i]))) + continue; + if (check_main && att->attstorage != TYPSTORAGE_MAIN) + continue; + if (!check_main && att->attstorage != TYPSTORAGE_EXTENDED && + att->attstorage != TYPSTORAGE_EXTERNAL) + continue; + + if (ttc->ttc_attr[i].tai_size > biggest_size) + { + biggest_attno = i; + biggest_size = ttc->ttc_attr[i].tai_size; + } + } + + return biggest_attno; +} + +/* + * Try compression for an attribute. + * + * If we find that the attribute is not compressible, mark it so. + */ +void +toast_tuple_try_compression(ToastTupleContext *ttc, int attribute) +{ + Datum *value = &ttc->ttc_values[attribute]; + Datum new_value; + ToastAttrInfo *attr = &ttc->ttc_attr[attribute]; + + new_value = toast_compress_datum(*value, attr->tai_compression); + + if (DatumGetPointer(new_value) != NULL) + { + /* successful compression */ + if ((attr->tai_colflags & TOASTCOL_NEEDS_FREE) != 0) + pfree(DatumGetPointer(*value)); + *value = new_value; + attr->tai_colflags |= TOASTCOL_NEEDS_FREE; + attr->tai_size = VARSIZE(DatumGetPointer(*value)); + ttc->ttc_flags |= (TOAST_NEEDS_CHANGE | TOAST_NEEDS_FREE); + } + else + { + /* incompressible, ignore on subsequent compression passes */ + attr->tai_colflags |= TOASTCOL_INCOMPRESSIBLE; + } +} + +/* + * Move an attribute to external storage. + */ +void +toast_tuple_externalize(ToastTupleContext *ttc, int attribute, int options) +{ + Datum *value = &ttc->ttc_values[attribute]; + Datum old_value = *value; + ToastAttrInfo *attr = &ttc->ttc_attr[attribute]; + + attr->tai_colflags |= TOASTCOL_IGNORE; + *value = toast_save_datum(ttc->ttc_rel, old_value, attr->tai_oldexternal, + options); + if ((attr->tai_colflags & TOASTCOL_NEEDS_FREE) != 0) + pfree(DatumGetPointer(old_value)); + attr->tai_colflags |= TOASTCOL_NEEDS_FREE; + ttc->ttc_flags |= (TOAST_NEEDS_CHANGE | TOAST_NEEDS_FREE); +} + +/* + * Perform appropriate cleanup after one tuple has been subjected to TOAST. + */ +void +toast_tuple_cleanup(ToastTupleContext *ttc) +{ + TupleDesc tupleDesc = ttc->ttc_rel->rd_att; + int numAttrs = tupleDesc->natts; + + /* + * Free allocated temp values + */ + if ((ttc->ttc_flags & TOAST_NEEDS_FREE) != 0) + { + int i; + + for (i = 0; i < numAttrs; i++) + { + ToastAttrInfo *attr = &ttc->ttc_attr[i]; + + if ((attr->tai_colflags & TOASTCOL_NEEDS_FREE) != 0) + pfree(DatumGetPointer(ttc->ttc_values[i])); + } + } + + /* + * Delete external values from the old tuple + */ + if ((ttc->ttc_flags & TOAST_NEEDS_DELETE_OLD) != 0) + { + int i; + + for (i = 0; i < numAttrs; i++) + { + ToastAttrInfo *attr = &ttc->ttc_attr[i]; + + if ((attr->tai_colflags & TOASTCOL_NEEDS_DELETE_OLD) != 0) + toast_delete_datum(ttc->ttc_rel, ttc->ttc_oldvalues[i], false); + } + } +} + +/* + * Check for external stored attributes and delete them from the secondary + * relation. + */ +void +toast_delete_external(Relation rel, Datum *values, bool *isnull, + bool is_speculative) +{ + TupleDesc tupleDesc = rel->rd_att; + int numAttrs = tupleDesc->natts; + int i; + + for (i = 0; i < numAttrs; i++) + { + if (TupleDescAttr(tupleDesc, i)->attlen == -1) + { + Datum value = values[i]; + + if (isnull[i]) + continue; + else if (VARATT_IS_EXTERNAL_ONDISK(PointerGetDatum(value))) + toast_delete_datum(rel, value, is_speculative); + } + } +} diff --git a/src/backend/access/tablesample/Makefile b/src/backend/access/tablesample/Makefile new file mode 100644 index 0000000..01641e5 --- /dev/null +++ b/src/backend/access/tablesample/Makefile @@ -0,0 +1,20 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/tablesample +# +# IDENTIFICATION +# src/backend/access/tablesample/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/tablesample +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + bernoulli.o \ + system.o \ + tablesample.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/tablesample/bernoulli.c b/src/backend/access/tablesample/bernoulli.c new file mode 100644 index 0000000..ae6e4f5 --- /dev/null +++ b/src/backend/access/tablesample/bernoulli.c @@ -0,0 +1,229 @@ +/*------------------------------------------------------------------------- + * + * bernoulli.c + * support routines for BERNOULLI tablesample method + * + * To ensure repeatability of samples, it is necessary that selection of a + * given tuple be history-independent; otherwise syncscanning would break + * repeatability, to say nothing of logically-irrelevant maintenance such + * as physical extension or shortening of the relation. + * + * To achieve that, we proceed by hashing each candidate TID together with + * the active seed, and then selecting it if the hash is less than the + * cutoff value computed from the selection probability by BeginSampleScan. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/tablesample/bernoulli.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <math.h> + +#include "access/tsmapi.h" +#include "catalog/pg_type.h" +#include "common/hashfn.h" +#include "optimizer/optimizer.h" +#include "utils/builtins.h" + + +/* Private state */ +typedef struct +{ + uint64 cutoff; /* select tuples with hash less than this */ + uint32 seed; /* random seed */ + OffsetNumber lt; /* last tuple returned from current block */ +} BernoulliSamplerData; + + +static void bernoulli_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples); +static void bernoulli_initsamplescan(SampleScanState *node, + int eflags); +static void bernoulli_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed); +static OffsetNumber bernoulli_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset); + + +/* + * Create a TsmRoutine descriptor for the BERNOULLI method. + */ +Datum +tsm_bernoulli_handler(PG_FUNCTION_ARGS) +{ + TsmRoutine *tsm = makeNode(TsmRoutine); + + tsm->parameterTypes = list_make1_oid(FLOAT4OID); + tsm->repeatable_across_queries = true; + tsm->repeatable_across_scans = true; + tsm->SampleScanGetSampleSize = bernoulli_samplescangetsamplesize; + tsm->InitSampleScan = bernoulli_initsamplescan; + tsm->BeginSampleScan = bernoulli_beginsamplescan; + tsm->NextSampleBlock = NULL; + tsm->NextSampleTuple = bernoulli_nextsampletuple; + tsm->EndSampleScan = NULL; + + PG_RETURN_POINTER(tsm); +} + +/* + * Sample size estimation. + */ +static void +bernoulli_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples) +{ + Node *pctnode; + float4 samplefract; + + /* Try to extract an estimate for the sample percentage */ + pctnode = (Node *) linitial(paramexprs); + pctnode = estimate_expression_value(root, pctnode); + + if (IsA(pctnode, Const) && + !((Const *) pctnode)->constisnull) + { + samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue); + if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract)) + samplefract /= 100.0f; + else + { + /* Default samplefract if the value is bogus */ + samplefract = 0.1f; + } + } + else + { + /* Default samplefract if we didn't obtain a non-null Const */ + samplefract = 0.1f; + } + + /* We'll visit all pages of the baserel */ + *pages = baserel->pages; + + *tuples = clamp_row_est(baserel->tuples * samplefract); +} + +/* + * Initialize during executor setup. + */ +static void +bernoulli_initsamplescan(SampleScanState *node, int eflags) +{ + node->tsm_state = palloc0(sizeof(BernoulliSamplerData)); +} + +/* + * Examine parameters and prepare for a sample scan. + */ +static void +bernoulli_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed) +{ + BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state; + double percent = DatumGetFloat4(params[0]); + double dcutoff; + + if (percent < 0 || percent > 100 || isnan(percent)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), + errmsg("sample percentage must be between 0 and 100"))); + + /* + * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to + * store that as a uint64, of course. Note that this gives strictly + * correct behavior at the limits of zero or one probability. + */ + dcutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100); + sampler->cutoff = (uint64) dcutoff; + sampler->seed = seed; + sampler->lt = InvalidOffsetNumber; + + /* + * Use bulkread, since we're scanning all pages. But pagemode visibility + * checking is a win only at larger sampling fractions. The 25% cutoff + * here is based on very limited experimentation. + */ + node->use_bulkread = true; + node->use_pagemode = (percent >= 25); +} + +/* + * Select next sampled tuple in current block. + * + * It is OK here to return an offset without knowing if the tuple is visible + * (or even exists). The reason is that we do the coinflip for every tuple + * offset in the table. Since all tuples have the same probability of being + * returned, it doesn't matter if we do extra coinflips for invisible tuples. + * + * When we reach end of the block, return InvalidOffsetNumber which tells + * SampleScan to go to next block. + */ +static OffsetNumber +bernoulli_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset) +{ + BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state; + OffsetNumber tupoffset = sampler->lt; + uint32 hashinput[3]; + + /* Advance to first/next tuple in block */ + if (tupoffset == InvalidOffsetNumber) + tupoffset = FirstOffsetNumber; + else + tupoffset++; + + /* + * We compute the hash by applying hash_any to an array of 3 uint32's + * containing the block, offset, and seed. This is efficient to set up, + * and with the current implementation of hash_any, it gives + * machine-independent results, which is a nice property for regression + * testing. + * + * These words in the hash input are the same throughout the block: + */ + hashinput[0] = blockno; + hashinput[2] = sampler->seed; + + /* + * Loop over tuple offsets until finding suitable TID or reaching end of + * block. + */ + for (; tupoffset <= maxoffset; tupoffset++) + { + uint32 hash; + + hashinput[1] = tupoffset; + + hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput, + (int) sizeof(hashinput))); + if (hash < sampler->cutoff) + break; + } + + if (tupoffset > maxoffset) + tupoffset = InvalidOffsetNumber; + + sampler->lt = tupoffset; + + return tupoffset; +} diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c new file mode 100644 index 0000000..b0869e5 --- /dev/null +++ b/src/backend/access/tablesample/system.c @@ -0,0 +1,257 @@ +/*------------------------------------------------------------------------- + * + * system.c + * support routines for SYSTEM tablesample method + * + * To ensure repeatability of samples, it is necessary that selection of a + * given tuple be history-independent; otherwise syncscanning would break + * repeatability, to say nothing of logically-irrelevant maintenance such + * as physical extension or shortening of the relation. + * + * To achieve that, we proceed by hashing each candidate block number together + * with the active seed, and then selecting it if the hash is less than the + * cutoff value computed from the selection probability by BeginSampleScan. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/tablesample/system.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <math.h> + +#include "access/relscan.h" +#include "access/tsmapi.h" +#include "catalog/pg_type.h" +#include "common/hashfn.h" +#include "optimizer/optimizer.h" +#include "utils/builtins.h" + + +/* Private state */ +typedef struct +{ + uint64 cutoff; /* select blocks with hash less than this */ + uint32 seed; /* random seed */ + BlockNumber nextblock; /* next block to consider sampling */ + OffsetNumber lt; /* last tuple returned from current block */ +} SystemSamplerData; + + +static void system_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples); +static void system_initsamplescan(SampleScanState *node, + int eflags); +static void system_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed); +static BlockNumber system_nextsampleblock(SampleScanState *node, BlockNumber nblocks); +static OffsetNumber system_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset); + + +/* + * Create a TsmRoutine descriptor for the SYSTEM method. + */ +Datum +tsm_system_handler(PG_FUNCTION_ARGS) +{ + TsmRoutine *tsm = makeNode(TsmRoutine); + + tsm->parameterTypes = list_make1_oid(FLOAT4OID); + tsm->repeatable_across_queries = true; + tsm->repeatable_across_scans = true; + tsm->SampleScanGetSampleSize = system_samplescangetsamplesize; + tsm->InitSampleScan = system_initsamplescan; + tsm->BeginSampleScan = system_beginsamplescan; + tsm->NextSampleBlock = system_nextsampleblock; + tsm->NextSampleTuple = system_nextsampletuple; + tsm->EndSampleScan = NULL; + + PG_RETURN_POINTER(tsm); +} + +/* + * Sample size estimation. + */ +static void +system_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples) +{ + Node *pctnode; + float4 samplefract; + + /* Try to extract an estimate for the sample percentage */ + pctnode = (Node *) linitial(paramexprs); + pctnode = estimate_expression_value(root, pctnode); + + if (IsA(pctnode, Const) && + !((Const *) pctnode)->constisnull) + { + samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue); + if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract)) + samplefract /= 100.0f; + else + { + /* Default samplefract if the value is bogus */ + samplefract = 0.1f; + } + } + else + { + /* Default samplefract if we didn't obtain a non-null Const */ + samplefract = 0.1f; + } + + /* We'll visit a sample of the pages ... */ + *pages = clamp_row_est(baserel->pages * samplefract); + + /* ... and hopefully get a representative number of tuples from them */ + *tuples = clamp_row_est(baserel->tuples * samplefract); +} + +/* + * Initialize during executor setup. + */ +static void +system_initsamplescan(SampleScanState *node, int eflags) +{ + node->tsm_state = palloc0(sizeof(SystemSamplerData)); +} + +/* + * Examine parameters and prepare for a sample scan. + */ +static void +system_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed) +{ + SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; + double percent = DatumGetFloat4(params[0]); + double dcutoff; + + if (percent < 0 || percent > 100 || isnan(percent)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), + errmsg("sample percentage must be between 0 and 100"))); + + /* + * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to + * store that as a uint64, of course. Note that this gives strictly + * correct behavior at the limits of zero or one probability. + */ + dcutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100); + sampler->cutoff = (uint64) dcutoff; + sampler->seed = seed; + sampler->nextblock = 0; + sampler->lt = InvalidOffsetNumber; + + /* + * Bulkread buffer access strategy probably makes sense unless we're + * scanning a very small fraction of the table. The 1% cutoff here is a + * guess. We should use pagemode visibility checking, since we scan all + * tuples on each selected page. + */ + node->use_bulkread = (percent >= 1); + node->use_pagemode = true; +} + +/* + * Select next block to sample. + */ +static BlockNumber +system_nextsampleblock(SampleScanState *node, BlockNumber nblocks) +{ + SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; + BlockNumber nextblock = sampler->nextblock; + uint32 hashinput[2]; + + /* + * We compute the hash by applying hash_any to an array of 2 uint32's + * containing the block number and seed. This is efficient to set up, and + * with the current implementation of hash_any, it gives + * machine-independent results, which is a nice property for regression + * testing. + * + * These words in the hash input are the same throughout the block: + */ + hashinput[1] = sampler->seed; + + /* + * Loop over block numbers until finding suitable block or reaching end of + * relation. + */ + for (; nextblock < nblocks; nextblock++) + { + uint32 hash; + + hashinput[0] = nextblock; + + hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput, + (int) sizeof(hashinput))); + if (hash < sampler->cutoff) + break; + } + + if (nextblock < nblocks) + { + /* Found a suitable block; remember where we should start next time */ + sampler->nextblock = nextblock + 1; + return nextblock; + } + + /* Done, but let's reset nextblock to 0 for safety. */ + sampler->nextblock = 0; + return InvalidBlockNumber; +} + +/* + * Select next sampled tuple in current block. + * + * In block sampling, we just want to sample all the tuples in each selected + * block. + * + * It is OK here to return an offset without knowing if the tuple is visible + * (or even exists); nodeSamplescan.c will deal with that. + * + * When we reach end of the block, return InvalidOffsetNumber which tells + * SampleScan to go to next block. + */ +static OffsetNumber +system_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset) +{ + SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; + OffsetNumber tupoffset = sampler->lt; + + /* Advance to next possible offset on page */ + if (tupoffset == InvalidOffsetNumber) + tupoffset = FirstOffsetNumber; + else + tupoffset++; + + /* Done? */ + if (tupoffset > maxoffset) + tupoffset = InvalidOffsetNumber; + + sampler->lt = tupoffset; + + return tupoffset; +} diff --git a/src/backend/access/tablesample/tablesample.c b/src/backend/access/tablesample/tablesample.c new file mode 100644 index 0000000..02f2a95 --- /dev/null +++ b/src/backend/access/tablesample/tablesample.c @@ -0,0 +1,40 @@ +/*------------------------------------------------------------------------- + * + * tablesample.c + * Support functions for TABLESAMPLE feature + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/tablesample/tablesample.c + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/tsmapi.h" + + +/* + * GetTsmRoutine --- get a TsmRoutine struct by invoking the handler. + * + * This is a convenience routine that's just meant to check for errors. + */ +TsmRoutine * +GetTsmRoutine(Oid tsmhandler) +{ + Datum datum; + TsmRoutine *routine; + + datum = OidFunctionCall1(tsmhandler, PointerGetDatum(NULL)); + routine = (TsmRoutine *) DatumGetPointer(datum); + + if (routine == NULL || !IsA(routine, TsmRoutine)) + elog(ERROR, "tablesample handler function %u did not return a TsmRoutine struct", + tsmhandler); + + return routine; +} |