summaryrefslogtreecommitdiffstats
path: root/src/backend/commands/vacuum.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/commands/vacuum.c')
-rw-r--r--src/backend/commands/vacuum.c2117
1 files changed, 2117 insertions, 0 deletions
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
new file mode 100644
index 0000000..5ef6698
--- /dev/null
+++ b/src/backend/commands/vacuum.c
@@ -0,0 +1,2117 @@
+/*-------------------------------------------------------------------------
+ *
+ * vacuum.c
+ * The postgres vacuum cleaner.
+ *
+ * This file now includes only control and dispatch code for VACUUM and
+ * ANALYZE commands. Regular VACUUM is implemented in vacuumlazy.c,
+ * ANALYZE in analyze.c, and VACUUM FULL is a variant of CLUSTER, handled
+ * in cluster.c.
+ *
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/commands/vacuum.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "access/clog.h"
+#include "access/commit_ts.h"
+#include "access/genam.h"
+#include "access/heapam.h"
+#include "access/htup_details.h"
+#include "access/multixact.h"
+#include "access/tableam.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "catalog/namespace.h"
+#include "catalog/pg_database.h"
+#include "catalog/pg_inherits.h"
+#include "catalog/pg_namespace.h"
+#include "commands/cluster.h"
+#include "commands/defrem.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "nodes/makefuncs.h"
+#include "pgstat.h"
+#include "postmaster/autovacuum.h"
+#include "postmaster/bgworker_internals.h"
+#include "storage/bufmgr.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "utils/acl.h"
+#include "utils/fmgroids.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+
+
+/*
+ * GUC parameters
+ */
+int vacuum_freeze_min_age;
+int vacuum_freeze_table_age;
+int vacuum_multixact_freeze_min_age;
+int vacuum_multixact_freeze_table_age;
+
+
+/* A few variables that don't seem worth passing around as parameters */
+static MemoryContext vac_context = NULL;
+static BufferAccessStrategy vac_strategy;
+
+
+/*
+ * Variables for cost-based parallel vacuum. See comments atop
+ * compute_parallel_delay to understand how it works.
+ */
+pg_atomic_uint32 *VacuumSharedCostBalance = NULL;
+pg_atomic_uint32 *VacuumActiveNWorkers = NULL;
+int VacuumCostBalanceLocal = 0;
+
+/* non-export function prototypes */
+static List *expand_vacuum_rel(VacuumRelation *vrel, int options);
+static List *get_all_vacuum_rels(int options);
+static void vac_truncate_clog(TransactionId frozenXID,
+ MultiXactId minMulti,
+ TransactionId lastSaneFrozenXid,
+ MultiXactId lastSaneMinMulti);
+static bool vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params);
+static double compute_parallel_delay(void);
+static VacOptTernaryValue get_vacopt_ternary_value(DefElem *def);
+
+/*
+ * Primary entry point for manual VACUUM and ANALYZE commands
+ *
+ * This is mainly a preparation wrapper for the real operations that will
+ * happen in vacuum().
+ */
+void
+ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel)
+{
+ VacuumParams params;
+ bool verbose = false;
+ bool skip_locked = false;
+ bool analyze = false;
+ bool freeze = false;
+ bool full = false;
+ bool disable_page_skipping = false;
+ ListCell *lc;
+
+ /* Set default value */
+ params.index_cleanup = VACOPT_TERNARY_DEFAULT;
+ params.truncate = VACOPT_TERNARY_DEFAULT;
+
+ /* By default parallel vacuum is enabled */
+ params.nworkers = 0;
+
+ /* Parse options list */
+ foreach(lc, vacstmt->options)
+ {
+ DefElem *opt = (DefElem *) lfirst(lc);
+
+ /* Parse common options for VACUUM and ANALYZE */
+ if (strcmp(opt->defname, "verbose") == 0)
+ verbose = defGetBoolean(opt);
+ else if (strcmp(opt->defname, "skip_locked") == 0)
+ skip_locked = defGetBoolean(opt);
+ else if (!vacstmt->is_vacuumcmd)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("unrecognized ANALYZE option \"%s\"", opt->defname),
+ parser_errposition(pstate, opt->location)));
+
+ /* Parse options available on VACUUM */
+ else if (strcmp(opt->defname, "analyze") == 0)
+ analyze = defGetBoolean(opt);
+ else if (strcmp(opt->defname, "freeze") == 0)
+ freeze = defGetBoolean(opt);
+ else if (strcmp(opt->defname, "full") == 0)
+ full = defGetBoolean(opt);
+ else if (strcmp(opt->defname, "disable_page_skipping") == 0)
+ disable_page_skipping = defGetBoolean(opt);
+ else if (strcmp(opt->defname, "index_cleanup") == 0)
+ params.index_cleanup = get_vacopt_ternary_value(opt);
+ else if (strcmp(opt->defname, "truncate") == 0)
+ params.truncate = get_vacopt_ternary_value(opt);
+ else if (strcmp(opt->defname, "parallel") == 0)
+ {
+ if (opt->arg == NULL)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("parallel option requires a value between 0 and %d",
+ MAX_PARALLEL_WORKER_LIMIT),
+ parser_errposition(pstate, opt->location)));
+ }
+ else
+ {
+ int nworkers;
+
+ nworkers = defGetInt32(opt);
+ if (nworkers < 0 || nworkers > MAX_PARALLEL_WORKER_LIMIT)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("parallel workers for vacuum must be between 0 and %d",
+ MAX_PARALLEL_WORKER_LIMIT),
+ parser_errposition(pstate, opt->location)));
+
+ /*
+ * Disable parallel vacuum, if user has specified parallel
+ * degree as zero.
+ */
+ if (nworkers == 0)
+ params.nworkers = -1;
+ else
+ params.nworkers = nworkers;
+ }
+ }
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("unrecognized VACUUM option \"%s\"", opt->defname),
+ parser_errposition(pstate, opt->location)));
+ }
+
+ /* Set vacuum options */
+ params.options =
+ (vacstmt->is_vacuumcmd ? VACOPT_VACUUM : VACOPT_ANALYZE) |
+ (verbose ? VACOPT_VERBOSE : 0) |
+ (skip_locked ? VACOPT_SKIP_LOCKED : 0) |
+ (analyze ? VACOPT_ANALYZE : 0) |
+ (freeze ? VACOPT_FREEZE : 0) |
+ (full ? VACOPT_FULL : 0) |
+ (disable_page_skipping ? VACOPT_DISABLE_PAGE_SKIPPING : 0);
+
+ /* sanity checks on options */
+ Assert(params.options & (VACOPT_VACUUM | VACOPT_ANALYZE));
+ Assert((params.options & VACOPT_VACUUM) ||
+ !(params.options & (VACOPT_FULL | VACOPT_FREEZE)));
+ Assert(!(params.options & VACOPT_SKIPTOAST));
+
+ if ((params.options & VACOPT_FULL) && params.nworkers > 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("VACUUM FULL cannot be performed in parallel")));
+
+ /*
+ * Make sure VACOPT_ANALYZE is specified if any column lists are present.
+ */
+ if (!(params.options & VACOPT_ANALYZE))
+ {
+ ListCell *lc;
+
+ foreach(lc, vacstmt->rels)
+ {
+ VacuumRelation *vrel = lfirst_node(VacuumRelation, lc);
+
+ if (vrel->va_cols != NIL)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("ANALYZE option must be specified when a column list is provided")));
+ }
+ }
+
+ /*
+ * All freeze ages are zero if the FREEZE option is given; otherwise pass
+ * them as -1 which means to use the default values.
+ */
+ if (params.options & VACOPT_FREEZE)
+ {
+ params.freeze_min_age = 0;
+ params.freeze_table_age = 0;
+ params.multixact_freeze_min_age = 0;
+ params.multixact_freeze_table_age = 0;
+ }
+ else
+ {
+ params.freeze_min_age = -1;
+ params.freeze_table_age = -1;
+ params.multixact_freeze_min_age = -1;
+ params.multixact_freeze_table_age = -1;
+ }
+
+ /* user-invoked vacuum is never "for wraparound" */
+ params.is_wraparound = false;
+
+ /* user-invoked vacuum never uses this parameter */
+ params.log_min_duration = -1;
+
+ /* Now go through the common routine */
+ vacuum(vacstmt->rels, &params, NULL, isTopLevel);
+}
+
+/*
+ * Internal entry point for VACUUM and ANALYZE commands.
+ *
+ * relations, if not NIL, is a list of VacuumRelation to process; otherwise,
+ * we process all relevant tables in the database. For each VacuumRelation,
+ * if a valid OID is supplied, the table with that OID is what to process;
+ * otherwise, the VacuumRelation's RangeVar indicates what to process.
+ *
+ * params contains a set of parameters that can be used to customize the
+ * behavior.
+ *
+ * bstrategy is normally given as NULL, but in autovacuum it can be passed
+ * in to use the same buffer strategy object across multiple vacuum() calls.
+ *
+ * isTopLevel should be passed down from ProcessUtility.
+ *
+ * It is the caller's responsibility that all parameters are allocated in a
+ * memory context that will not disappear at transaction commit.
+ */
+void
+vacuum(List *relations, VacuumParams *params,
+ BufferAccessStrategy bstrategy, bool isTopLevel)
+{
+ static bool in_vacuum = false;
+
+ const char *stmttype;
+ volatile bool in_outer_xact,
+ use_own_xacts;
+
+ Assert(params != NULL);
+
+ stmttype = (params->options & VACOPT_VACUUM) ? "VACUUM" : "ANALYZE";
+
+ /*
+ * We cannot run VACUUM inside a user transaction block; if we were inside
+ * a transaction, then our commit- and start-transaction-command calls
+ * would not have the intended effect! There are numerous other subtle
+ * dependencies on this, too.
+ *
+ * ANALYZE (without VACUUM) can run either way.
+ */
+ if (params->options & VACOPT_VACUUM)
+ {
+ PreventInTransactionBlock(isTopLevel, stmttype);
+ in_outer_xact = false;
+ }
+ else
+ in_outer_xact = IsInTransactionBlock(isTopLevel);
+
+ /*
+ * Due to static variables vac_context, anl_context and vac_strategy,
+ * vacuum() is not reentrant. This matters when VACUUM FULL or ANALYZE
+ * calls a hostile index expression that itself calls ANALYZE.
+ */
+ if (in_vacuum)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("%s cannot be executed from VACUUM or ANALYZE",
+ stmttype)));
+
+ /*
+ * Sanity check DISABLE_PAGE_SKIPPING option.
+ */
+ if ((params->options & VACOPT_FULL) != 0 &&
+ (params->options & VACOPT_DISABLE_PAGE_SKIPPING) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("VACUUM option DISABLE_PAGE_SKIPPING cannot be used with FULL")));
+
+ /*
+ * Send info about dead objects to the statistics collector, unless we are
+ * in autovacuum --- autovacuum.c does this for itself.
+ */
+ if ((params->options & VACOPT_VACUUM) && !IsAutoVacuumWorkerProcess())
+ pgstat_vacuum_stat();
+
+ /*
+ * Create special memory context for cross-transaction storage.
+ *
+ * Since it is a child of PortalContext, it will go away eventually even
+ * if we suffer an error; there's no need for special abort cleanup logic.
+ */
+ vac_context = AllocSetContextCreate(PortalContext,
+ "Vacuum",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * If caller didn't give us a buffer strategy object, make one in the
+ * cross-transaction memory context.
+ */
+ if (bstrategy == NULL)
+ {
+ MemoryContext old_context = MemoryContextSwitchTo(vac_context);
+
+ bstrategy = GetAccessStrategy(BAS_VACUUM);
+ MemoryContextSwitchTo(old_context);
+ }
+ vac_strategy = bstrategy;
+
+ /*
+ * Build list of relation(s) to process, putting any new data in
+ * vac_context for safekeeping.
+ */
+ if (relations != NIL)
+ {
+ List *newrels = NIL;
+ ListCell *lc;
+
+ foreach(lc, relations)
+ {
+ VacuumRelation *vrel = lfirst_node(VacuumRelation, lc);
+ List *sublist;
+ MemoryContext old_context;
+
+ sublist = expand_vacuum_rel(vrel, params->options);
+ old_context = MemoryContextSwitchTo(vac_context);
+ newrels = list_concat(newrels, sublist);
+ MemoryContextSwitchTo(old_context);
+ }
+ relations = newrels;
+ }
+ else
+ relations = get_all_vacuum_rels(params->options);
+
+ /*
+ * Decide whether we need to start/commit our own transactions.
+ *
+ * For VACUUM (with or without ANALYZE): always do so, so that we can
+ * release locks as soon as possible. (We could possibly use the outer
+ * transaction for a one-table VACUUM, but handling TOAST tables would be
+ * problematic.)
+ *
+ * For ANALYZE (no VACUUM): if inside a transaction block, we cannot
+ * start/commit our own transactions. Also, there's no need to do so if
+ * only processing one relation. For multiple relations when not within a
+ * transaction block, and also in an autovacuum worker, use own
+ * transactions so we can release locks sooner.
+ */
+ if (params->options & VACOPT_VACUUM)
+ use_own_xacts = true;
+ else
+ {
+ Assert(params->options & VACOPT_ANALYZE);
+ if (IsAutoVacuumWorkerProcess())
+ use_own_xacts = true;
+ else if (in_outer_xact)
+ use_own_xacts = false;
+ else if (list_length(relations) > 1)
+ use_own_xacts = true;
+ else
+ use_own_xacts = false;
+ }
+
+ /*
+ * vacuum_rel expects to be entered with no transaction active; it will
+ * start and commit its own transaction. But we are called by an SQL
+ * command, and so we are executing inside a transaction already. We
+ * commit the transaction started in PostgresMain() here, and start
+ * another one before exiting to match the commit waiting for us back in
+ * PostgresMain().
+ */
+ if (use_own_xacts)
+ {
+ Assert(!in_outer_xact);
+
+ /* ActiveSnapshot is not set by autovacuum */
+ if (ActiveSnapshotSet())
+ PopActiveSnapshot();
+
+ /* matches the StartTransaction in PostgresMain() */
+ CommitTransactionCommand();
+ }
+
+ /* Turn vacuum cost accounting on or off, and set/clear in_vacuum */
+ PG_TRY();
+ {
+ ListCell *cur;
+
+ in_vacuum = true;
+ VacuumCostActive = (VacuumCostDelay > 0);
+ VacuumCostBalance = 0;
+ VacuumPageHit = 0;
+ VacuumPageMiss = 0;
+ VacuumPageDirty = 0;
+ VacuumCostBalanceLocal = 0;
+ VacuumSharedCostBalance = NULL;
+ VacuumActiveNWorkers = NULL;
+
+ /*
+ * Loop to process each selected relation.
+ */
+ foreach(cur, relations)
+ {
+ VacuumRelation *vrel = lfirst_node(VacuumRelation, cur);
+
+ if (params->options & VACOPT_VACUUM)
+ {
+ if (!vacuum_rel(vrel->oid, vrel->relation, params))
+ continue;
+ }
+
+ if (params->options & VACOPT_ANALYZE)
+ {
+ /*
+ * If using separate xacts, start one for analyze. Otherwise,
+ * we can use the outer transaction.
+ */
+ if (use_own_xacts)
+ {
+ StartTransactionCommand();
+ /* functions in indexes may want a snapshot set */
+ PushActiveSnapshot(GetTransactionSnapshot());
+ }
+
+ analyze_rel(vrel->oid, vrel->relation, params,
+ vrel->va_cols, in_outer_xact, vac_strategy);
+
+ if (use_own_xacts)
+ {
+ PopActiveSnapshot();
+ CommitTransactionCommand();
+ }
+ else
+ {
+ /*
+ * If we're not using separate xacts, better separate the
+ * ANALYZE actions with CCIs. This avoids trouble if user
+ * says "ANALYZE t, t".
+ */
+ CommandCounterIncrement();
+ }
+ }
+ }
+ }
+ PG_FINALLY();
+ {
+ in_vacuum = false;
+ VacuumCostActive = false;
+ }
+ PG_END_TRY();
+
+ /*
+ * Finish up processing.
+ */
+ if (use_own_xacts)
+ {
+ /* here, we are not in a transaction */
+
+ /*
+ * This matches the CommitTransaction waiting for us in
+ * PostgresMain().
+ */
+ StartTransactionCommand();
+ }
+
+ if ((params->options & VACOPT_VACUUM) && !IsAutoVacuumWorkerProcess())
+ {
+ /*
+ * Update pg_database.datfrozenxid, and truncate pg_xact if possible.
+ * (autovacuum.c does this for itself.)
+ */
+ vac_update_datfrozenxid();
+ }
+
+ /*
+ * Clean up working storage --- note we must do this after
+ * StartTransactionCommand, else we might be trying to delete the active
+ * context!
+ */
+ MemoryContextDelete(vac_context);
+ vac_context = NULL;
+}
+
+/*
+ * Check if a given relation can be safely vacuumed or analyzed. If the
+ * user is not the relation owner, issue a WARNING log message and return
+ * false to let the caller decide what to do with this relation. This
+ * routine is used to decide if a relation can be processed for VACUUM or
+ * ANALYZE.
+ */
+bool
+vacuum_is_relation_owner(Oid relid, Form_pg_class reltuple, int options)
+{
+ char *relname;
+
+ Assert((options & (VACOPT_VACUUM | VACOPT_ANALYZE)) != 0);
+
+ /*
+ * Check permissions.
+ *
+ * We allow the user to vacuum or analyze a table if he is superuser, the
+ * table owner, or the database owner (but in the latter case, only if
+ * it's not a shared relation). pg_class_ownercheck includes the
+ * superuser case.
+ *
+ * Note we choose to treat permissions failure as a WARNING and keep
+ * trying to vacuum or analyze the rest of the DB --- is this appropriate?
+ */
+ if (pg_class_ownercheck(relid, GetUserId()) ||
+ (pg_database_ownercheck(MyDatabaseId, GetUserId()) && !reltuple->relisshared))
+ return true;
+
+ relname = NameStr(reltuple->relname);
+
+ if ((options & VACOPT_VACUUM) != 0)
+ {
+ if (reltuple->relisshared)
+ ereport(WARNING,
+ (errmsg("skipping \"%s\" --- only superuser can vacuum it",
+ relname)));
+ else if (reltuple->relnamespace == PG_CATALOG_NAMESPACE)
+ ereport(WARNING,
+ (errmsg("skipping \"%s\" --- only superuser or database owner can vacuum it",
+ relname)));
+ else
+ ereport(WARNING,
+ (errmsg("skipping \"%s\" --- only table or database owner can vacuum it",
+ relname)));
+
+ /*
+ * For VACUUM ANALYZE, both logs could show up, but just generate
+ * information for VACUUM as that would be the first one to be
+ * processed.
+ */
+ return false;
+ }
+
+ if ((options & VACOPT_ANALYZE) != 0)
+ {
+ if (reltuple->relisshared)
+ ereport(WARNING,
+ (errmsg("skipping \"%s\" --- only superuser can analyze it",
+ relname)));
+ else if (reltuple->relnamespace == PG_CATALOG_NAMESPACE)
+ ereport(WARNING,
+ (errmsg("skipping \"%s\" --- only superuser or database owner can analyze it",
+ relname)));
+ else
+ ereport(WARNING,
+ (errmsg("skipping \"%s\" --- only table or database owner can analyze it",
+ relname)));
+ }
+
+ return false;
+}
+
+
+/*
+ * vacuum_open_relation
+ *
+ * This routine is used for attempting to open and lock a relation which
+ * is going to be vacuumed or analyzed. If the relation cannot be opened
+ * or locked, a log is emitted if possible.
+ */
+Relation
+vacuum_open_relation(Oid relid, RangeVar *relation, int options,
+ bool verbose, LOCKMODE lmode)
+{
+ Relation onerel;
+ bool rel_lock = true;
+ int elevel;
+
+ Assert((options & (VACOPT_VACUUM | VACOPT_ANALYZE)) != 0);
+
+ /*
+ * Open the relation and get the appropriate lock on it.
+ *
+ * There's a race condition here: the relation may have gone away since
+ * the last time we saw it. If so, we don't need to vacuum or analyze it.
+ *
+ * If we've been asked not to wait for the relation lock, acquire it first
+ * in non-blocking mode, before calling try_relation_open().
+ */
+ if (!(options & VACOPT_SKIP_LOCKED))
+ onerel = try_relation_open(relid, lmode);
+ else if (ConditionalLockRelationOid(relid, lmode))
+ onerel = try_relation_open(relid, NoLock);
+ else
+ {
+ onerel = NULL;
+ rel_lock = false;
+ }
+
+ /* if relation is opened, leave */
+ if (onerel)
+ return onerel;
+
+ /*
+ * Relation could not be opened, hence generate if possible a log
+ * informing on the situation.
+ *
+ * If the RangeVar is not defined, we do not have enough information to
+ * provide a meaningful log statement. Chances are that the caller has
+ * intentionally not provided this information so that this logging is
+ * skipped, anyway.
+ */
+ if (relation == NULL)
+ return NULL;
+
+ /*
+ * Determine the log level.
+ *
+ * For manual VACUUM or ANALYZE, we emit a WARNING to match the log
+ * statements in the permission checks; otherwise, only log if the caller
+ * so requested.
+ */
+ if (!IsAutoVacuumWorkerProcess())
+ elevel = WARNING;
+ else if (verbose)
+ elevel = LOG;
+ else
+ return NULL;
+
+ if ((options & VACOPT_VACUUM) != 0)
+ {
+ if (!rel_lock)
+ ereport(elevel,
+ (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
+ errmsg("skipping vacuum of \"%s\" --- lock not available",
+ relation->relname)));
+ else
+ ereport(elevel,
+ (errcode(ERRCODE_UNDEFINED_TABLE),
+ errmsg("skipping vacuum of \"%s\" --- relation no longer exists",
+ relation->relname)));
+
+ /*
+ * For VACUUM ANALYZE, both logs could show up, but just generate
+ * information for VACUUM as that would be the first one to be
+ * processed.
+ */
+ return NULL;
+ }
+
+ if ((options & VACOPT_ANALYZE) != 0)
+ {
+ if (!rel_lock)
+ ereport(elevel,
+ (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
+ errmsg("skipping analyze of \"%s\" --- lock not available",
+ relation->relname)));
+ else
+ ereport(elevel,
+ (errcode(ERRCODE_UNDEFINED_TABLE),
+ errmsg("skipping analyze of \"%s\" --- relation no longer exists",
+ relation->relname)));
+ }
+
+ return NULL;
+}
+
+
+/*
+ * Given a VacuumRelation, fill in the table OID if it wasn't specified,
+ * and optionally add VacuumRelations for partitions of the table.
+ *
+ * If a VacuumRelation does not have an OID supplied and is a partitioned
+ * table, an extra entry will be added to the output for each partition.
+ * Presently, only autovacuum supplies OIDs when calling vacuum(), and
+ * it does not want us to expand partitioned tables.
+ *
+ * We take care not to modify the input data structure, but instead build
+ * new VacuumRelation(s) to return. (But note that they will reference
+ * unmodified parts of the input, eg column lists.) New data structures
+ * are made in vac_context.
+ */
+static List *
+expand_vacuum_rel(VacuumRelation *vrel, int options)
+{
+ List *vacrels = NIL;
+ MemoryContext oldcontext;
+
+ /* If caller supplied OID, there's nothing we need do here. */
+ if (OidIsValid(vrel->oid))
+ {
+ oldcontext = MemoryContextSwitchTo(vac_context);
+ vacrels = lappend(vacrels, vrel);
+ MemoryContextSwitchTo(oldcontext);
+ }
+ else
+ {
+ /* Process a specific relation, and possibly partitions thereof */
+ Oid relid;
+ HeapTuple tuple;
+ Form_pg_class classForm;
+ bool include_parts;
+ int rvr_opts;
+
+ /*
+ * Since autovacuum workers supply OIDs when calling vacuum(), no
+ * autovacuum worker should reach this code.
+ */
+ Assert(!IsAutoVacuumWorkerProcess());
+
+ /*
+ * We transiently take AccessShareLock to protect the syscache lookup
+ * below, as well as find_all_inheritors's expectation that the caller
+ * holds some lock on the starting relation.
+ */
+ rvr_opts = (options & VACOPT_SKIP_LOCKED) ? RVR_SKIP_LOCKED : 0;
+ relid = RangeVarGetRelidExtended(vrel->relation,
+ AccessShareLock,
+ rvr_opts,
+ NULL, NULL);
+
+ /*
+ * If the lock is unavailable, emit the same log statement that
+ * vacuum_rel() and analyze_rel() would.
+ */
+ if (!OidIsValid(relid))
+ {
+ if (options & VACOPT_VACUUM)
+ ereport(WARNING,
+ (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
+ errmsg("skipping vacuum of \"%s\" --- lock not available",
+ vrel->relation->relname)));
+ else
+ ereport(WARNING,
+ (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
+ errmsg("skipping analyze of \"%s\" --- lock not available",
+ vrel->relation->relname)));
+ return vacrels;
+ }
+
+ /*
+ * To check whether the relation is a partitioned table and its
+ * ownership, fetch its syscache entry.
+ */
+ tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relid));
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for relation %u", relid);
+ classForm = (Form_pg_class) GETSTRUCT(tuple);
+
+ /*
+ * Make a returnable VacuumRelation for this rel if user is a proper
+ * owner.
+ */
+ if (vacuum_is_relation_owner(relid, classForm, options))
+ {
+ oldcontext = MemoryContextSwitchTo(vac_context);
+ vacrels = lappend(vacrels, makeVacuumRelation(vrel->relation,
+ relid,
+ vrel->va_cols));
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+
+ include_parts = (classForm->relkind == RELKIND_PARTITIONED_TABLE);
+ ReleaseSysCache(tuple);
+
+ /*
+ * If it is, make relation list entries for its partitions. Note that
+ * the list returned by find_all_inheritors() includes the passed-in
+ * OID, so we have to skip that. There's no point in taking locks on
+ * the individual partitions yet, and doing so would just add
+ * unnecessary deadlock risk. For this last reason we do not check
+ * yet the ownership of the partitions, which get added to the list to
+ * process. Ownership will be checked later on anyway.
+ */
+ if (include_parts)
+ {
+ List *part_oids = find_all_inheritors(relid, NoLock, NULL);
+ ListCell *part_lc;
+
+ foreach(part_lc, part_oids)
+ {
+ Oid part_oid = lfirst_oid(part_lc);
+
+ if (part_oid == relid)
+ continue; /* ignore original table */
+
+ /*
+ * We omit a RangeVar since it wouldn't be appropriate to
+ * complain about failure to open one of these relations
+ * later.
+ */
+ oldcontext = MemoryContextSwitchTo(vac_context);
+ vacrels = lappend(vacrels, makeVacuumRelation(NULL,
+ part_oid,
+ vrel->va_cols));
+ MemoryContextSwitchTo(oldcontext);
+ }
+ }
+
+ /*
+ * Release lock again. This means that by the time we actually try to
+ * process the table, it might be gone or renamed. In the former case
+ * we'll silently ignore it; in the latter case we'll process it
+ * anyway, but we must beware that the RangeVar doesn't necessarily
+ * identify it anymore. This isn't ideal, perhaps, but there's little
+ * practical alternative, since we're typically going to commit this
+ * transaction and begin a new one between now and then. Moreover,
+ * holding locks on multiple relations would create significant risk
+ * of deadlock.
+ */
+ UnlockRelationOid(relid, AccessShareLock);
+ }
+
+ return vacrels;
+}
+
+/*
+ * Construct a list of VacuumRelations for all vacuumable rels in
+ * the current database. The list is built in vac_context.
+ */
+static List *
+get_all_vacuum_rels(int options)
+{
+ List *vacrels = NIL;
+ Relation pgclass;
+ TableScanDesc scan;
+ HeapTuple tuple;
+
+ pgclass = table_open(RelationRelationId, AccessShareLock);
+
+ scan = table_beginscan_catalog(pgclass, 0, NULL);
+
+ while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+ {
+ Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
+ MemoryContext oldcontext;
+ Oid relid = classForm->oid;
+
+ /* check permissions of relation */
+ if (!vacuum_is_relation_owner(relid, classForm, options))
+ continue;
+
+ /*
+ * We include partitioned tables here; depending on which operation is
+ * to be performed, caller will decide whether to process or ignore
+ * them.
+ */
+ if (classForm->relkind != RELKIND_RELATION &&
+ classForm->relkind != RELKIND_MATVIEW &&
+ classForm->relkind != RELKIND_PARTITIONED_TABLE)
+ continue;
+
+ /*
+ * Build VacuumRelation(s) specifying the table OIDs to be processed.
+ * We omit a RangeVar since it wouldn't be appropriate to complain
+ * about failure to open one of these relations later.
+ */
+ oldcontext = MemoryContextSwitchTo(vac_context);
+ vacrels = lappend(vacrels, makeVacuumRelation(NULL,
+ relid,
+ NIL));
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ table_endscan(scan);
+ table_close(pgclass, AccessShareLock);
+
+ return vacrels;
+}
+
+/*
+ * vacuum_set_xid_limits() -- compute oldestXmin and freeze cutoff points
+ *
+ * The output parameters are:
+ * - oldestXmin is the cutoff value used to distinguish whether tuples are
+ * DEAD or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
+ * - freezeLimit is the Xid below which all Xids are replaced by
+ * FrozenTransactionId during vacuum.
+ * - xidFullScanLimit (computed from freeze_table_age parameter)
+ * represents a minimum Xid value; a table whose relfrozenxid is older than
+ * this will have a full-table vacuum applied to it, to freeze tuples across
+ * the whole table. Vacuuming a table younger than this value can use a
+ * partial scan.
+ * - multiXactCutoff is the value below which all MultiXactIds are removed from
+ * Xmax.
+ * - mxactFullScanLimit is a value against which a table's relminmxid value is
+ * compared to produce a full-table vacuum, as with xidFullScanLimit.
+ *
+ * xidFullScanLimit and mxactFullScanLimit can be passed as NULL if caller is
+ * not interested.
+ */
+void
+vacuum_set_xid_limits(Relation rel,
+ int freeze_min_age,
+ int freeze_table_age,
+ int multixact_freeze_min_age,
+ int multixact_freeze_table_age,
+ TransactionId *oldestXmin,
+ TransactionId *freezeLimit,
+ TransactionId *xidFullScanLimit,
+ MultiXactId *multiXactCutoff,
+ MultiXactId *mxactFullScanLimit)
+{
+ int freezemin;
+ int mxid_freezemin;
+ int effective_multixact_freeze_max_age;
+ TransactionId limit;
+ TransactionId safeLimit;
+ MultiXactId oldestMxact;
+ MultiXactId mxactLimit;
+ MultiXactId safeMxactLimit;
+
+ /*
+ * We can always ignore processes running lazy vacuum. This is because we
+ * use these values only for deciding which tuples we must keep in the
+ * tables. Since lazy vacuum doesn't write its XID anywhere, it's safe to
+ * ignore it. In theory it could be problematic to ignore lazy vacuums in
+ * a full vacuum, but keep in mind that only one vacuum process can be
+ * working on a particular table at any time, and that each vacuum is
+ * always an independent transaction.
+ */
+ *oldestXmin =
+ TransactionIdLimitedForOldSnapshots(GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM), rel);
+
+ Assert(TransactionIdIsNormal(*oldestXmin));
+
+ /*
+ * Determine the minimum freeze age to use: as specified by the caller, or
+ * vacuum_freeze_min_age, but in any case not more than half
+ * autovacuum_freeze_max_age, so that autovacuums to prevent XID
+ * wraparound won't occur too frequently.
+ */
+ freezemin = freeze_min_age;
+ if (freezemin < 0)
+ freezemin = vacuum_freeze_min_age;
+ freezemin = Min(freezemin, autovacuum_freeze_max_age / 2);
+ Assert(freezemin >= 0);
+
+ /*
+ * Compute the cutoff XID, being careful not to generate a "permanent" XID
+ */
+ limit = *oldestXmin - freezemin;
+ if (!TransactionIdIsNormal(limit))
+ limit = FirstNormalTransactionId;
+
+ /*
+ * If oldestXmin is very far back (in practice, more than
+ * autovacuum_freeze_max_age / 2 XIDs old), complain and force a minimum
+ * freeze age of zero.
+ */
+ safeLimit = ReadNewTransactionId() - autovacuum_freeze_max_age;
+ if (!TransactionIdIsNormal(safeLimit))
+ safeLimit = FirstNormalTransactionId;
+
+ if (TransactionIdPrecedes(limit, safeLimit))
+ {
+ ereport(WARNING,
+ (errmsg("oldest xmin is far in the past"),
+ errhint("Close open transactions soon to avoid wraparound problems.\n"
+ "You might also need to commit or roll back old prepared transactions, or drop stale replication slots.")));
+ limit = *oldestXmin;
+ }
+
+ *freezeLimit = limit;
+
+ /*
+ * Compute the multixact age for which freezing is urgent. This is
+ * normally autovacuum_multixact_freeze_max_age, but may be less if we are
+ * short of multixact member space.
+ */
+ effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold();
+
+ /*
+ * Determine the minimum multixact freeze age to use: as specified by
+ * caller, or vacuum_multixact_freeze_min_age, but in any case not more
+ * than half effective_multixact_freeze_max_age, so that autovacuums to
+ * prevent MultiXact wraparound won't occur too frequently.
+ */
+ mxid_freezemin = multixact_freeze_min_age;
+ if (mxid_freezemin < 0)
+ mxid_freezemin = vacuum_multixact_freeze_min_age;
+ mxid_freezemin = Min(mxid_freezemin,
+ effective_multixact_freeze_max_age / 2);
+ Assert(mxid_freezemin >= 0);
+
+ /* compute the cutoff multi, being careful to generate a valid value */
+ oldestMxact = GetOldestMultiXactId();
+ mxactLimit = oldestMxact - mxid_freezemin;
+ if (mxactLimit < FirstMultiXactId)
+ mxactLimit = FirstMultiXactId;
+
+ safeMxactLimit =
+ ReadNextMultiXactId() - effective_multixact_freeze_max_age;
+ if (safeMxactLimit < FirstMultiXactId)
+ safeMxactLimit = FirstMultiXactId;
+
+ if (MultiXactIdPrecedes(mxactLimit, safeMxactLimit))
+ {
+ ereport(WARNING,
+ (errmsg("oldest multixact is far in the past"),
+ errhint("Close open transactions with multixacts soon to avoid wraparound problems.")));
+ /* Use the safe limit, unless an older mxact is still running */
+ if (MultiXactIdPrecedes(oldestMxact, safeMxactLimit))
+ mxactLimit = oldestMxact;
+ else
+ mxactLimit = safeMxactLimit;
+ }
+
+ *multiXactCutoff = mxactLimit;
+
+ if (xidFullScanLimit != NULL)
+ {
+ int freezetable;
+
+ Assert(mxactFullScanLimit != NULL);
+
+ /*
+ * Determine the table freeze age to use: as specified by the caller,
+ * or vacuum_freeze_table_age, but in any case not more than
+ * autovacuum_freeze_max_age * 0.95, so that if you have e.g nightly
+ * VACUUM schedule, the nightly VACUUM gets a chance to freeze tuples
+ * before anti-wraparound autovacuum is launched.
+ */
+ freezetable = freeze_table_age;
+ if (freezetable < 0)
+ freezetable = vacuum_freeze_table_age;
+ freezetable = Min(freezetable, autovacuum_freeze_max_age * 0.95);
+ Assert(freezetable >= 0);
+
+ /*
+ * Compute XID limit causing a full-table vacuum, being careful not to
+ * generate a "permanent" XID.
+ */
+ limit = ReadNewTransactionId() - freezetable;
+ if (!TransactionIdIsNormal(limit))
+ limit = FirstNormalTransactionId;
+
+ *xidFullScanLimit = limit;
+
+ /*
+ * Similar to the above, determine the table freeze age to use for
+ * multixacts: as specified by the caller, or
+ * vacuum_multixact_freeze_table_age, but in any case not more than
+ * autovacuum_multixact_freeze_table_age * 0.95, so that if you have
+ * e.g. nightly VACUUM schedule, the nightly VACUUM gets a chance to
+ * freeze multixacts before anti-wraparound autovacuum is launched.
+ */
+ freezetable = multixact_freeze_table_age;
+ if (freezetable < 0)
+ freezetable = vacuum_multixact_freeze_table_age;
+ freezetable = Min(freezetable,
+ effective_multixact_freeze_max_age * 0.95);
+ Assert(freezetable >= 0);
+
+ /*
+ * Compute MultiXact limit causing a full-table vacuum, being careful
+ * to generate a valid MultiXact value.
+ */
+ mxactLimit = ReadNextMultiXactId() - freezetable;
+ if (mxactLimit < FirstMultiXactId)
+ mxactLimit = FirstMultiXactId;
+
+ *mxactFullScanLimit = mxactLimit;
+ }
+ else
+ {
+ Assert(mxactFullScanLimit == NULL);
+ }
+}
+
+/*
+ * vac_estimate_reltuples() -- estimate the new value for pg_class.reltuples
+ *
+ * If we scanned the whole relation then we should just use the count of
+ * live tuples seen; but if we did not, we should not blindly extrapolate
+ * from that number, since VACUUM may have scanned a quite nonrandom
+ * subset of the table. When we have only partial information, we take
+ * the old value of pg_class.reltuples as a measurement of the
+ * tuple density in the unscanned pages.
+ *
+ * Note: scanned_tuples should count only *live* tuples, since
+ * pg_class.reltuples is defined that way.
+ */
+double
+vac_estimate_reltuples(Relation relation,
+ BlockNumber total_pages,
+ BlockNumber scanned_pages,
+ double scanned_tuples)
+{
+ BlockNumber old_rel_pages = relation->rd_rel->relpages;
+ double old_rel_tuples = relation->rd_rel->reltuples;
+ double old_density;
+ double unscanned_pages;
+ double total_tuples;
+
+ /* If we did scan the whole table, just use the count as-is */
+ if (scanned_pages >= total_pages)
+ return scanned_tuples;
+
+ /*
+ * If scanned_pages is zero but total_pages isn't, keep the existing value
+ * of reltuples. (Note: callers should avoid updating the pg_class
+ * statistics in this situation, since no new information has been
+ * provided.)
+ */
+ if (scanned_pages == 0)
+ return old_rel_tuples;
+
+ /*
+ * If old value of relpages is zero, old density is indeterminate; we
+ * can't do much except scale up scanned_tuples to match total_pages.
+ */
+ if (old_rel_pages == 0)
+ return floor((scanned_tuples / scanned_pages) * total_pages + 0.5);
+
+ /*
+ * Okay, we've covered the corner cases. The normal calculation is to
+ * convert the old measurement to a density (tuples per page), then
+ * estimate the number of tuples in the unscanned pages using that figure,
+ * and finally add on the number of tuples in the scanned pages.
+ */
+ old_density = old_rel_tuples / old_rel_pages;
+ unscanned_pages = (double) total_pages - (double) scanned_pages;
+ total_tuples = old_density * unscanned_pages + scanned_tuples;
+ return floor(total_tuples + 0.5);
+}
+
+
+/*
+ * vac_update_relstats() -- update statistics for one relation
+ *
+ * Update the whole-relation statistics that are kept in its pg_class
+ * row. There are additional stats that will be updated if we are
+ * doing ANALYZE, but we always update these stats. This routine works
+ * for both index and heap relation entries in pg_class.
+ *
+ * We violate transaction semantics here by overwriting the rel's
+ * existing pg_class tuple with the new values. This is reasonably
+ * safe as long as we're sure that the new values are correct whether or
+ * not this transaction commits. The reason for doing this is that if
+ * we updated these tuples in the usual way, vacuuming pg_class itself
+ * wouldn't work very well --- by the time we got done with a vacuum
+ * cycle, most of the tuples in pg_class would've been obsoleted. Of
+ * course, this only works for fixed-size not-null columns, but these are.
+ *
+ * Another reason for doing it this way is that when we are in a lazy
+ * VACUUM and have PROC_IN_VACUUM set, we mustn't do any regular updates.
+ * Somebody vacuuming pg_class might think they could delete a tuple
+ * marked with xmin = our xid.
+ *
+ * In addition to fundamentally nontransactional statistics such as
+ * relpages and relallvisible, we try to maintain certain lazily-updated
+ * DDL flags such as relhasindex, by clearing them if no longer correct.
+ * It's safe to do this in VACUUM, which can't run in parallel with
+ * CREATE INDEX/RULE/TRIGGER and can't be part of a transaction block.
+ * However, it's *not* safe to do it in an ANALYZE that's within an
+ * outer transaction, because for example the current transaction might
+ * have dropped the last index; then we'd think relhasindex should be
+ * cleared, but if the transaction later rolls back this would be wrong.
+ * So we refrain from updating the DDL flags if we're inside an outer
+ * transaction. This is OK since postponing the flag maintenance is
+ * always allowable.
+ *
+ * Note: num_tuples should count only *live* tuples, since
+ * pg_class.reltuples is defined that way.
+ *
+ * This routine is shared by VACUUM and ANALYZE.
+ */
+void
+vac_update_relstats(Relation relation,
+ BlockNumber num_pages, double num_tuples,
+ BlockNumber num_all_visible_pages,
+ bool hasindex, TransactionId frozenxid,
+ MultiXactId minmulti,
+ bool in_outer_xact)
+{
+ Oid relid = RelationGetRelid(relation);
+ Relation rd;
+ HeapTuple ctup;
+ Form_pg_class pgcform;
+ bool dirty;
+
+ rd = table_open(RelationRelationId, RowExclusiveLock);
+
+ /* Fetch a copy of the tuple to scribble on */
+ ctup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
+ if (!HeapTupleIsValid(ctup))
+ elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
+ relid);
+ pgcform = (Form_pg_class) GETSTRUCT(ctup);
+
+ /* Apply statistical updates, if any, to copied tuple */
+
+ dirty = false;
+ if (pgcform->relpages != (int32) num_pages)
+ {
+ pgcform->relpages = (int32) num_pages;
+ dirty = true;
+ }
+ if (pgcform->reltuples != (float4) num_tuples)
+ {
+ pgcform->reltuples = (float4) num_tuples;
+ dirty = true;
+ }
+ if (pgcform->relallvisible != (int32) num_all_visible_pages)
+ {
+ pgcform->relallvisible = (int32) num_all_visible_pages;
+ dirty = true;
+ }
+
+ /* Apply DDL updates, but not inside an outer transaction (see above) */
+
+ if (!in_outer_xact)
+ {
+ /*
+ * If we didn't find any indexes, reset relhasindex.
+ */
+ if (pgcform->relhasindex && !hasindex)
+ {
+ pgcform->relhasindex = false;
+ dirty = true;
+ }
+
+ /* We also clear relhasrules and relhastriggers if needed */
+ if (pgcform->relhasrules && relation->rd_rules == NULL)
+ {
+ pgcform->relhasrules = false;
+ dirty = true;
+ }
+ if (pgcform->relhastriggers && relation->trigdesc == NULL)
+ {
+ pgcform->relhastriggers = false;
+ dirty = true;
+ }
+ }
+
+ /*
+ * Update relfrozenxid, unless caller passed InvalidTransactionId
+ * indicating it has no new data.
+ *
+ * Ordinarily, we don't let relfrozenxid go backwards: if things are
+ * working correctly, the only way the new frozenxid could be older would
+ * be if a previous VACUUM was done with a tighter freeze_min_age, in
+ * which case we don't want to forget the work it already did. However,
+ * if the stored relfrozenxid is "in the future", then it must be corrupt
+ * and it seems best to overwrite it with the cutoff we used this time.
+ * This should match vac_update_datfrozenxid() concerning what we consider
+ * to be "in the future".
+ */
+ if (TransactionIdIsNormal(frozenxid) &&
+ pgcform->relfrozenxid != frozenxid &&
+ (TransactionIdPrecedes(pgcform->relfrozenxid, frozenxid) ||
+ TransactionIdPrecedes(ReadNewTransactionId(),
+ pgcform->relfrozenxid)))
+ {
+ pgcform->relfrozenxid = frozenxid;
+ dirty = true;
+ }
+
+ /* Similarly for relminmxid */
+ if (MultiXactIdIsValid(minmulti) &&
+ pgcform->relminmxid != minmulti &&
+ (MultiXactIdPrecedes(pgcform->relminmxid, minmulti) ||
+ MultiXactIdPrecedes(ReadNextMultiXactId(), pgcform->relminmxid)))
+ {
+ pgcform->relminmxid = minmulti;
+ dirty = true;
+ }
+
+ /* If anything changed, write out the tuple. */
+ if (dirty)
+ heap_inplace_update(rd, ctup);
+
+ table_close(rd, RowExclusiveLock);
+}
+
+
+/*
+ * vac_update_datfrozenxid() -- update pg_database.datfrozenxid for our DB
+ *
+ * Update pg_database's datfrozenxid entry for our database to be the
+ * minimum of the pg_class.relfrozenxid values.
+ *
+ * Similarly, update our datminmxid to be the minimum of the
+ * pg_class.relminmxid values.
+ *
+ * If we are able to advance either pg_database value, also try to
+ * truncate pg_xact and pg_multixact.
+ *
+ * We violate transaction semantics here by overwriting the database's
+ * existing pg_database tuple with the new values. This is reasonably
+ * safe since the new values are correct whether or not this transaction
+ * commits. As with vac_update_relstats, this avoids leaving dead tuples
+ * behind after a VACUUM.
+ */
+void
+vac_update_datfrozenxid(void)
+{
+ HeapTuple tuple;
+ Form_pg_database dbform;
+ Relation relation;
+ SysScanDesc scan;
+ HeapTuple classTup;
+ TransactionId newFrozenXid;
+ MultiXactId newMinMulti;
+ TransactionId lastSaneFrozenXid;
+ MultiXactId lastSaneMinMulti;
+ bool bogus = false;
+ bool dirty = false;
+
+ /*
+ * Restrict this task to one backend per database. This avoids race
+ * conditions that would move datfrozenxid or datminmxid backward. It
+ * avoids calling vac_truncate_clog() with a datfrozenxid preceding a
+ * datfrozenxid passed to an earlier vac_truncate_clog() call.
+ */
+ LockDatabaseFrozenIds(ExclusiveLock);
+
+ /*
+ * Initialize the "min" calculation with GetOldestXmin, which is a
+ * reasonable approximation to the minimum relfrozenxid for not-yet-
+ * committed pg_class entries for new tables; see AddNewRelationTuple().
+ * So we cannot produce a wrong minimum by starting with this.
+ */
+ newFrozenXid = GetOldestXmin(NULL, PROCARRAY_FLAGS_VACUUM);
+
+ /*
+ * Similarly, initialize the MultiXact "min" with the value that would be
+ * used on pg_class for new tables. See AddNewRelationTuple().
+ */
+ newMinMulti = GetOldestMultiXactId();
+
+ /*
+ * Identify the latest relfrozenxid and relminmxid values that we could
+ * validly see during the scan. These are conservative values, but it's
+ * not really worth trying to be more exact.
+ */
+ lastSaneFrozenXid = ReadNewTransactionId();
+ lastSaneMinMulti = ReadNextMultiXactId();
+
+ /*
+ * We must seqscan pg_class to find the minimum Xid, because there is no
+ * index that can help us here.
+ */
+ relation = table_open(RelationRelationId, AccessShareLock);
+
+ scan = systable_beginscan(relation, InvalidOid, false,
+ NULL, 0, NULL);
+
+ while ((classTup = systable_getnext(scan)) != NULL)
+ {
+ Form_pg_class classForm = (Form_pg_class) GETSTRUCT(classTup);
+
+ /*
+ * Only consider relations able to hold unfrozen XIDs (anything else
+ * should have InvalidTransactionId in relfrozenxid anyway).
+ */
+ if (classForm->relkind != RELKIND_RELATION &&
+ classForm->relkind != RELKIND_MATVIEW &&
+ classForm->relkind != RELKIND_TOASTVALUE)
+ {
+ Assert(!TransactionIdIsValid(classForm->relfrozenxid));
+ Assert(!MultiXactIdIsValid(classForm->relminmxid));
+ continue;
+ }
+
+ /*
+ * Some table AMs might not need per-relation xid / multixid horizons.
+ * It therefore seems reasonable to allow relfrozenxid and relminmxid
+ * to not be set (i.e. set to their respective Invalid*Id)
+ * independently. Thus validate and compute horizon for each only if
+ * set.
+ *
+ * If things are working properly, no relation should have a
+ * relfrozenxid or relminmxid that is "in the future". However, such
+ * cases have been known to arise due to bugs in pg_upgrade. If we
+ * see any entries that are "in the future", chicken out and don't do
+ * anything. This ensures we won't truncate clog & multixact SLRUs
+ * before those relations have been scanned and cleaned up.
+ */
+
+ if (TransactionIdIsValid(classForm->relfrozenxid))
+ {
+ Assert(TransactionIdIsNormal(classForm->relfrozenxid));
+
+ /* check for values in the future */
+ if (TransactionIdPrecedes(lastSaneFrozenXid, classForm->relfrozenxid))
+ {
+ bogus = true;
+ break;
+ }
+
+ /* determine new horizon */
+ if (TransactionIdPrecedes(classForm->relfrozenxid, newFrozenXid))
+ newFrozenXid = classForm->relfrozenxid;
+ }
+
+ if (MultiXactIdIsValid(classForm->relminmxid))
+ {
+ /* check for values in the future */
+ if (MultiXactIdPrecedes(lastSaneMinMulti, classForm->relminmxid))
+ {
+ bogus = true;
+ break;
+ }
+
+ /* determine new horizon */
+ if (MultiXactIdPrecedes(classForm->relminmxid, newMinMulti))
+ newMinMulti = classForm->relminmxid;
+ }
+ }
+
+ /* we're done with pg_class */
+ systable_endscan(scan);
+ table_close(relation, AccessShareLock);
+
+ /* chicken out if bogus data found */
+ if (bogus)
+ return;
+
+ Assert(TransactionIdIsNormal(newFrozenXid));
+ Assert(MultiXactIdIsValid(newMinMulti));
+
+ /* Now fetch the pg_database tuple we need to update. */
+ relation = table_open(DatabaseRelationId, RowExclusiveLock);
+
+ /* Fetch a copy of the tuple to scribble on */
+ tuple = SearchSysCacheCopy1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "could not find tuple for database %u", MyDatabaseId);
+ dbform = (Form_pg_database) GETSTRUCT(tuple);
+
+ /*
+ * As in vac_update_relstats(), we ordinarily don't want to let
+ * datfrozenxid go backward; but if it's "in the future" then it must be
+ * corrupt and it seems best to overwrite it.
+ */
+ if (dbform->datfrozenxid != newFrozenXid &&
+ (TransactionIdPrecedes(dbform->datfrozenxid, newFrozenXid) ||
+ TransactionIdPrecedes(lastSaneFrozenXid, dbform->datfrozenxid)))
+ {
+ dbform->datfrozenxid = newFrozenXid;
+ dirty = true;
+ }
+ else
+ newFrozenXid = dbform->datfrozenxid;
+
+ /* Ditto for datminmxid */
+ if (dbform->datminmxid != newMinMulti &&
+ (MultiXactIdPrecedes(dbform->datminmxid, newMinMulti) ||
+ MultiXactIdPrecedes(lastSaneMinMulti, dbform->datminmxid)))
+ {
+ dbform->datminmxid = newMinMulti;
+ dirty = true;
+ }
+ else
+ newMinMulti = dbform->datminmxid;
+
+ if (dirty)
+ heap_inplace_update(relation, tuple);
+
+ heap_freetuple(tuple);
+ table_close(relation, RowExclusiveLock);
+
+ /*
+ * If we were able to advance datfrozenxid or datminmxid, see if we can
+ * truncate pg_xact and/or pg_multixact. Also do it if the shared
+ * XID-wrap-limit info is stale, since this action will update that too.
+ */
+ if (dirty || ForceTransactionIdLimitUpdate())
+ vac_truncate_clog(newFrozenXid, newMinMulti,
+ lastSaneFrozenXid, lastSaneMinMulti);
+}
+
+
+/*
+ * vac_truncate_clog() -- attempt to truncate the commit log
+ *
+ * Scan pg_database to determine the system-wide oldest datfrozenxid,
+ * and use it to truncate the transaction commit log (pg_xact).
+ * Also update the XID wrap limit info maintained by varsup.c.
+ * Likewise for datminmxid.
+ *
+ * The passed frozenXID and minMulti are the updated values for my own
+ * pg_database entry. They're used to initialize the "min" calculations.
+ * The caller also passes the "last sane" XID and MXID, since it has
+ * those at hand already.
+ *
+ * This routine is only invoked when we've managed to change our
+ * DB's datfrozenxid/datminmxid values, or we found that the shared
+ * XID-wrap-limit info is stale.
+ */
+static void
+vac_truncate_clog(TransactionId frozenXID,
+ MultiXactId minMulti,
+ TransactionId lastSaneFrozenXid,
+ MultiXactId lastSaneMinMulti)
+{
+ TransactionId nextXID = ReadNewTransactionId();
+ Relation relation;
+ TableScanDesc scan;
+ HeapTuple tuple;
+ Oid oldestxid_datoid;
+ Oid minmulti_datoid;
+ bool bogus = false;
+ bool frozenAlreadyWrapped = false;
+
+ /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */
+ LWLockAcquire(WrapLimitsVacuumLock, LW_EXCLUSIVE);
+
+ /* init oldest datoids to sync with my frozenXID/minMulti values */
+ oldestxid_datoid = MyDatabaseId;
+ minmulti_datoid = MyDatabaseId;
+
+ /*
+ * Scan pg_database to compute the minimum datfrozenxid/datminmxid
+ *
+ * Since vac_update_datfrozenxid updates datfrozenxid/datminmxid in-place,
+ * the values could change while we look at them. Fetch each one just
+ * once to ensure sane behavior of the comparison logic. (Here, as in
+ * many other places, we assume that fetching or updating an XID in shared
+ * storage is atomic.)
+ *
+ * Note: we need not worry about a race condition with new entries being
+ * inserted by CREATE DATABASE. Any such entry will have a copy of some
+ * existing DB's datfrozenxid, and that source DB cannot be ours because
+ * of the interlock against copying a DB containing an active backend.
+ * Hence the new entry will not reduce the minimum. Also, if two VACUUMs
+ * concurrently modify the datfrozenxid's of different databases, the
+ * worst possible outcome is that pg_xact is not truncated as aggressively
+ * as it could be.
+ */
+ relation = table_open(DatabaseRelationId, AccessShareLock);
+
+ scan = table_beginscan_catalog(relation, 0, NULL);
+
+ while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+ {
+ volatile FormData_pg_database *dbform = (Form_pg_database) GETSTRUCT(tuple);
+ TransactionId datfrozenxid = dbform->datfrozenxid;
+ TransactionId datminmxid = dbform->datminmxid;
+
+ Assert(TransactionIdIsNormal(datfrozenxid));
+ Assert(MultiXactIdIsValid(datminmxid));
+
+ /*
+ * If things are working properly, no database should have a
+ * datfrozenxid or datminmxid that is "in the future". However, such
+ * cases have been known to arise due to bugs in pg_upgrade. If we
+ * see any entries that are "in the future", chicken out and don't do
+ * anything. This ensures we won't truncate clog before those
+ * databases have been scanned and cleaned up. (We will issue the
+ * "already wrapped" warning if appropriate, though.)
+ */
+ if (TransactionIdPrecedes(lastSaneFrozenXid, datfrozenxid) ||
+ MultiXactIdPrecedes(lastSaneMinMulti, datminmxid))
+ bogus = true;
+
+ if (TransactionIdPrecedes(nextXID, datfrozenxid))
+ frozenAlreadyWrapped = true;
+ else if (TransactionIdPrecedes(datfrozenxid, frozenXID))
+ {
+ frozenXID = datfrozenxid;
+ oldestxid_datoid = dbform->oid;
+ }
+
+ if (MultiXactIdPrecedes(datminmxid, minMulti))
+ {
+ minMulti = datminmxid;
+ minmulti_datoid = dbform->oid;
+ }
+ }
+
+ table_endscan(scan);
+
+ table_close(relation, AccessShareLock);
+
+ /*
+ * Do not truncate CLOG if we seem to have suffered wraparound already;
+ * the computed minimum XID might be bogus. This case should now be
+ * impossible due to the defenses in GetNewTransactionId, but we keep the
+ * test anyway.
+ */
+ if (frozenAlreadyWrapped)
+ {
+ ereport(WARNING,
+ (errmsg("some databases have not been vacuumed in over 2 billion transactions"),
+ errdetail("You might have already suffered transaction-wraparound data loss.")));
+ return;
+ }
+
+ /* chicken out if data is bogus in any other way */
+ if (bogus)
+ return;
+
+ /*
+ * Advance the oldest value for commit timestamps before truncating, so
+ * that if a user requests a timestamp for a transaction we're truncating
+ * away right after this point, they get NULL instead of an ugly "file not
+ * found" error from slru.c. This doesn't matter for xact/multixact
+ * because they are not subject to arbitrary lookups from users.
+ */
+ AdvanceOldestCommitTsXid(frozenXID);
+
+ /*
+ * Truncate CLOG, multixact and CommitTs to the oldest computed value.
+ */
+ TruncateCLOG(frozenXID, oldestxid_datoid);
+ TruncateCommitTs(frozenXID);
+ TruncateMultiXact(minMulti, minmulti_datoid);
+
+ /*
+ * Update the wrap limit for GetNewTransactionId and creation of new
+ * MultiXactIds. Note: these functions will also signal the postmaster
+ * for an(other) autovac cycle if needed. XXX should we avoid possibly
+ * signaling twice?
+ */
+ SetTransactionIdLimit(frozenXID, oldestxid_datoid);
+ SetMultiXactIdLimit(minMulti, minmulti_datoid, false);
+
+ LWLockRelease(WrapLimitsVacuumLock);
+}
+
+
+/*
+ * vacuum_rel() -- vacuum one heap relation
+ *
+ * relid identifies the relation to vacuum. If relation is supplied,
+ * use the name therein for reporting any failure to open/lock the rel;
+ * do not use it once we've successfully opened the rel, since it might
+ * be stale.
+ *
+ * Returns true if it's okay to proceed with a requested ANALYZE
+ * operation on this table.
+ *
+ * Doing one heap at a time incurs extra overhead, since we need to
+ * check that the heap exists again just before we vacuum it. The
+ * reason that we do this is so that vacuuming can be spread across
+ * many small transactions. Otherwise, two-phase locking would require
+ * us to lock the entire database during one pass of the vacuum cleaner.
+ *
+ * At entry and exit, we are not inside a transaction.
+ */
+static bool
+vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params)
+{
+ LOCKMODE lmode;
+ Relation onerel;
+ LockRelId onerelid;
+ Oid toast_relid;
+ Oid save_userid;
+ int save_sec_context;
+ int save_nestlevel;
+
+ Assert(params != NULL);
+
+ /* Begin a transaction for vacuuming this relation */
+ StartTransactionCommand();
+
+ /*
+ * Functions in indexes may want a snapshot set. Also, setting a snapshot
+ * ensures that RecentGlobalXmin is kept truly recent.
+ */
+ PushActiveSnapshot(GetTransactionSnapshot());
+
+ if (!(params->options & VACOPT_FULL))
+ {
+ /*
+ * In lazy vacuum, we can set the PROC_IN_VACUUM flag, which lets
+ * other concurrent VACUUMs know that they can ignore this one while
+ * determining their OldestXmin. (The reason we don't set it during a
+ * full VACUUM is exactly that we may have to run user-defined
+ * functions for functional indexes, and we want to make sure that if
+ * they use the snapshot set above, any tuples it requires can't get
+ * removed from other tables. An index function that depends on the
+ * contents of other tables is arguably broken, but we won't break it
+ * here by violating transaction semantics.)
+ *
+ * We also set the VACUUM_FOR_WRAPAROUND flag, which is passed down by
+ * autovacuum; it's used to avoid canceling a vacuum that was invoked
+ * in an emergency.
+ *
+ * Note: these flags remain set until CommitTransaction or
+ * AbortTransaction. We don't want to clear them until we reset
+ * MyPgXact->xid/xmin, else OldestXmin might appear to go backwards,
+ * which is probably Not Good.
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ MyPgXact->vacuumFlags |= PROC_IN_VACUUM;
+ if (params->is_wraparound)
+ MyPgXact->vacuumFlags |= PROC_VACUUM_FOR_WRAPAROUND;
+ LWLockRelease(ProcArrayLock);
+ }
+
+ /*
+ * Check for user-requested abort. Note we want this to be inside a
+ * transaction, so xact.c doesn't issue useless WARNING.
+ */
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * Determine the type of lock we want --- hard exclusive lock for a FULL
+ * vacuum, but just ShareUpdateExclusiveLock for concurrent vacuum. Either
+ * way, we can be sure that no other backend is vacuuming the same table.
+ */
+ lmode = (params->options & VACOPT_FULL) ?
+ AccessExclusiveLock : ShareUpdateExclusiveLock;
+
+ /* open the relation and get the appropriate lock on it */
+ onerel = vacuum_open_relation(relid, relation, params->options,
+ params->log_min_duration >= 0, lmode);
+
+ /* leave if relation could not be opened or locked */
+ if (!onerel)
+ {
+ PopActiveSnapshot();
+ CommitTransactionCommand();
+ return false;
+ }
+
+ /*
+ * Check if relation needs to be skipped based on ownership. This check
+ * happens also when building the relation list to vacuum for a manual
+ * operation, and needs to be done additionally here as VACUUM could
+ * happen across multiple transactions where relation ownership could have
+ * changed in-between. Make sure to only generate logs for VACUUM in this
+ * case.
+ */
+ if (!vacuum_is_relation_owner(RelationGetRelid(onerel),
+ onerel->rd_rel,
+ params->options & VACOPT_VACUUM))
+ {
+ relation_close(onerel, lmode);
+ PopActiveSnapshot();
+ CommitTransactionCommand();
+ return false;
+ }
+
+ /*
+ * Check that it's of a vacuumable relkind.
+ */
+ if (onerel->rd_rel->relkind != RELKIND_RELATION &&
+ onerel->rd_rel->relkind != RELKIND_MATVIEW &&
+ onerel->rd_rel->relkind != RELKIND_TOASTVALUE &&
+ onerel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+ {
+ ereport(WARNING,
+ (errmsg("skipping \"%s\" --- cannot vacuum non-tables or special system tables",
+ RelationGetRelationName(onerel))));
+ relation_close(onerel, lmode);
+ PopActiveSnapshot();
+ CommitTransactionCommand();
+ return false;
+ }
+
+ /*
+ * Silently ignore tables that are temp tables of other backends ---
+ * trying to vacuum these will lead to great unhappiness, since their
+ * contents are probably not up-to-date on disk. (We don't throw a
+ * warning here; it would just lead to chatter during a database-wide
+ * VACUUM.)
+ */
+ if (RELATION_IS_OTHER_TEMP(onerel))
+ {
+ relation_close(onerel, lmode);
+ PopActiveSnapshot();
+ CommitTransactionCommand();
+ return false;
+ }
+
+ /*
+ * Silently ignore partitioned tables as there is no work to be done. The
+ * useful work is on their child partitions, which have been queued up for
+ * us separately.
+ */
+ if (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+ {
+ relation_close(onerel, lmode);
+ PopActiveSnapshot();
+ CommitTransactionCommand();
+ /* It's OK to proceed with ANALYZE on this table */
+ return true;
+ }
+
+ /*
+ * Get a session-level lock too. This will protect our access to the
+ * relation across multiple transactions, so that we can vacuum the
+ * relation's TOAST table (if any) secure in the knowledge that no one is
+ * deleting the parent relation.
+ *
+ * NOTE: this cannot block, even if someone else is waiting for access,
+ * because the lock manager knows that both lock requests are from the
+ * same process.
+ */
+ onerelid = onerel->rd_lockInfo.lockRelId;
+ LockRelationIdForSession(&onerelid, lmode);
+
+ /* Set index cleanup option based on reloptions if not yet */
+ if (params->index_cleanup == VACOPT_TERNARY_DEFAULT)
+ {
+ if (onerel->rd_options == NULL ||
+ ((StdRdOptions *) onerel->rd_options)->vacuum_index_cleanup)
+ params->index_cleanup = VACOPT_TERNARY_ENABLED;
+ else
+ params->index_cleanup = VACOPT_TERNARY_DISABLED;
+ }
+
+ /* Set truncate option based on reloptions if not yet */
+ if (params->truncate == VACOPT_TERNARY_DEFAULT)
+ {
+ if (onerel->rd_options == NULL ||
+ ((StdRdOptions *) onerel->rd_options)->vacuum_truncate)
+ params->truncate = VACOPT_TERNARY_ENABLED;
+ else
+ params->truncate = VACOPT_TERNARY_DISABLED;
+ }
+
+ /*
+ * Remember the relation's TOAST relation for later, if the caller asked
+ * us to process it. In VACUUM FULL, though, the toast table is
+ * automatically rebuilt by cluster_rel so we shouldn't recurse to it.
+ */
+ if (!(params->options & VACOPT_SKIPTOAST) && !(params->options & VACOPT_FULL))
+ toast_relid = onerel->rd_rel->reltoastrelid;
+ else
+ toast_relid = InvalidOid;
+
+ /*
+ * Switch to the table owner's userid, so that any index functions are run
+ * as that user. Also lock down security-restricted operations and
+ * arrange to make GUC variable changes local to this command. (This is
+ * unnecessary, but harmless, for lazy VACUUM.)
+ */
+ GetUserIdAndSecContext(&save_userid, &save_sec_context);
+ SetUserIdAndSecContext(onerel->rd_rel->relowner,
+ save_sec_context | SECURITY_RESTRICTED_OPERATION);
+ save_nestlevel = NewGUCNestLevel();
+
+ /*
+ * Do the actual work --- either FULL or "lazy" vacuum
+ */
+ if (params->options & VACOPT_FULL)
+ {
+ int cluster_options = 0;
+
+ /* close relation before vacuuming, but hold lock until commit */
+ relation_close(onerel, NoLock);
+ onerel = NULL;
+
+ if ((params->options & VACOPT_VERBOSE) != 0)
+ cluster_options |= CLUOPT_VERBOSE;
+
+ /* VACUUM FULL is now a variant of CLUSTER; see cluster.c */
+ cluster_rel(relid, InvalidOid, cluster_options);
+ }
+ else
+ table_relation_vacuum(onerel, params, vac_strategy);
+
+ /* Roll back any GUC changes executed by index functions */
+ AtEOXact_GUC(false, save_nestlevel);
+
+ /* Restore userid and security context */
+ SetUserIdAndSecContext(save_userid, save_sec_context);
+
+ /* all done with this class, but hold lock until commit */
+ if (onerel)
+ relation_close(onerel, NoLock);
+
+ /*
+ * Complete the transaction and free all temporary memory used.
+ */
+ PopActiveSnapshot();
+ CommitTransactionCommand();
+
+ /*
+ * If the relation has a secondary toast rel, vacuum that too while we
+ * still hold the session lock on the master table. Note however that
+ * "analyze" will not get done on the toast table. This is good, because
+ * the toaster always uses hardcoded index access and statistics are
+ * totally unimportant for toast relations.
+ */
+ if (toast_relid != InvalidOid)
+ vacuum_rel(toast_relid, NULL, params);
+
+ /*
+ * Now release the session-level lock on the master table.
+ */
+ UnlockRelationIdForSession(&onerelid, lmode);
+
+ /* Report that we really did it. */
+ return true;
+}
+
+
+/*
+ * Open all the vacuumable indexes of the given relation, obtaining the
+ * specified kind of lock on each. Return an array of Relation pointers for
+ * the indexes into *Irel, and the number of indexes into *nindexes.
+ *
+ * We consider an index vacuumable if it is marked insertable (indisready).
+ * If it isn't, probably a CREATE INDEX CONCURRENTLY command failed early in
+ * execution, and what we have is too corrupt to be processable. We will
+ * vacuum even if the index isn't indisvalid; this is important because in a
+ * unique index, uniqueness checks will be performed anyway and had better not
+ * hit dangling index pointers.
+ */
+void
+vac_open_indexes(Relation relation, LOCKMODE lockmode,
+ int *nindexes, Relation **Irel)
+{
+ List *indexoidlist;
+ ListCell *indexoidscan;
+ int i;
+
+ Assert(lockmode != NoLock);
+
+ indexoidlist = RelationGetIndexList(relation);
+
+ /* allocate enough memory for all indexes */
+ i = list_length(indexoidlist);
+
+ if (i > 0)
+ *Irel = (Relation *) palloc(i * sizeof(Relation));
+ else
+ *Irel = NULL;
+
+ /* collect just the ready indexes */
+ i = 0;
+ foreach(indexoidscan, indexoidlist)
+ {
+ Oid indexoid = lfirst_oid(indexoidscan);
+ Relation indrel;
+
+ indrel = index_open(indexoid, lockmode);
+ if (indrel->rd_index->indisready)
+ (*Irel)[i++] = indrel;
+ else
+ index_close(indrel, lockmode);
+ }
+
+ *nindexes = i;
+
+ list_free(indexoidlist);
+}
+
+/*
+ * Release the resources acquired by vac_open_indexes. Optionally release
+ * the locks (say NoLock to keep 'em).
+ */
+void
+vac_close_indexes(int nindexes, Relation *Irel, LOCKMODE lockmode)
+{
+ if (Irel == NULL)
+ return;
+
+ while (nindexes--)
+ {
+ Relation ind = Irel[nindexes];
+
+ index_close(ind, lockmode);
+ }
+ pfree(Irel);
+}
+
+/*
+ * vacuum_delay_point --- check for interrupts and cost-based delay.
+ *
+ * This should be called in each major loop of VACUUM processing,
+ * typically once per page processed.
+ */
+void
+vacuum_delay_point(void)
+{
+ double msec = 0;
+
+ /* Always check for interrupts */
+ CHECK_FOR_INTERRUPTS();
+
+ if (!VacuumCostActive || InterruptPending)
+ return;
+
+ /*
+ * For parallel vacuum, the delay is computed based on the shared cost
+ * balance. See compute_parallel_delay.
+ */
+ if (VacuumSharedCostBalance != NULL)
+ msec = compute_parallel_delay();
+ else if (VacuumCostBalance >= VacuumCostLimit)
+ msec = VacuumCostDelay * VacuumCostBalance / VacuumCostLimit;
+
+ /* Nap if appropriate */
+ if (msec > 0)
+ {
+ if (msec > VacuumCostDelay * 4)
+ msec = VacuumCostDelay * 4;
+
+ pgstat_report_wait_start(WAIT_EVENT_VACUUM_DELAY);
+ pg_usleep((long) (msec * 1000));
+ pgstat_report_wait_end();
+
+ VacuumCostBalance = 0;
+
+ /* update balance values for workers */
+ AutoVacuumUpdateDelay();
+
+ /* Might have gotten an interrupt while sleeping */
+ CHECK_FOR_INTERRUPTS();
+ }
+}
+
+/*
+ * Computes the vacuum delay for parallel workers.
+ *
+ * The basic idea of a cost-based delay for parallel vacuum is to allow each
+ * worker to sleep in proportion to the share of work it's done. We achieve this
+ * by allowing all parallel vacuum workers including the leader process to
+ * have a shared view of cost related parameters (mainly VacuumCostBalance).
+ * We allow each worker to update it as and when it has incurred any cost and
+ * then based on that decide whether it needs to sleep. We compute the time
+ * to sleep for a worker based on the cost it has incurred
+ * (VacuumCostBalanceLocal) and then reduce the VacuumSharedCostBalance by
+ * that amount. This avoids putting to sleep those workers which have done less
+ * I/O than other workers and therefore ensure that workers
+ * which are doing more I/O got throttled more.
+ *
+ * We allow a worker to sleep only if it has performed I/O above a certain
+ * threshold, which is calculated based on the number of active workers
+ * (VacuumActiveNWorkers), and the overall cost balance is more than
+ * VacuumCostLimit set by the system. Testing reveals that we achieve
+ * the required throttling if we force a worker that has done more than 50%
+ * of its share of work to sleep.
+ */
+static double
+compute_parallel_delay(void)
+{
+ double msec = 0;
+ uint32 shared_balance;
+ int nworkers;
+
+ /* Parallel vacuum must be active */
+ Assert(VacuumSharedCostBalance);
+
+ nworkers = pg_atomic_read_u32(VacuumActiveNWorkers);
+
+ /* At least count itself */
+ Assert(nworkers >= 1);
+
+ /* Update the shared cost balance value atomically */
+ shared_balance = pg_atomic_add_fetch_u32(VacuumSharedCostBalance, VacuumCostBalance);
+
+ /* Compute the total local balance for the current worker */
+ VacuumCostBalanceLocal += VacuumCostBalance;
+
+ if ((shared_balance >= VacuumCostLimit) &&
+ (VacuumCostBalanceLocal > 0.5 * ((double) VacuumCostLimit / nworkers)))
+ {
+ /* Compute sleep time based on the local cost balance */
+ msec = VacuumCostDelay * VacuumCostBalanceLocal / VacuumCostLimit;
+ pg_atomic_sub_fetch_u32(VacuumSharedCostBalance, VacuumCostBalanceLocal);
+ VacuumCostBalanceLocal = 0;
+ }
+
+ /*
+ * Reset the local balance as we accumulated it into the shared value.
+ */
+ VacuumCostBalance = 0;
+
+ return msec;
+}
+
+/*
+ * A wrapper function of defGetBoolean().
+ *
+ * This function returns VACOPT_TERNARY_ENABLED and VACOPT_TERNARY_DISABLED
+ * instead of true and false.
+ */
+static VacOptTernaryValue
+get_vacopt_ternary_value(DefElem *def)
+{
+ return defGetBoolean(def) ? VACOPT_TERNARY_ENABLED : VACOPT_TERNARY_DISABLED;
+}