author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:17:33 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:17:33 +0000
commit     5e45211a64149b3c659b90ff2de6fa982a5a93ed
tree       739caf8c461053357daa9f162bef34516c7bf452  /src/backend/commands/cluster.c
parent     Initial commit.
Adding upstream version 15.5.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/commands/cluster.c')
-rw-r--r--  src/backend/commands/cluster.c  1736
1 file changed, 1736 insertions(+), 0 deletions(-)
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
new file mode 100644
index 0000000..e4b7ffd
--- /dev/null
+++ b/src/backend/commands/cluster.c
@@ -0,0 +1,1736 @@
+/*-------------------------------------------------------------------------
+ *
+ * cluster.c
+ * CLUSTER a table on an index. This is now also used for VACUUM FULL.
+ *
+ * There is hardly anything left of Paul Brown's original implementation...
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994-5, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/commands/cluster.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/amapi.h"
+#include "access/heapam.h"
+#include "access/multixact.h"
+#include "access/relscan.h"
+#include "access/tableam.h"
+#include "access/toast_internals.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "catalog/catalog.h"
+#include "catalog/dependency.h"
+#include "catalog/heap.h"
+#include "catalog/index.h"
+#include "catalog/namespace.h"
+#include "catalog/objectaccess.h"
+#include "catalog/partition.h"
+#include "catalog/pg_am.h"
+#include "catalog/pg_inherits.h"
+#include "catalog/toasting.h"
+#include "commands/cluster.h"
+#include "commands/defrem.h"
+#include "commands/progress.h"
+#include "commands/tablecmds.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "optimizer/optimizer.h"
+#include "pgstat.h"
+#include "storage/bufmgr.h"
+#include "storage/lmgr.h"
+#include "storage/predicate.h"
+#include "utils/acl.h"
+#include "utils/fmgroids.h"
+#include "utils/inval.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/pg_rusage.h"
+#include "utils/relmapper.h"
+#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+#include "utils/tuplesort.h"
+
+/*
+ * This struct is used to pass around the information on tables to be
+ * clustered. We need this so we can make a list of them when invoked without
+ * a specific table/index pair.
+ */
+typedef struct
+{
+ Oid tableOid;
+ Oid indexOid;
+} RelToCluster;
+
+
+static void cluster_multiple_rels(List *rtcs, ClusterParams *params);
+static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
+static void copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
+ bool verbose, bool *pSwapToastByContent,
+ TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
+static List *get_tables_to_cluster(MemoryContext cluster_context);
+static List *get_tables_to_cluster_partitioned(MemoryContext cluster_context,
+ Oid indexOid);
+
+
+/*---------------------------------------------------------------------------
+ * This cluster code allows for clustering multiple tables at once. Because
+ * of this, we cannot just run everything on a single transaction, or we
+ * would be forced to acquire exclusive locks on all the tables being
+ * clustered, simultaneously --- very likely leading to deadlock.
+ *
+ * To solve this we follow a similar strategy to VACUUM code,
+ * clustering each relation in a separate transaction. For this to work,
+ * we need to:
+ * - provide a separate memory context so that we can pass information in
+ * a way that survives across transactions
+ * - start a new transaction every time a new relation is clustered
+ * - check for validity of the information on to-be-clustered relations,
+ * as someone might have deleted a relation behind our back, or
+ * clustered one on a different index
+ * - end the transaction
+ *
+ * The single-relation case does not have any such overhead.
+ *
+ * We also allow a relation to be specified without an index. In that case,
+ * the indisclustered bit will be looked up, and an ERROR will be thrown
+ * if there is no index with the bit set.
+ *---------------------------------------------------------------------------
+ */
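+/*
+ * Editorial illustration (not part of the upstream file): the SQL forms that
+ * reach this entry point are, for a hypothetical table "tab" and index "idx":
+ *
+ *   CLUSTER;                          -- all previously-clustered tables
+ *   CLUSTER [VERBOSE] tab;            -- reuse the index marked indisclustered
+ *   CLUSTER [VERBOSE] tab USING idx;  -- cluster on the named index
+ */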
+void
+cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel)
+{
+ ListCell *lc;
+ ClusterParams params = {0};
+ bool verbose = false;
+ Relation rel = NULL;
+ Oid indexOid = InvalidOid;
+ MemoryContext cluster_context;
+ List *rtcs;
+
+ /* Parse option list */
+ foreach(lc, stmt->params)
+ {
+ DefElem *opt = (DefElem *) lfirst(lc);
+
+ if (strcmp(opt->defname, "verbose") == 0)
+ verbose = defGetBoolean(opt);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("unrecognized CLUSTER option \"%s\"",
+ opt->defname),
+ parser_errposition(pstate, opt->location)));
+ }
+
+ params.options = (verbose ? CLUOPT_VERBOSE : 0);
+
+ if (stmt->relation != NULL)
+ {
+ /* This is the single-relation case. */
+ Oid tableOid;
+
+ /*
+ * Find, lock, and check permissions on the table. We obtain
+ * AccessExclusiveLock right away to avoid lock-upgrade hazard in the
+ * single-transaction case.
+ */
+ tableOid = RangeVarGetRelidExtended(stmt->relation,
+ AccessExclusiveLock,
+ 0,
+ RangeVarCallbackOwnsTable, NULL);
+ rel = table_open(tableOid, NoLock);
+
+ /*
+ * Reject clustering a remote temp table ... their local buffer
+ * manager is not going to cope.
+ */
+ if (RELATION_IS_OTHER_TEMP(rel))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot cluster temporary tables of other sessions")));
+
+ if (stmt->indexname == NULL)
+ {
+ ListCell *index;
+
+ /* We need to find the index that has indisclustered set. */
+ foreach(index, RelationGetIndexList(rel))
+ {
+ indexOid = lfirst_oid(index);
+ if (get_index_isclustered(indexOid))
+ break;
+ indexOid = InvalidOid;
+ }
+
+ if (!OidIsValid(indexOid))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("there is no previously clustered index for table \"%s\"",
+ stmt->relation->relname)));
+ }
+ else
+ {
+ /*
+ * The index is expected to be in the same namespace as the
+ * relation.
+ */
+ indexOid = get_relname_relid(stmt->indexname,
+ rel->rd_rel->relnamespace);
+ if (!OidIsValid(indexOid))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("index \"%s\" for table \"%s\" does not exist",
+ stmt->indexname, stmt->relation->relname)));
+ }
+
+ if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+ {
+ /* close relation, keep lock till commit */
+ table_close(rel, NoLock);
+
+ /* Do the job. */
+ cluster_rel(tableOid, indexOid, &params);
+
+ return;
+ }
+ }
+
+ /*
+ * By here, we know we are in a multi-table situation. In order to avoid
+ * holding locks for too long, we want to process each table in its own
+ * transaction. This forces us to disallow running inside a user
+ * transaction block.
+ */
+ PreventInTransactionBlock(isTopLevel, "CLUSTER");
+
+ /* Also, we need a memory context to hold our list of relations */
+ cluster_context = AllocSetContextCreate(PortalContext,
+ "Cluster",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * Either we're processing a partitioned table, or we were not given any
+ * table name at all. In either case, obtain a list of relations to
+ * process.
+ *
+ * In the former case, an index name must have been given, so we don't
+ * need to recheck its "indisclustered" bit, but we have to check that it
+ * is an index that we can cluster on. In the latter case, we set the
+ * option bit to have indisclustered verified.
+ *
+ * Rechecking the relation itself is necessary here in all cases.
+ */
+ params.options |= CLUOPT_RECHECK;
+ if (rel != NULL)
+ {
+ Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
+ check_index_is_clusterable(rel, indexOid, AccessShareLock);
+ rtcs = get_tables_to_cluster_partitioned(cluster_context, indexOid);
+
+ /* close relation, releasing lock on parent table */
+ table_close(rel, AccessExclusiveLock);
+ }
+ else
+ {
+ rtcs = get_tables_to_cluster(cluster_context);
+ params.options |= CLUOPT_RECHECK_ISCLUSTERED;
+ }
+
+ /* Do the job. */
+ cluster_multiple_rels(rtcs, &params);
+
+ /* Start a new transaction for the cleanup work. */
+ StartTransactionCommand();
+
+ /* Clean up working storage */
+ MemoryContextDelete(cluster_context);
+}
+
+/*
+ * Given a list of relations to cluster, process each of them in a separate
+ * transaction.
+ *
+ * We expect to be in a transaction at start, but there isn't one when we
+ * return.
+ */
+static void
+cluster_multiple_rels(List *rtcs, ClusterParams *params)
+{
+ ListCell *lc;
+
+ /* Commit to get out of starting transaction */
+ PopActiveSnapshot();
+ CommitTransactionCommand();
+
+ /* Cluster the tables, each in a separate transaction */
+ foreach(lc, rtcs)
+ {
+ RelToCluster *rtc = (RelToCluster *) lfirst(lc);
+
+ /* Start a new transaction for each relation. */
+ StartTransactionCommand();
+
+ /* functions in indexes may want a snapshot set */
+ PushActiveSnapshot(GetTransactionSnapshot());
+
+ /* Do the job. */
+ cluster_rel(rtc->tableOid, rtc->indexOid, params);
+
+ PopActiveSnapshot();
+ CommitTransactionCommand();
+ }
+}
+
+/*
+ * cluster_rel
+ *
+ * This clusters the table by creating a new, clustered table and
+ * swapping the relfilenodes of the new table and the old table, so
+ * the OID of the original table is preserved. Thus we do not lose
+ * GRANT, inheritance nor references to this table (this was a bug
+ * in releases through 7.3).
+ *
+ * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading
+ * the new table, it's better to create the indexes afterwards than to fill
+ * them incrementally while we load the table.
+ *
+ * If indexOid is InvalidOid, the table will be rewritten in physical order
+ * instead of index order. This is the new implementation of VACUUM FULL,
+ * and error messages should refer to the operation as VACUUM not CLUSTER.
+ */
+void
+cluster_rel(Oid tableOid, Oid indexOid, ClusterParams *params)
+{
+ Relation OldHeap;
+ Oid save_userid;
+ int save_sec_context;
+ int save_nestlevel;
+ bool verbose = ((params->options & CLUOPT_VERBOSE) != 0);
+ bool recheck = ((params->options & CLUOPT_RECHECK) != 0);
+
+ /* Check for user-requested abort. */
+ CHECK_FOR_INTERRUPTS();
+
+ pgstat_progress_start_command(PROGRESS_COMMAND_CLUSTER, tableOid);
+ if (OidIsValid(indexOid))
+ pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
+ PROGRESS_CLUSTER_COMMAND_CLUSTER);
+ else
+ pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
+ PROGRESS_CLUSTER_COMMAND_VACUUM_FULL);
+
+ /*
+ * We grab exclusive access to the target rel and index for the duration
+ * of the transaction. (This is redundant for the single-transaction
+ * case, since cluster() already did it.) The index lock is taken inside
+ * check_index_is_clusterable.
+ */
+ OldHeap = try_relation_open(tableOid, AccessExclusiveLock);
+
+ /* If the table has gone away, we can skip processing it */
+ if (!OldHeap)
+ {
+ pgstat_progress_end_command();
+ return;
+ }
+
+ /*
+ * Switch to the table owner's userid, so that any index functions are run
+ * as that user. Also lock down security-restricted operations and
+ * arrange to make GUC variable changes local to this command.
+ */
+ GetUserIdAndSecContext(&save_userid, &save_sec_context);
+ SetUserIdAndSecContext(OldHeap->rd_rel->relowner,
+ save_sec_context | SECURITY_RESTRICTED_OPERATION);
+ save_nestlevel = NewGUCNestLevel();
+
+ /*
+ * Since we may open a new transaction for each relation, we have to check
+ * that the relation still is what we think it is.
+ *
+ * If this is a single-transaction CLUSTER, we can skip these tests. We
+ * *must* skip the one on indisclustered since it would reject an attempt
+ * to cluster a not-previously-clustered index.
+ */
+ if (recheck)
+ {
+ /* Check that the user still owns the relation */
+ if (!pg_class_ownercheck(tableOid, save_userid))
+ {
+ relation_close(OldHeap, AccessExclusiveLock);
+ goto out;
+ }
+
+ /*
+ * Silently skip a temp table for a remote session. Only doing this
+ * check in the "recheck" case is appropriate (which currently means
+ * somebody is executing a database-wide CLUSTER or on a partitioned
+ * table), because there is another check in cluster() which will stop
+ * any attempt to cluster remote temp tables by name. There is
+ * another check in cluster_rel which is redundant, but we leave it
+ * for extra safety.
+ */
+ if (RELATION_IS_OTHER_TEMP(OldHeap))
+ {
+ relation_close(OldHeap, AccessExclusiveLock);
+ goto out;
+ }
+
+ if (OidIsValid(indexOid))
+ {
+ /*
+ * Check that the index still exists
+ */
+ if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
+ {
+ relation_close(OldHeap, AccessExclusiveLock);
+ goto out;
+ }
+
+ /*
+ * Check that the index is still the one with indisclustered set,
+ * if needed.
+ */
+ if ((params->options & CLUOPT_RECHECK_ISCLUSTERED) != 0 &&
+ !get_index_isclustered(indexOid))
+ {
+ relation_close(OldHeap, AccessExclusiveLock);
+ goto out;
+ }
+ }
+ }
+
+ /*
+ * We allow VACUUM FULL, but not CLUSTER, on shared catalogs. CLUSTER
+ * would work in most respects, but the index would only get marked as
+ * indisclustered in the current database, leading to unexpected behavior
+ * if CLUSTER were later invoked in another database.
+ */
+ if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot cluster a shared catalog")));
+
+ /*
+ * Don't process temp tables of other backends ... their local buffer
+ * manager is not going to cope.
+ */
+ if (RELATION_IS_OTHER_TEMP(OldHeap))
+ {
+ if (OidIsValid(indexOid))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot cluster temporary tables of other sessions")));
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot vacuum temporary tables of other sessions")));
+ }
+
+ /*
+ * Also check for active uses of the relation in the current transaction,
+ * including open scans and pending AFTER trigger events.
+ */
+ CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM");
+
+ /* Check heap and index are valid to cluster on */
+ if (OidIsValid(indexOid))
+ check_index_is_clusterable(OldHeap, indexOid, AccessExclusiveLock);
+
+ /*
+ * Quietly ignore the request if this is a materialized view which has not
+ * been populated from its query. No harm is done because there is no data
+ * to deal with, and we don't want to throw an error if this is part of a
+ * multi-relation request -- for example, CLUSTER was run on the entire
+ * database.
+ */
+ if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW &&
+ !RelationIsPopulated(OldHeap))
+ {
+ relation_close(OldHeap, AccessExclusiveLock);
+ goto out;
+ }
+
+ Assert(OldHeap->rd_rel->relkind == RELKIND_RELATION ||
+ OldHeap->rd_rel->relkind == RELKIND_MATVIEW ||
+ OldHeap->rd_rel->relkind == RELKIND_TOASTVALUE);
+
+ /*
+ * All predicate locks on the tuples or pages are about to be made
+ * invalid, because we move tuples around. Promote them to relation
+ * locks. Predicate locks on indexes will be promoted when they are
+ * reindexed.
+ */
+ TransferPredicateLocksToHeapRelation(OldHeap);
+
+ /* rebuild_relation does all the dirty work */
+ rebuild_relation(OldHeap, indexOid, verbose);
+
+ /* NB: rebuild_relation does table_close() on OldHeap */
+
+out:
+ /* Roll back any GUC changes executed by index functions */
+ AtEOXact_GUC(false, save_nestlevel);
+
+ /* Restore userid and security context */
+ SetUserIdAndSecContext(save_userid, save_sec_context);
+
+ pgstat_progress_end_command();
+}
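+
+/*
+ * Editorial sketch (not in the upstream file): VACUUM FULL reaches the
+ * function above with InvalidOid for the index, selecting the physical-order
+ * rewrite path. A minimal caller, assuming a valid tableOid, would be:
+ *
+ *   ClusterParams params = {0};
+ *   cluster_rel(tableOid, InvalidOid, &params);
+ */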
+
+/*
+ * Verify that the specified heap and index are valid to cluster on
+ *
+ * Side effect: obtains lock on the index. The caller may
+ * in some cases already have AccessExclusiveLock on the table, but
+ * not in all cases so we can't rely on the table-level lock for
+ * protection here.
+ */
+void
+check_index_is_clusterable(Relation OldHeap, Oid indexOid, LOCKMODE lockmode)
+{
+ Relation OldIndex;
+
+ OldIndex = index_open(indexOid, lockmode);
+
+ /*
+ * Check that index is in fact an index on the given relation
+ */
+ if (OldIndex->rd_index == NULL ||
+ OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("\"%s\" is not an index for table \"%s\"",
+ RelationGetRelationName(OldIndex),
+ RelationGetRelationName(OldHeap))));
+
+ /* Index AM must allow clustering */
+ if (!OldIndex->rd_indam->amclusterable)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
+ RelationGetRelationName(OldIndex))));
+
+ /*
+ * Disallow clustering on incomplete indexes (those that might not index
+ * every row of the relation). We could relax this by making a separate
+ * seqscan pass over the table to copy the missing rows, but that seems
+ * expensive and tedious.
+ */
+ if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred, NULL))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot cluster on partial index \"%s\"",
+ RelationGetRelationName(OldIndex))));
+
+ /*
+ * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
+ * it might well not contain entries for every heap row, or might not even
+ * be internally consistent. (But note that we don't check indcheckxmin;
+ * the worst consequence of following broken HOT chains would be that we
+ * might put recently-dead tuples out-of-order in the new table, and there
+ * is little harm in that.)
+ */
+ if (!OldIndex->rd_index->indisvalid)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot cluster on invalid index \"%s\"",
+ RelationGetRelationName(OldIndex))));
+
+ /* Drop relcache refcnt on OldIndex, but keep lock */
+ index_close(OldIndex, NoLock);
+}
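+
+/*
+ * Editorial illustration (hypothetical objects): a partial index such as
+ *
+ *   CREATE INDEX idx ON t (a) WHERE a > 0;
+ *
+ * is rejected by the predicate check above, and an index whose access method
+ * sets amclusterable = false (e.g. hash or GIN) fails the AM check.
+ */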
+
+/*
+ * mark_index_clustered: mark the specified index as the one clustered on
+ *
+ * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
+ */
+void
+mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
+{
+ HeapTuple indexTuple;
+ Form_pg_index indexForm;
+ Relation pg_index;
+ ListCell *index;
+
+ /* Disallow applying to a partitioned table */
+ if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot mark index clustered in partitioned table")));
+
+ /*
+ * If the index is already marked clustered, no need to do anything.
+ */
+ if (OidIsValid(indexOid))
+ {
+ if (get_index_isclustered(indexOid))
+ return;
+ }
+
+ /*
+ * Check each index of the relation and set/clear the bit as needed.
+ */
+ pg_index = table_open(IndexRelationId, RowExclusiveLock);
+
+ foreach(index, RelationGetIndexList(rel))
+ {
+ Oid thisIndexOid = lfirst_oid(index);
+
+ indexTuple = SearchSysCacheCopy1(INDEXRELID,
+ ObjectIdGetDatum(thisIndexOid));
+ if (!HeapTupleIsValid(indexTuple))
+ elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
+ indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
+
+ /*
+ * Unset the bit if set. We know it's wrong because we checked this
+ * earlier.
+ */
+ if (indexForm->indisclustered)
+ {
+ indexForm->indisclustered = false;
+ CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
+ }
+ else if (thisIndexOid == indexOid)
+ {
+ /* this was checked earlier, but let's be real sure */
+ if (!indexForm->indisvalid)
+ elog(ERROR, "cannot cluster on invalid index %u", indexOid);
+ indexForm->indisclustered = true;
+ CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
+ }
+
+ InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0,
+ InvalidOid, is_internal);
+
+ heap_freetuple(indexTuple);
+ }
+
+ table_close(pg_index, RowExclusiveLock);
+}
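+
+/*
+ * Editorial illustration (hypothetical table "t"): the effect of the update
+ * above is visible in the catalog, e.g.
+ *
+ *   SELECT indexrelid::regclass, indisclustered
+ *   FROM pg_index WHERE indrelid = 't'::regclass;
+ *
+ * which shows true only for the index just marked, false for all others.
+ */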
+
+/*
+ * rebuild_relation: rebuild an existing relation in index or physical order
+ *
+ * OldHeap: table to rebuild --- must be opened and exclusive-locked!
+ * indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
+ *
+ * NB: this routine closes OldHeap at the right time; caller should not.
+ */
+static void
+rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
+{
+ Oid tableOid = RelationGetRelid(OldHeap);
+ Oid accessMethod = OldHeap->rd_rel->relam;
+ Oid tableSpace = OldHeap->rd_rel->reltablespace;
+ Oid OIDNewHeap;
+ char relpersistence;
+ bool is_system_catalog;
+ bool swap_toast_by_content;
+ TransactionId frozenXid;
+ MultiXactId cutoffMulti;
+
+ if (OidIsValid(indexOid))
+ /* Mark the correct index as clustered */
+ mark_index_clustered(OldHeap, indexOid, true);
+
+ /* Remember info about rel before closing OldHeap */
+ relpersistence = OldHeap->rd_rel->relpersistence;
+ is_system_catalog = IsSystemRelation(OldHeap);
+
+ /* Close relcache entry, but keep lock until transaction commit */
+ table_close(OldHeap, NoLock);
+
+ /* Create the transient table that will receive the re-ordered data */
+ OIDNewHeap = make_new_heap(tableOid, tableSpace,
+ accessMethod,
+ relpersistence,
+ AccessExclusiveLock);
+
+ /* Copy the heap data into the new table in the desired order */
+ copy_table_data(OIDNewHeap, tableOid, indexOid, verbose,
+ &swap_toast_by_content, &frozenXid, &cutoffMulti);
+
+ /*
+ * Swap the physical files of the target and transient tables, then
+ * rebuild the target's indexes and throw away the transient table.
+ */
+ finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
+ swap_toast_by_content, false, true,
+ frozenXid, cutoffMulti,
+ relpersistence);
+}
+
+
+/*
+ * Create the transient table that will be filled with new data during
+ * CLUSTER, ALTER TABLE, and similar operations. The transient table
+ * duplicates the logical structure of the OldHeap, but will have the
+ * specified physical storage properties NewTableSpace, NewAccessMethod, and
+ * relpersistence.
+ *
+ * After this, the caller should load the new heap with transferred/modified
+ * data, then call finish_heap_swap to complete the operation.
+ */
+Oid
+make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, Oid NewAccessMethod,
+ char relpersistence, LOCKMODE lockmode)
+{
+ TupleDesc OldHeapDesc;
+ char NewHeapName[NAMEDATALEN];
+ Oid OIDNewHeap;
+ Oid toastid;
+ Relation OldHeap;
+ HeapTuple tuple;
+ Datum reloptions;
+ bool isNull;
+ Oid namespaceid;
+
+ OldHeap = table_open(OIDOldHeap, lockmode);
+ OldHeapDesc = RelationGetDescr(OldHeap);
+
+ /*
+ * Note that the NewHeap will not receive any of the defaults or
+ * constraints associated with the OldHeap; we don't need 'em, and there's
+ * no reason to spend cycles inserting them into the catalogs only to
+ * delete them.
+ */
+
+ /*
+ * But we do want to use the reloptions of the old heap for the new heap.
+ */
+ tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap));
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
+ reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
+ &isNull);
+ if (isNull)
+ reloptions = (Datum) 0;
+
+ if (relpersistence == RELPERSISTENCE_TEMP)
+ namespaceid = LookupCreationNamespace("pg_temp");
+ else
+ namespaceid = RelationGetNamespace(OldHeap);
+
+ /*
+ * Create the new heap, using a temporary name in the same namespace as
+ * the existing table. NOTE: there is some risk of collision with user
+ * relnames. Working around this seems more trouble than it's worth; in
+ * particular, we can't create the new heap in a different namespace from
+ * the old, or we will have problems with the TEMP status of temp tables.
+ *
+ * Note: the new heap is not a shared relation, even if we are rebuilding
+ * a shared rel. However, we do make the new heap mapped if the source is
+ * mapped. This simplifies swap_relation_files, and is absolutely
+ * necessary for rebuilding pg_class, for reasons explained there.
+ */
+ snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap);
+
+ OIDNewHeap = heap_create_with_catalog(NewHeapName,
+ namespaceid,
+ NewTableSpace,
+ InvalidOid,
+ InvalidOid,
+ InvalidOid,
+ OldHeap->rd_rel->relowner,
+ NewAccessMethod,
+ OldHeapDesc,
+ NIL,
+ RELKIND_RELATION,
+ relpersistence,
+ false,
+ RelationIsMapped(OldHeap),
+ ONCOMMIT_NOOP,
+ reloptions,
+ false,
+ true,
+ true,
+ OIDOldHeap,
+ NULL);
+ Assert(OIDNewHeap != InvalidOid);
+
+ ReleaseSysCache(tuple);
+
+ /*
+ * Advance command counter so that the newly-created relation's catalog
+ * tuples will be visible to table_open.
+ */
+ CommandCounterIncrement();
+
+ /*
+ * If necessary, create a TOAST table for the new relation.
+ *
+ * If the relation doesn't have a TOAST table already, we won't need one
+ * for the new relation. The other way around is possible though: if some
+ * wide columns have been dropped, NewHeapCreateToastTable can decide that
+ * no TOAST table is needed for the new table.
+ *
+ * Note that NewHeapCreateToastTable ends with CommandCounterIncrement, so
+ * that the TOAST table will be visible for insertion.
+ */
+ toastid = OldHeap->rd_rel->reltoastrelid;
+ if (OidIsValid(toastid))
+ {
+ /* keep the existing toast table's reloptions, if any */
+ tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid));
+ if (!HeapTupleIsValid(tuple))
+ elog(ERROR, "cache lookup failed for relation %u", toastid);
+ reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
+ &isNull);
+ if (isNull)
+ reloptions = (Datum) 0;
+
+ NewHeapCreateToastTable(OIDNewHeap, reloptions, lockmode, toastid);
+
+ ReleaseSysCache(tuple);
+ }
+
+ table_close(OldHeap, NoLock);
+
+ return OIDNewHeap;
+}
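+
+/*
+ * Editorial note (hypothetical OID): for a table with OID 16384 the code
+ * above creates a transient heap named "pg_temp_16384", placed in pg_temp
+ * for temp tables and in the source table's namespace otherwise; the caller
+ * fills it and then calls finish_heap_swap().
+ */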
+
+/*
+ * Do the physical copying of table data.
+ *
+ * There are three output parameters:
+ * *pSwapToastByContent is set true if toast tables must be swapped by content.
+ * *pFreezeXid receives the TransactionId used as freeze cutoff point.
+ * *pCutoffMulti receives the MultiXactId used as a cutoff point.
+ */
+static void
+copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
+ bool *pSwapToastByContent, TransactionId *pFreezeXid,
+ MultiXactId *pCutoffMulti)
+{
+ Relation NewHeap,
+ OldHeap,
+ OldIndex;
+ Relation relRelation;
+ HeapTuple reltup;
+ Form_pg_class relform;
+ TupleDesc oldTupDesc PG_USED_FOR_ASSERTS_ONLY;
+ TupleDesc newTupDesc PG_USED_FOR_ASSERTS_ONLY;
+ TransactionId OldestXmin,
+ FreezeXid;
+ MultiXactId OldestMxact,
+ MultiXactCutoff;
+ bool use_sort;
+ double num_tuples = 0,
+ tups_vacuumed = 0,
+ tups_recently_dead = 0;
+ BlockNumber num_pages;
+ int elevel = verbose ? INFO : DEBUG2;
+ PGRUsage ru0;
+ char *nspname;
+
+ pg_rusage_init(&ru0);
+
+ /*
+ * Open the relations we need.
+ */
+ NewHeap = table_open(OIDNewHeap, AccessExclusiveLock);
+ OldHeap = table_open(OIDOldHeap, AccessExclusiveLock);
+ if (OidIsValid(OIDOldIndex))
+ OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
+ else
+ OldIndex = NULL;
+
+ /* Store a copy of the namespace name for logging purposes */
+ nspname = get_namespace_name(RelationGetNamespace(OldHeap));
+
+ /*
+ * Their tuple descriptors should be exactly alike, but here we only need
+ * to assume that they have the same number of columns.
+ */
+ oldTupDesc = RelationGetDescr(OldHeap);
+ newTupDesc = RelationGetDescr(NewHeap);
+ Assert(newTupDesc->natts == oldTupDesc->natts);
+
+ /*
+ * If the OldHeap has a toast table, get lock on the toast table to keep
+ * it from being vacuumed. This is needed because autovacuum processes
+ * toast tables independently of their main tables, with no lock on the
+ * latter. If an autovacuum were to start on the toast table after we
+ * compute our OldestXmin below, it would use a later OldestXmin, and then
+ * possibly remove, as DEAD, toast tuples belonging to main tuples we think
+ * are only RECENTLY_DEAD. Then we'd fail while trying to copy those
+ * tuples.
+ *
+ * We don't need to open the toast relation here, just lock it. The lock
+ * will be held till end of transaction.
+ */
+ if (OldHeap->rd_rel->reltoastrelid)
+ LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
+
+ /*
+ * If both tables have TOAST tables, perform toast swap by content. It is
+ * possible that the old table has a toast table but the new one doesn't,
+ * if toastable columns have been dropped. In that case we have to do
+ * swap by links. This is okay because swap by content is only essential
+ * for system catalogs, and we don't support schema changes for them.
+ */
+ if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
+ {
+ *pSwapToastByContent = true;
+
+ /*
+ * When doing swap by content, any toast pointers written into NewHeap
+ * must use the old toast table's OID, because that's where the toast
+ * data will eventually be found. Set this up by setting rd_toastoid.
+ * This also tells toast_save_datum() to preserve the toast value
+ * OIDs, which we want so as not to invalidate toast pointers in
+ * system catalog caches, and to avoid making multiple copies of a
+ * single toast value.
+ *
+ * Note that we must hold NewHeap open until we are done writing data,
+ * since the relcache will not guarantee to remember this setting once
+ * the relation is closed. Also, this technique depends on the fact
+ * that no one will try to read from the NewHeap until after we've
+ * finished writing it and swapping the rels --- otherwise they could
+ * follow the toast pointers to the wrong place. (It would actually
+ * work for values copied over from the old toast table, but not for
+ * any values that we toast which were previously not toasted.)
+ */
+ NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
+ }
+ else
+ *pSwapToastByContent = false;
+
+ /*
+ * Compute xids used to freeze and weed out dead tuples and multixacts.
+ * Since we're going to rewrite the whole table anyway, there's no reason
+ * not to be aggressive about this.
+ */
+ vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0, &OldestXmin, &OldestMxact,
+ &FreezeXid, &MultiXactCutoff);
+
+ /*
+ * FreezeXid will become the table's new relfrozenxid, and that mustn't go
+ * backwards, so take the max.
+ */
+ if (TransactionIdIsValid(OldHeap->rd_rel->relfrozenxid) &&
+ TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
+ FreezeXid = OldHeap->rd_rel->relfrozenxid;
+
+ /*
+ * MultiXactCutoff, similarly, shouldn't go backwards either.
+ */
+ if (MultiXactIdIsValid(OldHeap->rd_rel->relminmxid) &&
+ MultiXactIdPrecedes(MultiXactCutoff, OldHeap->rd_rel->relminmxid))
+ MultiXactCutoff = OldHeap->rd_rel->relminmxid;
+
+ /*
+ * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
+ * the OldHeap. We know how to use a sort to duplicate the ordering of a
+ * btree index, and will use seqscan-and-sort for that case if the planner
+ * tells us it's cheaper. Otherwise, always indexscan if an index is
+ * provided, else plain seqscan.
+ */
+ if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
+ use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
+ else
+ use_sort = false;
+
+ /* Log what we're doing */
+ if (OldIndex != NULL && !use_sort)
+ ereport(elevel,
+ (errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
+ nspname,
+ RelationGetRelationName(OldHeap),
+ RelationGetRelationName(OldIndex))));
+ else if (use_sort)
+ ereport(elevel,
+ (errmsg("clustering \"%s.%s\" using sequential scan and sort",
+ nspname,
+ RelationGetRelationName(OldHeap))));
+ else
+ ereport(elevel,
+ (errmsg("vacuuming \"%s.%s\"",
+ nspname,
+ RelationGetRelationName(OldHeap))));
+
+ /*
+ * Hand off the actual copying to AM specific function, the generic code
+ * cannot know how to deal with visibility across AMs. Note that this
+ * routine is allowed to set FreezeXid / MultiXactCutoff to different
+ * values (e.g. because the AM doesn't use freezing).
+ */
+ table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort,
+ OldestXmin, &FreezeXid, &MultiXactCutoff,
+ &num_tuples, &tups_vacuumed,
+ &tups_recently_dead);
+
+ /* Return selected values to caller; they become relfrozenxid/relminmxid */
+ *pFreezeXid = FreezeXid;
+ *pCutoffMulti = MultiXactCutoff;
+
+ /* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
+ NewHeap->rd_toastoid = InvalidOid;
+
+ num_pages = RelationGetNumberOfBlocks(NewHeap);
+
+ /* Log what we did */
+ ereport(elevel,
+ (errmsg("\"%s.%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
+ nspname,
+ RelationGetRelationName(OldHeap),
+ tups_vacuumed, num_tuples,
+ RelationGetNumberOfBlocks(OldHeap)),
+ errdetail("%.0f dead row versions cannot be removed yet.\n"
+ "%s.",
+ tups_recently_dead,
+ pg_rusage_show(&ru0))));
+
+ if (OldIndex != NULL)
+ index_close(OldIndex, NoLock);
+ table_close(OldHeap, NoLock);
+ table_close(NewHeap, NoLock);
+
+ /* Update pg_class to reflect the correct values of pages and tuples. */
+ relRelation = table_open(RelationRelationId, RowExclusiveLock);
+
+ reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDNewHeap));
+ if (!HeapTupleIsValid(reltup))
+ elog(ERROR, "cache lookup failed for relation %u", OIDNewHeap);
+ relform = (Form_pg_class) GETSTRUCT(reltup);
+
+ relform->relpages = num_pages;
+ relform->reltuples = num_tuples;
+
+ /* Don't update the stats for pg_class. See swap_relation_files. */
+ if (OIDOldHeap != RelationRelationId)
+ CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
+ else
+ CacheInvalidateRelcacheByTuple(reltup);
+
+ /* Clean up. */
+ heap_freetuple(reltup);
+ table_close(relRelation, RowExclusiveLock);
+
+ /* Make the update visible */
+ CommandCounterIncrement();
+}
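+
+/*
+ * Editorial illustration: with VERBOSE, the messages above surface as, e.g.
+ * (hypothetical relation and counts):
+ *
+ *   INFO:  clustering "public.t" using sequential scan and sort
+ *   INFO:  "public.t": found 0 removable, 1000 nonremovable row versions
+ *          in 9 pages
+ */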
+
+/*
+ * Swap the physical files of two given relations.
+ *
+ * We swap the physical identity (reltablespace, relfilenode) while keeping the
+ * same logical identities of the two relations. relpersistence is also
+ * swapped, which is critical since it determines where buffers live for each
+ * relation.
+ *
+ * We can swap associated TOAST data in either of two ways: recursively swap
+ * the physical content of the toast tables (and their indexes), or swap the
+ * TOAST links in the given relations' pg_class entries. The former is needed
+ * to manage rewrites of shared catalogs (where we cannot change the pg_class
+ * links) while the latter is the only way to handle cases in which a toast
+ * table is added or removed altogether.
+ *
+ * Additionally, the first relation is marked with relfrozenxid set to
+ * frozenXid. It seems a bit ugly to have this here, but the caller would
+ * have to do it anyway, so having it here saves a heap_update. Note: in
+ * the swap-toast-links case, we assume we don't need to change the toast
+ * table's relfrozenxid: the new version of the toast table should already
+ * have relfrozenxid set to RecentXmin, which is good enough.
+ *
+ * Lastly, if r2 and its toast table and toast index (if any) are mapped,
+ * their OIDs are emitted into mapped_tables[]. This is hacky but beats
+ * having to look the information up again later in finish_heap_swap.
+ */
+static void
+swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
+ bool swap_toast_by_content,
+ bool is_internal,
+ TransactionId frozenXid,
+ MultiXactId cutoffMulti,
+ Oid *mapped_tables)
+{
+ Relation relRelation;
+ HeapTuple reltup1,
+ reltup2;
+ Form_pg_class relform1,
+ relform2;
+ Oid relfilenode1,
+ relfilenode2;
+ Oid swaptemp;
+ char swptmpchr;
+ Oid relam1,
+ relam2;
+
+ /* We need writable copies of both pg_class tuples. */
+ relRelation = table_open(RelationRelationId, RowExclusiveLock);
+
+ reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
+ if (!HeapTupleIsValid(reltup1))
+ elog(ERROR, "cache lookup failed for relation %u", r1);
+ relform1 = (Form_pg_class) GETSTRUCT(reltup1);
+
+ reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
+ if (!HeapTupleIsValid(reltup2))
+ elog(ERROR, "cache lookup failed for relation %u", r2);
+ relform2 = (Form_pg_class) GETSTRUCT(reltup2);
+
+ relfilenode1 = relform1->relfilenode;
+ relfilenode2 = relform2->relfilenode;
+ relam1 = relform1->relam;
+ relam2 = relform2->relam;
+
+ if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
+ {
+ /*
+ * Normal non-mapped relations: swap relfilenodes, reltablespaces,
+ * relpersistence
+ */
+ Assert(!target_is_pg_class);
+
+ swaptemp = relform1->relfilenode;
+ relform1->relfilenode = relform2->relfilenode;
+ relform2->relfilenode = swaptemp;
+
+ swaptemp = relform1->reltablespace;
+ relform1->reltablespace = relform2->reltablespace;
+ relform2->reltablespace = swaptemp;
+
+ swaptemp = relform1->relam;
+ relform1->relam = relform2->relam;
+ relform2->relam = swaptemp;
+
+ swptmpchr = relform1->relpersistence;
+ relform1->relpersistence = relform2->relpersistence;
+ relform2->relpersistence = swptmpchr;
+
+ /* Also swap toast links, if we're swapping by links */
+ if (!swap_toast_by_content)
+ {
+ swaptemp = relform1->reltoastrelid;
+ relform1->reltoastrelid = relform2->reltoastrelid;
+ relform2->reltoastrelid = swaptemp;
+ }
+ }
+ else
+ {
+ /*
+ * Mapped-relation case. Here we have to swap the relation mappings
+ * instead of modifying the pg_class columns. Both must be mapped.
+ */
+ if (OidIsValid(relfilenode1) || OidIsValid(relfilenode2))
+ elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
+ NameStr(relform1->relname));
+
+ /*
+ * We can't change the tablespace nor persistence of a mapped rel, and
+ * we can't handle toast link swapping for one either, because we must
+ * not apply any critical changes to its pg_class row. These cases
+ * should be prevented by upstream permissions tests, so these checks
+ * are a non-user-facing emergency backstop.
+ */
+ if (relform1->reltablespace != relform2->reltablespace)
+ elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
+ NameStr(relform1->relname));
+ if (relform1->relpersistence != relform2->relpersistence)
+ elog(ERROR, "cannot change persistence of mapped relation \"%s\"",
+ NameStr(relform1->relname));
+ if (relform1->relam != relform2->relam)
+ elog(ERROR, "cannot change access method of mapped relation \"%s\"",
+ NameStr(relform1->relname));
+ if (!swap_toast_by_content &&
+ (relform1->reltoastrelid || relform2->reltoastrelid))
+ elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
+ NameStr(relform1->relname));
+
+ /*
+ * Fetch the mappings --- shouldn't fail, but be paranoid
+ */
+ relfilenode1 = RelationMapOidToFilenode(r1, relform1->relisshared);
+ if (!OidIsValid(relfilenode1))
+ elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
+ NameStr(relform1->relname), r1);
+ relfilenode2 = RelationMapOidToFilenode(r2, relform2->relisshared);
+ if (!OidIsValid(relfilenode2))
+ elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
+ NameStr(relform2->relname), r2);
+
+ /*
+ * Send replacement mappings to relmapper. Note these won't actually
+ * take effect until CommandCounterIncrement.
+ */
+ RelationMapUpdateMap(r1, relfilenode2, relform1->relisshared, false);
+ RelationMapUpdateMap(r2, relfilenode1, relform2->relisshared, false);
+
+ /* Pass OIDs of mapped r2 tables back to caller */
+ *mapped_tables++ = r2;
+ }
+
+ /*
+ * Recognize that rel1's relfilenode (swapped from rel2) is new in this
+ * subtransaction. The rel2 storage (swapped from rel1) may or may not be
+ * new.
+ */
+ {
+ Relation rel1,
+ rel2;
+
+ rel1 = relation_open(r1, NoLock);
+ rel2 = relation_open(r2, NoLock);
+ rel2->rd_createSubid = rel1->rd_createSubid;
+ rel2->rd_newRelfilenodeSubid = rel1->rd_newRelfilenodeSubid;
+ rel2->rd_firstRelfilenodeSubid = rel1->rd_firstRelfilenodeSubid;
+ RelationAssumeNewRelfilenode(rel1);
+ relation_close(rel1, NoLock);
+ relation_close(rel2, NoLock);
+ }
+
+ /*
+ * In the case of a shared catalog, these next few steps will only affect
+ * our own database's pg_class row; but that's okay, because they are all
+ * noncritical updates. That's also an important fact for the case of a
+ * mapped catalog, because it's possible that we'll commit the map change
+ * and then fail to commit the pg_class update.
+ */
+
+ /* set rel1's frozen Xid and minimum MultiXid */
+ if (relform1->relkind != RELKIND_INDEX)
+ {
+ Assert(!TransactionIdIsValid(frozenXid) ||
+ TransactionIdIsNormal(frozenXid));
+ relform1->relfrozenxid = frozenXid;
+ relform1->relminmxid = cutoffMulti;
+ }
+
+ /* swap size statistics too, since new rel has freshly-updated stats */
+ {
+ int32 swap_pages;
+ float4 swap_tuples;
+ int32 swap_allvisible;
+
+ swap_pages = relform1->relpages;
+ relform1->relpages = relform2->relpages;
+ relform2->relpages = swap_pages;
+
+ swap_tuples = relform1->reltuples;
+ relform1->reltuples = relform2->reltuples;
+ relform2->reltuples = swap_tuples;
+
+ swap_allvisible = relform1->relallvisible;
+ relform1->relallvisible = relform2->relallvisible;
+ relform2->relallvisible = swap_allvisible;
+ }
+
+ /*
+ * Update the tuples in pg_class --- unless the target relation of the
+ * swap is pg_class itself. In that case, there is zero point in making
+ * changes because we'd be updating the old data that we're about to throw
+ * away. Because the real work being done here for a mapped relation is
+ * just to change the relation map settings, it's all right to not update
+ * the pg_class rows in this case. The most important changes will instead
+ * be performed later, in finish_heap_swap() itself.
+ */
+ if (!target_is_pg_class)
+ {
+ CatalogIndexState indstate;
+
+ indstate = CatalogOpenIndexes(relRelation);
+ CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1,
+ indstate);
+ CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2,
+ indstate);
+ CatalogCloseIndexes(indstate);
+ }
+ else
+ {
+ /* no update ... but we do still need relcache inval */
+ CacheInvalidateRelcacheByTuple(reltup1);
+ CacheInvalidateRelcacheByTuple(reltup2);
+ }
+
+ /*
+ * Now that pg_class has been updated with its relevant information for
+ * the swap, update the dependency of the relations to point to their new
+ * table AM, if it has changed.
+ */
+ if (relam1 != relam2)
+ {
+ if (changeDependencyFor(RelationRelationId,
+ r1,
+ AccessMethodRelationId,
+ relam1,
+ relam2) != 1)
+ elog(ERROR, "failed to change access method dependency for relation \"%s.%s\"",
+ get_namespace_name(get_rel_namespace(r1)),
+ get_rel_name(r1));
+ if (changeDependencyFor(RelationRelationId,
+ r2,
+ AccessMethodRelationId,
+ relam2,
+ relam1) != 1)
+ elog(ERROR, "failed to change access method dependency for relation \"%s.%s\"",
+ get_namespace_name(get_rel_namespace(r2)),
+ get_rel_name(r2));
+ }
+
+ /*
+ * Post alter hook for modified relations. The change to r2 is always
+ * internal, but r1 depends on the invocation context.
+ */
+ InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
+ InvalidOid, is_internal);
+ InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
+ InvalidOid, true);
+
+ /*
+ * If we have toast tables associated with the relations being swapped,
+ * deal with them too.
+ */
+ if (relform1->reltoastrelid || relform2->reltoastrelid)
+ {
+ if (swap_toast_by_content)
+ {
+ if (relform1->reltoastrelid && relform2->reltoastrelid)
+ {
+ /* Recursively swap the contents of the toast tables */
+ swap_relation_files(relform1->reltoastrelid,
+ relform2->reltoastrelid,
+ target_is_pg_class,
+ swap_toast_by_content,
+ is_internal,
+ frozenXid,
+ cutoffMulti,
+ mapped_tables);
+ }
+ else
+ {
+ /* caller messed up */
+ elog(ERROR, "cannot swap toast files by content when there's only one");
+ }
+ }
+ else
+ {
+ /*
+ * We swapped the ownership links, so we need to change dependency
+ * data to match.
+ *
+ * NOTE: it is possible that only one table has a toast table.
+ *
+ * NOTE: at present, a TOAST table's only dependency is the one on
+ * its owning table. If more are ever created, we'd need to use
+ * something more selective than deleteDependencyRecordsFor() to
+ * get rid of just the link we want.
+ */
+ ObjectAddress baseobject,
+ toastobject;
+ long count;
+
+ /*
+ * We disallow this case for system catalogs, to avoid the
+ * possibility that the catalog we're rebuilding is one of the
+ * ones the dependency changes would change. It's too late to be
+ * making any data changes to the target catalog.
+ */
+ if (IsSystemClass(r1, relform1))
+ elog(ERROR, "cannot swap toast files by links for system catalogs");
+
+ /* Delete old dependencies */
+ if (relform1->reltoastrelid)
+ {
+ count = deleteDependencyRecordsFor(RelationRelationId,
+ relform1->reltoastrelid,
+ false);
+ if (count != 1)
+ elog(ERROR, "expected one dependency record for TOAST table, found %ld",
+ count);
+ }
+ if (relform2->reltoastrelid)
+ {
+ count = deleteDependencyRecordsFor(RelationRelationId,
+ relform2->reltoastrelid,
+ false);
+ if (count != 1)
+ elog(ERROR, "expected one dependency record for TOAST table, found %ld",
+ count);
+ }
+
+ /* Register new dependencies */
+ baseobject.classId = RelationRelationId;
+ baseobject.objectSubId = 0;
+ toastobject.classId = RelationRelationId;
+ toastobject.objectSubId = 0;
+
+ if (relform1->reltoastrelid)
+ {
+ baseobject.objectId = r1;
+ toastobject.objectId = relform1->reltoastrelid;
+ recordDependencyOn(&toastobject, &baseobject,
+ DEPENDENCY_INTERNAL);
+ }
+
+ if (relform2->reltoastrelid)
+ {
+ baseobject.objectId = r2;
+ toastobject.objectId = relform2->reltoastrelid;
+ recordDependencyOn(&toastobject, &baseobject,
+ DEPENDENCY_INTERNAL);
+ }
+ }
+ }
+
+ /*
+ * If we're swapping two toast tables by content, do the same for their
+ * valid index. The swap can actually be safely done only if the relations
+ * have indexes.
+ */
+ if (swap_toast_by_content &&
+ relform1->relkind == RELKIND_TOASTVALUE &&
+ relform2->relkind == RELKIND_TOASTVALUE)
+ {
+ Oid toastIndex1,
+ toastIndex2;
+
+ /* Get valid index for each relation */
+ toastIndex1 = toast_get_valid_index(r1,
+ AccessExclusiveLock);
+ toastIndex2 = toast_get_valid_index(r2,
+ AccessExclusiveLock);
+
+ swap_relation_files(toastIndex1,
+ toastIndex2,
+ target_is_pg_class,
+ swap_toast_by_content,
+ is_internal,
+ InvalidTransactionId,
+ InvalidMultiXactId,
+ mapped_tables);
+ }
+
+ /* Clean up. */
+ heap_freetuple(reltup1);
+ heap_freetuple(reltup2);
+
+ table_close(relRelation, RowExclusiveLock);
+
+ /*
+ * Close both relcache entries' smgr links. We need this kluge because
+ * both links will be invalidated during upcoming CommandCounterIncrement.
+ * Whichever of the rels is the second to be cleared will have a dangling
+ * reference to the other's smgr entry. Rather than trying to avoid this
+ * by ordering operations just so, it's easiest to close the links first.
+ * (Fortunately, since one of the entries is local in our transaction,
+ * it's sufficient to clear out our own relcache this way; the problem
+ * cannot arise for other backends when they see our update on the
+ * non-transient relation.)
+ *
+ * Caution: the placement of this step interacts with the decision to
+ * handle toast rels by recursion. When we are trying to rebuild pg_class
+ * itself, the smgr close on pg_class must happen after all accesses in
+ * this function.
+ */
+ RelationCloseSmgrByOid(r1);
+ RelationCloseSmgrByOid(r2);
+}
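+
+/*
+ * Editorial worked example (hypothetical OIDs): let table t have OID 16384
+ * and relfilenode 16384, and the transient pg_temp_16384 have OID 16390 and
+ * relfilenode 16390.  After swap_relation_files(), t's pg_class row points at
+ * relfilenode 16390 and the transient row at 16384; when finish_heap_swap()
+ * drops the transient table, the old storage goes with it while t keeps its
+ * OID, so GRANTs, inheritance, and references survive.
+ */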
+
+/*
+ * Remove the transient table that was built by make_new_heap, and finish
+ * cleaning up (including rebuilding all indexes on the old heap).
+ */
+void
+finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
+ bool is_system_catalog,
+ bool swap_toast_by_content,
+ bool check_constraints,
+ bool is_internal,
+ TransactionId frozenXid,
+ MultiXactId cutoffMulti,
+ char newrelpersistence)
+{
+ ObjectAddress object;
+ Oid mapped_tables[4];
+ int reindex_flags;
+ ReindexParams reindex_params = {0};
+ int i;
+
+ /* Report that we are now swapping relation files */
+ pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
+ PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES);
+
+ /* Zero out possible results from swap_relation_files */
+ memset(mapped_tables, 0, sizeof(mapped_tables));
+
+ /*
+ * Swap the contents of the heap relations (including any toast tables).
+ * Also set old heap's relfrozenxid to frozenXid.
+ */
+ swap_relation_files(OIDOldHeap, OIDNewHeap,
+ (OIDOldHeap == RelationRelationId),
+ swap_toast_by_content, is_internal,
+ frozenXid, cutoffMulti, mapped_tables);
+
+ /*
+ * If it's a system catalog, queue a sinval message to flush all catcaches
+ * on the catalog when we reach CommandCounterIncrement.
+ */
+ if (is_system_catalog)
+ CacheInvalidateCatalog(OIDOldHeap);
+
+ /*
+ * Rebuild each index on the relation (but not the toast table, which is
+ * all-new at this point). It is important to do this before the DROP
+ * step because if we are processing a system catalog that will be used
+ * during DROP, we want to have its indexes available. There is no
+ * advantage to the other order anyway because this is all transactional,
+ * so no chance to reclaim disk space before commit. We do not need a
+ * final CommandCounterIncrement() because reindex_relation does it.
+ *
+ * Note: because index_build is called via reindex_relation, it will never
+ * set indcheckxmin true for the indexes. This is OK even though in some
+ * sense we are building new indexes rather than rebuilding existing ones,
+ * because the new heap won't contain any HOT chains at all, let alone
+ * broken ones, so it can't be necessary to set indcheckxmin.
+ */
+ reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
+ if (check_constraints)
+ reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
+
+ /*
+ * Ensure that the indexes have the same persistence as the parent
+ * relation.
+ */
+ if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
+ reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
+ else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
+ reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;
+
+ /* Report that we are now reindexing relations */
+ pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
+ PROGRESS_CLUSTER_PHASE_REBUILD_INDEX);
+
+ reindex_relation(OIDOldHeap, reindex_flags, &reindex_params);
+
+ /* Report that we are now doing clean up */
+ pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
+ PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP);
+
+ /*
+ * If the relation being rebuilt is pg_class, swap_relation_files()
+ * couldn't update pg_class's own pg_class entry (check comments in
+ * swap_relation_files()), thus relfrozenxid was not updated. That's
+ * annoying because a potential reason for doing a VACUUM FULL is an
+ * imminent or actual anti-wraparound shutdown. So, now that we can
+ * access the new relation using its indices, update relfrozenxid.
+ * pg_class doesn't have a toast relation, so we don't need to update the
+ * corresponding toast relation. Note that there's little point in moving
+ * all relfrozenxid updates here, since swap_relation_files() needs to
+ * write to pg_class for non-mapped relations anyway.
+ */
+ if (OIDOldHeap == RelationRelationId)
+ {
+ Relation relRelation;
+ HeapTuple reltup;
+ Form_pg_class relform;
+
+ relRelation = table_open(RelationRelationId, RowExclusiveLock);
+
+ reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap));
+ if (!HeapTupleIsValid(reltup))
+ elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
+ relform = (Form_pg_class) GETSTRUCT(reltup);
+
+ relform->relfrozenxid = frozenXid;
+ relform->relminmxid = cutoffMulti;
+
+ CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
+
+ table_close(relRelation, RowExclusiveLock);
+ }
+
+ /* Destroy new heap with old filenode */
+ object.classId = RelationRelationId;
+ object.objectId = OIDNewHeap;
+ object.objectSubId = 0;
+
+ /*
+ * The new relation is local to our transaction and we know nothing
+ * depends on it, so DROP_RESTRICT should be OK.
+ */
+ performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
+
+ /* performDeletion does CommandCounterIncrement at end */
+
+ /*
+ * Now we must remove any relation mapping entries that we set up for the
+ * transient table, as well as its toast table and toast index if any. If
+ * we fail to do this before commit, the relmapper will complain about new
+ * permanent map entries being added post-bootstrap.
+ */
+ for (i = 0; OidIsValid(mapped_tables[i]); i++)
+ RelationMapRemoveMapping(mapped_tables[i]);
+
+ /*
+ * At this point, everything is kosher except that, if we did toast swap
+ * by links, the toast table's name corresponds to the transient table.
+ * The name is irrelevant to the backend because it's referenced by OID,
+ * but users looking at the catalogs could be confused. Rename it to
+ * prevent this problem.
+ *
+ * Note no lock required on the relation, because we already hold an
+ * exclusive lock on it.
+ */
+ if (!swap_toast_by_content)
+ {
+ Relation newrel;
+
+ newrel = table_open(OIDOldHeap, NoLock);
+ if (OidIsValid(newrel->rd_rel->reltoastrelid))
+ {
+ Oid toastidx;
+ char NewToastName[NAMEDATALEN];
+
+ /* Get the associated valid index to be renamed */
+ toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid,
+ NoLock);
+
+ /* rename the toast table ... */
+ snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
+ OIDOldHeap);
+ RenameRelationInternal(newrel->rd_rel->reltoastrelid,
+ NewToastName, true, false);
+
+ /* ... and its valid index too. */
+ snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index",
+ OIDOldHeap);
+
+ RenameRelationInternal(toastidx,
+ NewToastName, true, true);
+
+ /*
+ * Reset the relrewrite for the toast. The command-counter
+ * increment is required here as we are about to update the tuple
+ * that is updated as part of RenameRelationInternal.
+ */
+ CommandCounterIncrement();
+ ResetRelRewrite(newrel->rd_rel->reltoastrelid);
+ }
+ relation_close(newrel, NoLock);
+ }
+
+ /* if it's not a catalog table, clear any missing attribute settings */
+ if (!is_system_catalog)
+ {
+ Relation newrel;
+
+ newrel = table_open(OIDOldHeap, NoLock);
+ RelationClearMissing(newrel);
+ relation_close(newrel, NoLock);
+ }
+}
+
+
+/*
+ * Get a list of tables that the current user owns and that have
+ * indisclustered set. Return the list as a List * of RelToCluster
+ * (stored in the specified memory context), each one giving the tableOid
+ * and the indexOid on which the table is already clustered.
+ */
+static List *
+get_tables_to_cluster(MemoryContext cluster_context)
+{
+ Relation indRelation;
+ TableScanDesc scan;
+ ScanKeyData entry;
+ HeapTuple indexTuple;
+ Form_pg_index index;
+ MemoryContext old_context;
+ List *rtcs = NIL;
+
+ /*
+ * Get all indexes that have indisclustered set and are owned by
+ * the appropriate user.
+ */
+ indRelation = table_open(IndexRelationId, AccessShareLock);
+ ScanKeyInit(&entry,
+ Anum_pg_index_indisclustered,
+ BTEqualStrategyNumber, F_BOOLEQ,
+ BoolGetDatum(true));
+ scan = table_beginscan_catalog(indRelation, 1, &entry);
+ while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+ {
+ RelToCluster *rtc;
+
+ index = (Form_pg_index) GETSTRUCT(indexTuple);
+
+ if (!pg_class_ownercheck(index->indrelid, GetUserId()))
+ continue;
+
+ /* Use a permanent memory context for the result list */
+ old_context = MemoryContextSwitchTo(cluster_context);
+
+ rtc = (RelToCluster *) palloc(sizeof(RelToCluster));
+ rtc->tableOid = index->indrelid;
+ rtc->indexOid = index->indexrelid;
+ rtcs = lappend(rtcs, rtc);
+
+ MemoryContextSwitchTo(old_context);
+ }
+ table_endscan(scan);
+
+ relation_close(indRelation, AccessShareLock);
+
+ return rtcs;
+}
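+
+/*
+ * Editorial note: the catalog scan above is roughly equivalent to
+ *
+ *   SELECT indrelid, indexrelid FROM pg_index WHERE indisclustered;
+ *
+ * restricted to tables for which pg_class_ownercheck() passes for the
+ * current user.
+ */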
+
+/*
+ * Given an index on a partitioned table, return a list of RelToCluster for
+ * all of its leaf partitions and their corresponding index partitions.
+ *
+ * Like expand_vacuum_rel, but here caller must hold AccessExclusiveLock
+ * on the table containing the index.
+ */
+static List *
+get_tables_to_cluster_partitioned(MemoryContext cluster_context, Oid indexOid)
+{
+ List *inhoids;
+ ListCell *lc;
+ List *rtcs = NIL;
+ MemoryContext old_context;
+
+ /* Do not lock the children until they're processed */
+ inhoids = find_all_inheritors(indexOid, NoLock, NULL);
+
+ foreach(lc, inhoids)
+ {
+ Oid indexrelid = lfirst_oid(lc);
+ Oid relid = IndexGetRelation(indexrelid, false);
+ RelToCluster *rtc;
+
+ /* consider only leaf indexes */
+ if (get_rel_relkind(indexrelid) != RELKIND_INDEX)
+ continue;
+
+ /* Silently skip partitions which the user has no access to. */
+ if (!pg_class_ownercheck(relid, GetUserId()) &&
+ (!pg_database_ownercheck(MyDatabaseId, GetUserId()) ||
+ IsSharedRelation(relid)))
+ continue;
+
+ /* Use a permanent memory context for the result list */
+ old_context = MemoryContextSwitchTo(cluster_context);
+
+ rtc = (RelToCluster *) palloc(sizeof(RelToCluster));
+ rtc->tableOid = relid;
+ rtc->indexOid = indexrelid;
+ rtcs = lappend(rtcs, rtc);
+
+ MemoryContextSwitchTo(old_context);
+ }
+
+ return rtcs;
+}