author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-13 13:44:03 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-13 13:44:03 +0000
commit     293913568e6a7a86fd1479e1cff8e2ecb58d6568 (patch)
tree       fc3b469a3ec5ab71b36ea97cc7aaddb838423a0c /src/backend/commands/cluster.c
parent     Initial commit. (diff)
download   postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.tar.xz
           postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.zip
Adding upstream version 16.2. (upstream/16.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/commands/cluster.c')
 -rw-r--r--  src/backend/commands/cluster.c | 1741
 1 file changed, 1741 insertions, 0 deletions
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
new file mode 100644
index 0000000..92c465c
--- /dev/null
+++ b/src/backend/commands/cluster.c
@@ -0,0 +1,1741 @@
+/*-------------------------------------------------------------------------
+ *
+ * cluster.c
+ *    CLUSTER a table on an index. This is now also used for VACUUM FULL.
+ *
+ * There is hardly anything left of Paul Brown's original implementation...
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994-5, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *    src/backend/commands/cluster.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/amapi.h"
+#include "access/heapam.h"
+#include "access/multixact.h"
+#include "access/relscan.h"
+#include "access/tableam.h"
+#include "access/toast_internals.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "catalog/catalog.h"
+#include "catalog/dependency.h"
+#include "catalog/heap.h"
+#include "catalog/index.h"
+#include "catalog/namespace.h"
+#include "catalog/objectaccess.h"
+#include "catalog/partition.h"
+#include "catalog/pg_am.h"
+#include "catalog/pg_database.h"
+#include "catalog/pg_inherits.h"
+#include "catalog/toasting.h"
+#include "commands/cluster.h"
+#include "commands/defrem.h"
+#include "commands/progress.h"
+#include "commands/tablecmds.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "optimizer/optimizer.h"
+#include "pgstat.h"
+#include "storage/bufmgr.h"
+#include "storage/lmgr.h"
+#include "storage/predicate.h"
+#include "utils/acl.h"
+#include "utils/fmgroids.h"
+#include "utils/guc.h"
+#include "utils/inval.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/pg_rusage.h"
+#include "utils/relmapper.h"
+#include "utils/snapmgr.h"
+#include "utils/syscache.h"
+#include "utils/tuplesort.h"
+
+/*
+ * This struct is used to pass around the information on tables to be
+ * clustered. We need this so we can make a list of them when invoked without
+ * a specific table/index pair.
+ */
+typedef struct
+{
+    Oid         tableOid;
+    Oid         indexOid;
+} RelToCluster;
+
+
+static void cluster_multiple_rels(List *rtcs, ClusterParams *params);
+static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
+static void copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
+                            bool verbose, bool *pSwapToastByContent,
+                            TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
+static List *get_tables_to_cluster(MemoryContext cluster_context);
+static List *get_tables_to_cluster_partitioned(MemoryContext cluster_context,
+                                               Oid indexOid);
+
+
+/*---------------------------------------------------------------------------
+ * This cluster code allows for clustering multiple tables at once. Because
+ * of this, we cannot just run everything on a single transaction, or we
+ * would be forced to acquire exclusive locks on all the tables being
+ * clustered, simultaneously --- very likely leading to deadlock.
+ *
+ * To solve this we follow a similar strategy to VACUUM code,
+ * clustering each relation in a separate transaction.
For this to work, + * we need to: + * - provide a separate memory context so that we can pass information in + * a way that survives across transactions + * - start a new transaction every time a new relation is clustered + * - check for validity of the information on to-be-clustered relations, + * as someone might have deleted a relation behind our back, or + * clustered one on a different index + * - end the transaction + * + * The single-relation case does not have any such overhead. + * + * We also allow a relation to be specified without index. In that case, + * the indisclustered bit will be looked up, and an ERROR will be thrown + * if there is no index with the bit set. + *--------------------------------------------------------------------------- + */ +void +cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel) +{ + ListCell *lc; + ClusterParams params = {0}; + bool verbose = false; + Relation rel = NULL; + Oid indexOid = InvalidOid; + MemoryContext cluster_context; + List *rtcs; + + /* Parse option list */ + foreach(lc, stmt->params) + { + DefElem *opt = (DefElem *) lfirst(lc); + + if (strcmp(opt->defname, "verbose") == 0) + verbose = defGetBoolean(opt); + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("unrecognized CLUSTER option \"%s\"", + opt->defname), + parser_errposition(pstate, opt->location))); + } + + params.options = (verbose ? CLUOPT_VERBOSE : 0); + + if (stmt->relation != NULL) + { + /* This is the single-relation case. */ + Oid tableOid; + + /* + * Find, lock, and check permissions on the table. We obtain + * AccessExclusiveLock right away to avoid lock-upgrade hazard in the + * single-transaction case. + */ + tableOid = RangeVarGetRelidExtended(stmt->relation, + AccessExclusiveLock, + 0, + RangeVarCallbackOwnsTable, NULL); + rel = table_open(tableOid, NoLock); + + /* + * Reject clustering a remote temp table ... their local buffer + * manager is not going to cope. + */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot cluster temporary tables of other sessions"))); + + if (stmt->indexname == NULL) + { + ListCell *index; + + /* We need to find the index that has indisclustered set. */ + foreach(index, RelationGetIndexList(rel)) + { + indexOid = lfirst_oid(index); + if (get_index_isclustered(indexOid)) + break; + indexOid = InvalidOid; + } + + if (!OidIsValid(indexOid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("there is no previously clustered index for table \"%s\"", + stmt->relation->relname))); + } + else + { + /* + * The index is expected to be in the same namespace as the + * relation. + */ + indexOid = get_relname_relid(stmt->indexname, + rel->rd_rel->relnamespace); + if (!OidIsValid(indexOid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("index \"%s\" for table \"%s\" does not exist", + stmt->indexname, stmt->relation->relname))); + } + + if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + { + /* close relation, keep lock till commit */ + table_close(rel, NoLock); + + /* Do the job. */ + cluster_rel(tableOid, indexOid, ¶ms); + + return; + } + } + + /* + * By here, we know we are in a multi-table situation. In order to avoid + * holding locks for too long, we want to process each table in its own + * transaction. This forces us to disallow running inside a user + * transaction block. 
+ */ + PreventInTransactionBlock(isTopLevel, "CLUSTER"); + + /* Also, we need a memory context to hold our list of relations */ + cluster_context = AllocSetContextCreate(PortalContext, + "Cluster", + ALLOCSET_DEFAULT_SIZES); + + /* + * Either we're processing a partitioned table, or we were not given any + * table name at all. In either case, obtain a list of relations to + * process. + * + * In the former case, an index name must have been given, so we don't + * need to recheck its "indisclustered" bit, but we have to check that it + * is an index that we can cluster on. In the latter case, we set the + * option bit to have indisclustered verified. + * + * Rechecking the relation itself is necessary here in all cases. + */ + params.options |= CLUOPT_RECHECK; + if (rel != NULL) + { + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); + check_index_is_clusterable(rel, indexOid, AccessShareLock); + rtcs = get_tables_to_cluster_partitioned(cluster_context, indexOid); + + /* close relation, releasing lock on parent table */ + table_close(rel, AccessExclusiveLock); + } + else + { + rtcs = get_tables_to_cluster(cluster_context); + params.options |= CLUOPT_RECHECK_ISCLUSTERED; + } + + /* Do the job. */ + cluster_multiple_rels(rtcs, ¶ms); + + /* Start a new transaction for the cleanup work. */ + StartTransactionCommand(); + + /* Clean up working storage */ + MemoryContextDelete(cluster_context); +} + +/* + * Given a list of relations to cluster, process each of them in a separate + * transaction. + * + * We expect to be in a transaction at start, but there isn't one when we + * return. + */ +static void +cluster_multiple_rels(List *rtcs, ClusterParams *params) +{ + ListCell *lc; + + /* Commit to get out of starting transaction */ + PopActiveSnapshot(); + CommitTransactionCommand(); + + /* Cluster the tables, each in a separate transaction */ + foreach(lc, rtcs) + { + RelToCluster *rtc = (RelToCluster *) lfirst(lc); + + /* Start a new transaction for each relation. */ + StartTransactionCommand(); + + /* functions in indexes may want a snapshot set */ + PushActiveSnapshot(GetTransactionSnapshot()); + + /* Do the job. */ + cluster_rel(rtc->tableOid, rtc->indexOid, params); + + PopActiveSnapshot(); + CommitTransactionCommand(); + } +} + +/* + * cluster_rel + * + * This clusters the table by creating a new, clustered table and + * swapping the relfilenumbers of the new table and the old table, so + * the OID of the original table is preserved. Thus we do not lose + * GRANT, inheritance nor references to this table (this was a bug + * in releases through 7.3). + * + * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading + * the new table, it's better to create the indexes afterwards than to fill + * them incrementally while we load the table. + * + * If indexOid is InvalidOid, the table will be rewritten in physical order + * instead of index order. This is the new implementation of VACUUM FULL, + * and error messages should refer to the operation as VACUUM not CLUSTER. + */ +void +cluster_rel(Oid tableOid, Oid indexOid, ClusterParams *params) +{ + Relation OldHeap; + Oid save_userid; + int save_sec_context; + int save_nestlevel; + bool verbose = ((params->options & CLUOPT_VERBOSE) != 0); + bool recheck = ((params->options & CLUOPT_RECHECK) != 0); + + /* Check for user-requested abort. 
*/ + CHECK_FOR_INTERRUPTS(); + + pgstat_progress_start_command(PROGRESS_COMMAND_CLUSTER, tableOid); + if (OidIsValid(indexOid)) + pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND, + PROGRESS_CLUSTER_COMMAND_CLUSTER); + else + pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND, + PROGRESS_CLUSTER_COMMAND_VACUUM_FULL); + + /* + * We grab exclusive access to the target rel and index for the duration + * of the transaction. (This is redundant for the single-transaction + * case, since cluster() already did it.) The index lock is taken inside + * check_index_is_clusterable. + */ + OldHeap = try_relation_open(tableOid, AccessExclusiveLock); + + /* If the table has gone away, we can skip processing it */ + if (!OldHeap) + { + pgstat_progress_end_command(); + return; + } + + /* + * Switch to the table owner's userid, so that any index functions are run + * as that user. Also lock down security-restricted operations and + * arrange to make GUC variable changes local to this command. + */ + GetUserIdAndSecContext(&save_userid, &save_sec_context); + SetUserIdAndSecContext(OldHeap->rd_rel->relowner, + save_sec_context | SECURITY_RESTRICTED_OPERATION); + save_nestlevel = NewGUCNestLevel(); + + /* + * Since we may open a new transaction for each relation, we have to check + * that the relation still is what we think it is. + * + * If this is a single-transaction CLUSTER, we can skip these tests. We + * *must* skip the one on indisclustered since it would reject an attempt + * to cluster a not-previously-clustered index. + */ + if (recheck) + { + /* Check that the user still owns the relation */ + if (!object_ownercheck(RelationRelationId, tableOid, save_userid)) + { + relation_close(OldHeap, AccessExclusiveLock); + goto out; + } + + /* + * Silently skip a temp table for a remote session. Only doing this + * check in the "recheck" case is appropriate (which currently means + * somebody is executing a database-wide CLUSTER or on a partitioned + * table), because there is another check in cluster() which will stop + * any attempt to cluster remote temp tables by name. There is + * another check in cluster_rel which is redundant, but we leave it + * for extra safety. + */ + if (RELATION_IS_OTHER_TEMP(OldHeap)) + { + relation_close(OldHeap, AccessExclusiveLock); + goto out; + } + + if (OidIsValid(indexOid)) + { + /* + * Check that the index still exists + */ + if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid))) + { + relation_close(OldHeap, AccessExclusiveLock); + goto out; + } + + /* + * Check that the index is still the one with indisclustered set, + * if needed. + */ + if ((params->options & CLUOPT_RECHECK_ISCLUSTERED) != 0 && + !get_index_isclustered(indexOid)) + { + relation_close(OldHeap, AccessExclusiveLock); + goto out; + } + } + } + + /* + * We allow VACUUM FULL, but not CLUSTER, on shared catalogs. CLUSTER + * would work in most respects, but the index would only get marked as + * indisclustered in the current database, leading to unexpected behavior + * if CLUSTER were later invoked in another database. + */ + if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot cluster a shared catalog"))); + + /* + * Don't process temp tables of other backends ... their local buffer + * manager is not going to cope. 
+ */ + if (RELATION_IS_OTHER_TEMP(OldHeap)) + { + if (OidIsValid(indexOid)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot cluster temporary tables of other sessions"))); + else + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot vacuum temporary tables of other sessions"))); + } + + /* + * Also check for active uses of the relation in the current transaction, + * including open scans and pending AFTER trigger events. + */ + CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM"); + + /* Check heap and index are valid to cluster on */ + if (OidIsValid(indexOid)) + check_index_is_clusterable(OldHeap, indexOid, AccessExclusiveLock); + + /* + * Quietly ignore the request if this is a materialized view which has not + * been populated from its query. No harm is done because there is no data + * to deal with, and we don't want to throw an error if this is part of a + * multi-relation request -- for example, CLUSTER was run on the entire + * database. + */ + if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW && + !RelationIsPopulated(OldHeap)) + { + relation_close(OldHeap, AccessExclusiveLock); + goto out; + } + + Assert(OldHeap->rd_rel->relkind == RELKIND_RELATION || + OldHeap->rd_rel->relkind == RELKIND_MATVIEW || + OldHeap->rd_rel->relkind == RELKIND_TOASTVALUE); + + /* + * All predicate locks on the tuples or pages are about to be made + * invalid, because we move tuples around. Promote them to relation + * locks. Predicate locks on indexes will be promoted when they are + * reindexed. + */ + TransferPredicateLocksToHeapRelation(OldHeap); + + /* rebuild_relation does all the dirty work */ + rebuild_relation(OldHeap, indexOid, verbose); + + /* NB: rebuild_relation does table_close() on OldHeap */ + +out: + /* Roll back any GUC changes executed by index functions */ + AtEOXact_GUC(false, save_nestlevel); + + /* Restore userid and security context */ + SetUserIdAndSecContext(save_userid, save_sec_context); + + pgstat_progress_end_command(); +} + +/* + * Verify that the specified heap and index are valid to cluster on + * + * Side effect: obtains lock on the index. The caller may + * in some cases already have AccessExclusiveLock on the table, but + * not in all cases so we can't rely on the table-level lock for + * protection here. + */ +void +check_index_is_clusterable(Relation OldHeap, Oid indexOid, LOCKMODE lockmode) +{ + Relation OldIndex; + + OldIndex = index_open(indexOid, lockmode); + + /* + * Check that index is in fact an index on the given relation + */ + if (OldIndex->rd_index == NULL || + OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap)) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is not an index for table \"%s\"", + RelationGetRelationName(OldIndex), + RelationGetRelationName(OldHeap)))); + + /* Index AM must allow clustering */ + if (!OldIndex->rd_indam->amclusterable) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot cluster on index \"%s\" because access method does not support clustering", + RelationGetRelationName(OldIndex)))); + + /* + * Disallow clustering on incomplete indexes (those that might not index + * every row of the relation). We could relax this by making a separate + * seqscan pass over the table to copy the missing rows, but that seems + * expensive and tedious. 
+ */ + if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred, NULL)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot cluster on partial index \"%s\"", + RelationGetRelationName(OldIndex)))); + + /* + * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY; + * it might well not contain entries for every heap row, or might not even + * be internally consistent. (But note that we don't check indcheckxmin; + * the worst consequence of following broken HOT chains would be that we + * might put recently-dead tuples out-of-order in the new table, and there + * is little harm in that.) + */ + if (!OldIndex->rd_index->indisvalid) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot cluster on invalid index \"%s\"", + RelationGetRelationName(OldIndex)))); + + /* Drop relcache refcnt on OldIndex, but keep lock */ + index_close(OldIndex, NoLock); +} + +/* + * mark_index_clustered: mark the specified index as the one clustered on + * + * With indexOid == InvalidOid, will mark all indexes of rel not-clustered. + */ +void +mark_index_clustered(Relation rel, Oid indexOid, bool is_internal) +{ + HeapTuple indexTuple; + Form_pg_index indexForm; + Relation pg_index; + ListCell *index; + + /* Disallow applying to a partitioned table */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot mark index clustered in partitioned table"))); + + /* + * If the index is already marked clustered, no need to do anything. + */ + if (OidIsValid(indexOid)) + { + if (get_index_isclustered(indexOid)) + return; + } + + /* + * Check each index of the relation and set/clear the bit as needed. + */ + pg_index = table_open(IndexRelationId, RowExclusiveLock); + + foreach(index, RelationGetIndexList(rel)) + { + Oid thisIndexOid = lfirst_oid(index); + + indexTuple = SearchSysCacheCopy1(INDEXRELID, + ObjectIdGetDatum(thisIndexOid)); + if (!HeapTupleIsValid(indexTuple)) + elog(ERROR, "cache lookup failed for index %u", thisIndexOid); + indexForm = (Form_pg_index) GETSTRUCT(indexTuple); + + /* + * Unset the bit if set. We know it's wrong because we checked this + * earlier. + */ + if (indexForm->indisclustered) + { + indexForm->indisclustered = false; + CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple); + } + else if (thisIndexOid == indexOid) + { + /* this was checked earlier, but let's be real sure */ + if (!indexForm->indisvalid) + elog(ERROR, "cannot cluster on invalid index %u", indexOid); + indexForm->indisclustered = true; + CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple); + } + + InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0, + InvalidOid, is_internal); + + heap_freetuple(indexTuple); + } + + table_close(pg_index, RowExclusiveLock); +} + +/* + * rebuild_relation: rebuild an existing relation in index or physical order + * + * OldHeap: table to rebuild --- must be opened and exclusive-locked! + * indexOid: index to cluster by, or InvalidOid to rewrite in physical order. + * + * NB: this routine closes OldHeap at the right time; caller should not. 
+ */ +static void +rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose) +{ + Oid tableOid = RelationGetRelid(OldHeap); + Oid accessMethod = OldHeap->rd_rel->relam; + Oid tableSpace = OldHeap->rd_rel->reltablespace; + Oid OIDNewHeap; + char relpersistence; + bool is_system_catalog; + bool swap_toast_by_content; + TransactionId frozenXid; + MultiXactId cutoffMulti; + + if (OidIsValid(indexOid)) + /* Mark the correct index as clustered */ + mark_index_clustered(OldHeap, indexOid, true); + + /* Remember info about rel before closing OldHeap */ + relpersistence = OldHeap->rd_rel->relpersistence; + is_system_catalog = IsSystemRelation(OldHeap); + + /* Close relcache entry, but keep lock until transaction commit */ + table_close(OldHeap, NoLock); + + /* Create the transient table that will receive the re-ordered data */ + OIDNewHeap = make_new_heap(tableOid, tableSpace, + accessMethod, + relpersistence, + AccessExclusiveLock); + + /* Copy the heap data into the new table in the desired order */ + copy_table_data(OIDNewHeap, tableOid, indexOid, verbose, + &swap_toast_by_content, &frozenXid, &cutoffMulti); + + /* + * Swap the physical files of the target and transient tables, then + * rebuild the target's indexes and throw away the transient table. + */ + finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog, + swap_toast_by_content, false, true, + frozenXid, cutoffMulti, + relpersistence); +} + + +/* + * Create the transient table that will be filled with new data during + * CLUSTER, ALTER TABLE, and similar operations. The transient table + * duplicates the logical structure of the OldHeap; but will have the + * specified physical storage properties NewTableSpace, NewAccessMethod, and + * relpersistence. + * + * After this, the caller should load the new heap with transferred/modified + * data, then call finish_heap_swap to complete the operation. + */ +Oid +make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, Oid NewAccessMethod, + char relpersistence, LOCKMODE lockmode) +{ + TupleDesc OldHeapDesc; + char NewHeapName[NAMEDATALEN]; + Oid OIDNewHeap; + Oid toastid; + Relation OldHeap; + HeapTuple tuple; + Datum reloptions; + bool isNull; + Oid namespaceid; + + OldHeap = table_open(OIDOldHeap, lockmode); + OldHeapDesc = RelationGetDescr(OldHeap); + + /* + * Note that the NewHeap will not receive any of the defaults or + * constraints associated with the OldHeap; we don't need 'em, and there's + * no reason to spend cycles inserting them into the catalogs only to + * delete them. + */ + + /* + * But we do want to use reloptions of the old heap for new heap. + */ + tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap); + reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions, + &isNull); + if (isNull) + reloptions = (Datum) 0; + + if (relpersistence == RELPERSISTENCE_TEMP) + namespaceid = LookupCreationNamespace("pg_temp"); + else + namespaceid = RelationGetNamespace(OldHeap); + + /* + * Create the new heap, using a temporary name in the same namespace as + * the existing table. NOTE: there is some risk of collision with user + * relnames. Working around this seems more trouble than it's worth; in + * particular, we can't create the new heap in a different namespace from + * the old, or we will have problems with the TEMP status of temp tables. + * + * Note: the new heap is not a shared relation, even if we are rebuilding + * a shared rel. 
However, we do make the new heap mapped if the source is + * mapped. This simplifies swap_relation_files, and is absolutely + * necessary for rebuilding pg_class, for reasons explained there. + */ + snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap); + + OIDNewHeap = heap_create_with_catalog(NewHeapName, + namespaceid, + NewTableSpace, + InvalidOid, + InvalidOid, + InvalidOid, + OldHeap->rd_rel->relowner, + NewAccessMethod, + OldHeapDesc, + NIL, + RELKIND_RELATION, + relpersistence, + false, + RelationIsMapped(OldHeap), + ONCOMMIT_NOOP, + reloptions, + false, + true, + true, + OIDOldHeap, + NULL); + Assert(OIDNewHeap != InvalidOid); + + ReleaseSysCache(tuple); + + /* + * Advance command counter so that the newly-created relation's catalog + * tuples will be visible to table_open. + */ + CommandCounterIncrement(); + + /* + * If necessary, create a TOAST table for the new relation. + * + * If the relation doesn't have a TOAST table already, we can't need one + * for the new relation. The other way around is possible though: if some + * wide columns have been dropped, NewHeapCreateToastTable can decide that + * no TOAST table is needed for the new table. + * + * Note that NewHeapCreateToastTable ends with CommandCounterIncrement, so + * that the TOAST table will be visible for insertion. + */ + toastid = OldHeap->rd_rel->reltoastrelid; + if (OidIsValid(toastid)) + { + /* keep the existing toast table's reloptions, if any */ + tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", toastid); + reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions, + &isNull); + if (isNull) + reloptions = (Datum) 0; + + NewHeapCreateToastTable(OIDNewHeap, reloptions, lockmode, toastid); + + ReleaseSysCache(tuple); + } + + table_close(OldHeap, NoLock); + + return OIDNewHeap; +} + +/* + * Do the physical copying of table data. + * + * There are three output parameters: + * *pSwapToastByContent is set true if toast tables must be swapped by content. + * *pFreezeXid receives the TransactionId used as freeze cutoff point. + * *pCutoffMulti receives the MultiXactId used as a cutoff point. + */ +static void +copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose, + bool *pSwapToastByContent, TransactionId *pFreezeXid, + MultiXactId *pCutoffMulti) +{ + Relation NewHeap, + OldHeap, + OldIndex; + Relation relRelation; + HeapTuple reltup; + Form_pg_class relform; + TupleDesc oldTupDesc PG_USED_FOR_ASSERTS_ONLY; + TupleDesc newTupDesc PG_USED_FOR_ASSERTS_ONLY; + VacuumParams params; + struct VacuumCutoffs cutoffs; + bool use_sort; + double num_tuples = 0, + tups_vacuumed = 0, + tups_recently_dead = 0; + BlockNumber num_pages; + int elevel = verbose ? INFO : DEBUG2; + PGRUsage ru0; + char *nspname; + + pg_rusage_init(&ru0); + + /* + * Open the relations we need. + */ + NewHeap = table_open(OIDNewHeap, AccessExclusiveLock); + OldHeap = table_open(OIDOldHeap, AccessExclusiveLock); + if (OidIsValid(OIDOldIndex)) + OldIndex = index_open(OIDOldIndex, AccessExclusiveLock); + else + OldIndex = NULL; + + /* Store a copy of the namespace name for logging purposes */ + nspname = get_namespace_name(RelationGetNamespace(OldHeap)); + + /* + * Their tuple descriptors should be exactly alike, but here we only need + * assume that they have the same number of columns. 
+ */ + oldTupDesc = RelationGetDescr(OldHeap); + newTupDesc = RelationGetDescr(NewHeap); + Assert(newTupDesc->natts == oldTupDesc->natts); + + /* + * If the OldHeap has a toast table, get lock on the toast table to keep + * it from being vacuumed. This is needed because autovacuum processes + * toast tables independently of their main tables, with no lock on the + * latter. If an autovacuum were to start on the toast table after we + * compute our OldestXmin below, it would use a later OldestXmin, and then + * possibly remove as DEAD toast tuples belonging to main tuples we think + * are only RECENTLY_DEAD. Then we'd fail while trying to copy those + * tuples. + * + * We don't need to open the toast relation here, just lock it. The lock + * will be held till end of transaction. + */ + if (OldHeap->rd_rel->reltoastrelid) + LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock); + + /* + * If both tables have TOAST tables, perform toast swap by content. It is + * possible that the old table has a toast table but the new one doesn't, + * if toastable columns have been dropped. In that case we have to do + * swap by links. This is okay because swap by content is only essential + * for system catalogs, and we don't support schema changes for them. + */ + if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid) + { + *pSwapToastByContent = true; + + /* + * When doing swap by content, any toast pointers written into NewHeap + * must use the old toast table's OID, because that's where the toast + * data will eventually be found. Set this up by setting rd_toastoid. + * This also tells toast_save_datum() to preserve the toast value + * OIDs, which we want so as not to invalidate toast pointers in + * system catalog caches, and to avoid making multiple copies of a + * single toast value. + * + * Note that we must hold NewHeap open until we are done writing data, + * since the relcache will not guarantee to remember this setting once + * the relation is closed. Also, this technique depends on the fact + * that no one will try to read from the NewHeap until after we've + * finished writing it and swapping the rels --- otherwise they could + * follow the toast pointers to the wrong place. (It would actually + * work for values copied over from the old toast table, but not for + * any values that we toast which were previously not toasted.) + */ + NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid; + } + else + *pSwapToastByContent = false; + + /* + * Compute xids used to freeze and weed out dead tuples and multixacts. + * Since we're going to rewrite the whole table anyway, there's no reason + * not to be aggressive about this. + */ + memset(¶ms, 0, sizeof(VacuumParams)); + vacuum_get_cutoffs(OldHeap, ¶ms, &cutoffs); + + /* + * FreezeXid will become the table's new relfrozenxid, and that mustn't go + * backwards, so take the max. + */ + if (TransactionIdIsValid(OldHeap->rd_rel->relfrozenxid) && + TransactionIdPrecedes(cutoffs.FreezeLimit, + OldHeap->rd_rel->relfrozenxid)) + cutoffs.FreezeLimit = OldHeap->rd_rel->relfrozenxid; + + /* + * MultiXactCutoff, similarly, shouldn't go backwards either. + */ + if (MultiXactIdIsValid(OldHeap->rd_rel->relminmxid) && + MultiXactIdPrecedes(cutoffs.MultiXactCutoff, + OldHeap->rd_rel->relminmxid)) + cutoffs.MultiXactCutoff = OldHeap->rd_rel->relminmxid; + + /* + * Decide whether to use an indexscan or seqscan-and-optional-sort to scan + * the OldHeap. 
We know how to use a sort to duplicate the ordering of a + * btree index, and will use seqscan-and-sort for that case if the planner + * tells us it's cheaper. Otherwise, always indexscan if an index is + * provided, else plain seqscan. + */ + if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID) + use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex); + else + use_sort = false; + + /* Log what we're doing */ + if (OldIndex != NULL && !use_sort) + ereport(elevel, + (errmsg("clustering \"%s.%s\" using index scan on \"%s\"", + nspname, + RelationGetRelationName(OldHeap), + RelationGetRelationName(OldIndex)))); + else if (use_sort) + ereport(elevel, + (errmsg("clustering \"%s.%s\" using sequential scan and sort", + nspname, + RelationGetRelationName(OldHeap)))); + else + ereport(elevel, + (errmsg("vacuuming \"%s.%s\"", + nspname, + RelationGetRelationName(OldHeap)))); + + /* + * Hand off the actual copying to AM specific function, the generic code + * cannot know how to deal with visibility across AMs. Note that this + * routine is allowed to set FreezeXid / MultiXactCutoff to different + * values (e.g. because the AM doesn't use freezing). + */ + table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort, + cutoffs.OldestXmin, &cutoffs.FreezeLimit, + &cutoffs.MultiXactCutoff, + &num_tuples, &tups_vacuumed, + &tups_recently_dead); + + /* return selected values to caller, get set as relfrozenxid/minmxid */ + *pFreezeXid = cutoffs.FreezeLimit; + *pCutoffMulti = cutoffs.MultiXactCutoff; + + /* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */ + NewHeap->rd_toastoid = InvalidOid; + + num_pages = RelationGetNumberOfBlocks(NewHeap); + + /* Log what we did */ + ereport(elevel, + (errmsg("\"%s.%s\": found %.0f removable, %.0f nonremovable row versions in %u pages", + nspname, + RelationGetRelationName(OldHeap), + tups_vacuumed, num_tuples, + RelationGetNumberOfBlocks(OldHeap)), + errdetail("%.0f dead row versions cannot be removed yet.\n" + "%s.", + tups_recently_dead, + pg_rusage_show(&ru0)))); + + if (OldIndex != NULL) + index_close(OldIndex, NoLock); + table_close(OldHeap, NoLock); + table_close(NewHeap, NoLock); + + /* Update pg_class to reflect the correct values of pages and tuples. */ + relRelation = table_open(RelationRelationId, RowExclusiveLock); + + reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDNewHeap)); + if (!HeapTupleIsValid(reltup)) + elog(ERROR, "cache lookup failed for relation %u", OIDNewHeap); + relform = (Form_pg_class) GETSTRUCT(reltup); + + relform->relpages = num_pages; + relform->reltuples = num_tuples; + + /* Don't update the stats for pg_class. See swap_relation_files. */ + if (OIDOldHeap != RelationRelationId) + CatalogTupleUpdate(relRelation, &reltup->t_self, reltup); + else + CacheInvalidateRelcacheByTuple(reltup); + + /* Clean up. */ + heap_freetuple(reltup); + table_close(relRelation, RowExclusiveLock); + + /* Make the update visible */ + CommandCounterIncrement(); +} + +/* + * Swap the physical files of two given relations. + * + * We swap the physical identity (reltablespace, relfilenumber) while keeping + * the same logical identities of the two relations. relpersistence is also + * swapped, which is critical since it determines where buffers live for each + * relation. + * + * We can swap associated TOAST data in either of two ways: recursively swap + * the physical content of the toast tables (and their indexes), or swap the + * TOAST links in the given relations' pg_class entries. 
The former is needed + * to manage rewrites of shared catalogs (where we cannot change the pg_class + * links) while the latter is the only way to handle cases in which a toast + * table is added or removed altogether. + * + * Additionally, the first relation is marked with relfrozenxid set to + * frozenXid. It seems a bit ugly to have this here, but the caller would + * have to do it anyway, so having it here saves a heap_update. Note: in + * the swap-toast-links case, we assume we don't need to change the toast + * table's relfrozenxid: the new version of the toast table should already + * have relfrozenxid set to RecentXmin, which is good enough. + * + * Lastly, if r2 and its toast table and toast index (if any) are mapped, + * their OIDs are emitted into mapped_tables[]. This is hacky but beats + * having to look the information up again later in finish_heap_swap. + */ +static void +swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, + bool swap_toast_by_content, + bool is_internal, + TransactionId frozenXid, + MultiXactId cutoffMulti, + Oid *mapped_tables) +{ + Relation relRelation; + HeapTuple reltup1, + reltup2; + Form_pg_class relform1, + relform2; + RelFileNumber relfilenumber1, + relfilenumber2; + RelFileNumber swaptemp; + char swptmpchr; + Oid relam1, + relam2; + + /* We need writable copies of both pg_class tuples. */ + relRelation = table_open(RelationRelationId, RowExclusiveLock); + + reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1)); + if (!HeapTupleIsValid(reltup1)) + elog(ERROR, "cache lookup failed for relation %u", r1); + relform1 = (Form_pg_class) GETSTRUCT(reltup1); + + reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2)); + if (!HeapTupleIsValid(reltup2)) + elog(ERROR, "cache lookup failed for relation %u", r2); + relform2 = (Form_pg_class) GETSTRUCT(reltup2); + + relfilenumber1 = relform1->relfilenode; + relfilenumber2 = relform2->relfilenode; + relam1 = relform1->relam; + relam2 = relform2->relam; + + if (RelFileNumberIsValid(relfilenumber1) && + RelFileNumberIsValid(relfilenumber2)) + { + /* + * Normal non-mapped relations: swap relfilenumbers, reltablespaces, + * relpersistence + */ + Assert(!target_is_pg_class); + + swaptemp = relform1->relfilenode; + relform1->relfilenode = relform2->relfilenode; + relform2->relfilenode = swaptemp; + + swaptemp = relform1->reltablespace; + relform1->reltablespace = relform2->reltablespace; + relform2->reltablespace = swaptemp; + + swaptemp = relform1->relam; + relform1->relam = relform2->relam; + relform2->relam = swaptemp; + + swptmpchr = relform1->relpersistence; + relform1->relpersistence = relform2->relpersistence; + relform2->relpersistence = swptmpchr; + + /* Also swap toast links, if we're swapping by links */ + if (!swap_toast_by_content) + { + swaptemp = relform1->reltoastrelid; + relform1->reltoastrelid = relform2->reltoastrelid; + relform2->reltoastrelid = swaptemp; + } + } + else + { + /* + * Mapped-relation case. Here we have to swap the relation mappings + * instead of modifying the pg_class columns. Both must be mapped. + */ + if (RelFileNumberIsValid(relfilenumber1) || + RelFileNumberIsValid(relfilenumber2)) + elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation", + NameStr(relform1->relname)); + + /* + * We can't change the tablespace nor persistence of a mapped rel, and + * we can't handle toast link swapping for one either, because we must + * not apply any critical changes to its pg_class row. 
These cases + * should be prevented by upstream permissions tests, so these checks + * are non-user-facing emergency backstop. + */ + if (relform1->reltablespace != relform2->reltablespace) + elog(ERROR, "cannot change tablespace of mapped relation \"%s\"", + NameStr(relform1->relname)); + if (relform1->relpersistence != relform2->relpersistence) + elog(ERROR, "cannot change persistence of mapped relation \"%s\"", + NameStr(relform1->relname)); + if (relform1->relam != relform2->relam) + elog(ERROR, "cannot change access method of mapped relation \"%s\"", + NameStr(relform1->relname)); + if (!swap_toast_by_content && + (relform1->reltoastrelid || relform2->reltoastrelid)) + elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"", + NameStr(relform1->relname)); + + /* + * Fetch the mappings --- shouldn't fail, but be paranoid + */ + relfilenumber1 = RelationMapOidToFilenumber(r1, relform1->relisshared); + if (!RelFileNumberIsValid(relfilenumber1)) + elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u", + NameStr(relform1->relname), r1); + relfilenumber2 = RelationMapOidToFilenumber(r2, relform2->relisshared); + if (!RelFileNumberIsValid(relfilenumber2)) + elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u", + NameStr(relform2->relname), r2); + + /* + * Send replacement mappings to relmapper. Note these won't actually + * take effect until CommandCounterIncrement. + */ + RelationMapUpdateMap(r1, relfilenumber2, relform1->relisshared, false); + RelationMapUpdateMap(r2, relfilenumber1, relform2->relisshared, false); + + /* Pass OIDs of mapped r2 tables back to caller */ + *mapped_tables++ = r2; + } + + /* + * Recognize that rel1's relfilenumber (swapped from rel2) is new in this + * subtransaction. The rel2 storage (swapped from rel1) may or may not be + * new. + */ + { + Relation rel1, + rel2; + + rel1 = relation_open(r1, NoLock); + rel2 = relation_open(r2, NoLock); + rel2->rd_createSubid = rel1->rd_createSubid; + rel2->rd_newRelfilelocatorSubid = rel1->rd_newRelfilelocatorSubid; + rel2->rd_firstRelfilelocatorSubid = rel1->rd_firstRelfilelocatorSubid; + RelationAssumeNewRelfilelocator(rel1); + relation_close(rel1, NoLock); + relation_close(rel2, NoLock); + } + + /* + * In the case of a shared catalog, these next few steps will only affect + * our own database's pg_class row; but that's okay, because they are all + * noncritical updates. That's also an important fact for the case of a + * mapped catalog, because it's possible that we'll commit the map change + * and then fail to commit the pg_class update. + */ + + /* set rel1's frozen Xid and minimum MultiXid */ + if (relform1->relkind != RELKIND_INDEX) + { + Assert(!TransactionIdIsValid(frozenXid) || + TransactionIdIsNormal(frozenXid)); + relform1->relfrozenxid = frozenXid; + relform1->relminmxid = cutoffMulti; + } + + /* swap size statistics too, since new rel has freshly-updated stats */ + { + int32 swap_pages; + float4 swap_tuples; + int32 swap_allvisible; + + swap_pages = relform1->relpages; + relform1->relpages = relform2->relpages; + relform2->relpages = swap_pages; + + swap_tuples = relform1->reltuples; + relform1->reltuples = relform2->reltuples; + relform2->reltuples = swap_tuples; + + swap_allvisible = relform1->relallvisible; + relform1->relallvisible = relform2->relallvisible; + relform2->relallvisible = swap_allvisible; + } + + /* + * Update the tuples in pg_class --- unless the target relation of the + * swap is pg_class itself. 
In that case, there is zero point in making
+ * changes because we'd be updating the old data that we're about to throw
+ * away. Because the real work being done here for a mapped relation is
+ * just to change the relation map settings, it's all right to not update
+ * the pg_class rows in this case. The most important changes will instead
+ * be performed later, in finish_heap_swap() itself.
+ */
+    if (!target_is_pg_class)
+    {
+        CatalogIndexState indstate;
+
+        indstate = CatalogOpenIndexes(relRelation);
+        CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1,
+                                   indstate);
+        CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2,
+                                   indstate);
+        CatalogCloseIndexes(indstate);
+    }
+    else
+    {
+        /* no update ... but we do still need relcache inval */
+        CacheInvalidateRelcacheByTuple(reltup1);
+        CacheInvalidateRelcacheByTuple(reltup2);
+    }
+
+    /*
+     * Now that pg_class has been updated with its relevant information for
+     * the swap, update the dependency of the relations to point to their new
+     * table AM, if it has changed.
+     */
+    if (relam1 != relam2)
+    {
+        if (changeDependencyFor(RelationRelationId,
+                                r1,
+                                AccessMethodRelationId,
+                                relam1,
+                                relam2) != 1)
+            elog(ERROR, "failed to change access method dependency for relation \"%s.%s\"",
+                 get_namespace_name(get_rel_namespace(r1)),
+                 get_rel_name(r1));
+        if (changeDependencyFor(RelationRelationId,
+                                r2,
+                                AccessMethodRelationId,
+                                relam2,
+                                relam1) != 1)
+            elog(ERROR, "failed to change access method dependency for relation \"%s.%s\"",
+                 get_namespace_name(get_rel_namespace(r2)),
+                 get_rel_name(r2));
+    }
+
+    /*
+     * Post alter hook for modified relations. The change to r2 is always
+     * internal, but r1 depends on the invocation context.
+     */
+    InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
+                                 InvalidOid, is_internal);
+    InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
+                                 InvalidOid, true);
+
+    /*
+     * If we have toast tables associated with the relations being swapped,
+     * deal with them too.
+     */
+    if (relform1->reltoastrelid || relform2->reltoastrelid)
+    {
+        if (swap_toast_by_content)
+        {
+            if (relform1->reltoastrelid && relform2->reltoastrelid)
+            {
+                /* Recursively swap the contents of the toast tables */
+                swap_relation_files(relform1->reltoastrelid,
+                                    relform2->reltoastrelid,
+                                    target_is_pg_class,
+                                    swap_toast_by_content,
+                                    is_internal,
+                                    frozenXid,
+                                    cutoffMulti,
+                                    mapped_tables);
+            }
+            else
+            {
+                /* caller messed up */
+                elog(ERROR, "cannot swap toast files by content when there's only one");
+            }
+        }
+        else
+        {
+            /*
+             * We swapped the ownership links, so we need to change dependency
+             * data to match.
+             *
+             * NOTE: it is possible that only one table has a toast table.
+             *
+             * NOTE: at present, a TOAST table's only dependency is the one on
+             * its owning table. If more are ever created, we'd need to use
+             * something more selective than deleteDependencyRecordsFor() to
+             * get rid of just the link we want.
+             */
+            ObjectAddress baseobject,
+                        toastobject;
+            long        count;
+
+            /*
+             * We disallow this case for system catalogs, to avoid the
+             * possibility that the catalog we're rebuilding is one of the
+             * ones the dependency changes would change. It's too late to be
+             * making any data changes to the target catalog.
+ */ + if (IsSystemClass(r1, relform1)) + elog(ERROR, "cannot swap toast files by links for system catalogs"); + + /* Delete old dependencies */ + if (relform1->reltoastrelid) + { + count = deleteDependencyRecordsFor(RelationRelationId, + relform1->reltoastrelid, + false); + if (count != 1) + elog(ERROR, "expected one dependency record for TOAST table, found %ld", + count); + } + if (relform2->reltoastrelid) + { + count = deleteDependencyRecordsFor(RelationRelationId, + relform2->reltoastrelid, + false); + if (count != 1) + elog(ERROR, "expected one dependency record for TOAST table, found %ld", + count); + } + + /* Register new dependencies */ + baseobject.classId = RelationRelationId; + baseobject.objectSubId = 0; + toastobject.classId = RelationRelationId; + toastobject.objectSubId = 0; + + if (relform1->reltoastrelid) + { + baseobject.objectId = r1; + toastobject.objectId = relform1->reltoastrelid; + recordDependencyOn(&toastobject, &baseobject, + DEPENDENCY_INTERNAL); + } + + if (relform2->reltoastrelid) + { + baseobject.objectId = r2; + toastobject.objectId = relform2->reltoastrelid; + recordDependencyOn(&toastobject, &baseobject, + DEPENDENCY_INTERNAL); + } + } + } + + /* + * If we're swapping two toast tables by content, do the same for their + * valid index. The swap can actually be safely done only if the relations + * have indexes. + */ + if (swap_toast_by_content && + relform1->relkind == RELKIND_TOASTVALUE && + relform2->relkind == RELKIND_TOASTVALUE) + { + Oid toastIndex1, + toastIndex2; + + /* Get valid index for each relation */ + toastIndex1 = toast_get_valid_index(r1, + AccessExclusiveLock); + toastIndex2 = toast_get_valid_index(r2, + AccessExclusiveLock); + + swap_relation_files(toastIndex1, + toastIndex2, + target_is_pg_class, + swap_toast_by_content, + is_internal, + InvalidTransactionId, + InvalidMultiXactId, + mapped_tables); + } + + /* Clean up. */ + heap_freetuple(reltup1); + heap_freetuple(reltup2); + + table_close(relRelation, RowExclusiveLock); + + /* + * Close both relcache entries' smgr links. We need this kluge because + * both links will be invalidated during upcoming CommandCounterIncrement. + * Whichever of the rels is the second to be cleared will have a dangling + * reference to the other's smgr entry. Rather than trying to avoid this + * by ordering operations just so, it's easiest to close the links first. + * (Fortunately, since one of the entries is local in our transaction, + * it's sufficient to clear out our own relcache this way; the problem + * cannot arise for other backends when they see our update on the + * non-transient relation.) + * + * Caution: the placement of this step interacts with the decision to + * handle toast rels by recursion. When we are trying to rebuild pg_class + * itself, the smgr close on pg_class must happen after all accesses in + * this function. + */ + RelationCloseSmgrByOid(r1); + RelationCloseSmgrByOid(r2); +} + +/* + * Remove the transient table that was built by make_new_heap, and finish + * cleaning up (including rebuilding all indexes on the old heap). 
+ */
+void
+finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
+                 bool is_system_catalog,
+                 bool swap_toast_by_content,
+                 bool check_constraints,
+                 bool is_internal,
+                 TransactionId frozenXid,
+                 MultiXactId cutoffMulti,
+                 char newrelpersistence)
+{
+    ObjectAddress object;
+    Oid         mapped_tables[4];
+    int         reindex_flags;
+    ReindexParams reindex_params = {0};
+    int         i;
+
+    /* Report that we are now swapping relation files */
+    pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
+                                 PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES);
+
+    /* Zero out possible results from swap_relation_files */
+    memset(mapped_tables, 0, sizeof(mapped_tables));
+
+    /*
+     * Swap the contents of the heap relations (including any toast tables).
+     * Also set old heap's relfrozenxid to frozenXid.
+     */
+    swap_relation_files(OIDOldHeap, OIDNewHeap,
+                        (OIDOldHeap == RelationRelationId),
+                        swap_toast_by_content, is_internal,
+                        frozenXid, cutoffMulti, mapped_tables);
+
+    /*
+     * If it's a system catalog, queue a sinval message to flush all catcaches
+     * on the catalog when we reach CommandCounterIncrement.
+     */
+    if (is_system_catalog)
+        CacheInvalidateCatalog(OIDOldHeap);
+
+    /*
+     * Rebuild each index on the relation (but not the toast table, which is
+     * all-new at this point). It is important to do this before the DROP
+     * step because if we are processing a system catalog that will be used
+     * during DROP, we want to have its indexes available. There is no
+     * advantage to the other order anyway because this is all transactional,
+     * so no chance to reclaim disk space before commit. We do not need a
+     * final CommandCounterIncrement() because reindex_relation does it.
+     *
+     * Note: because index_build is called via reindex_relation, it will never
+     * set indcheckxmin true for the indexes. This is OK even though in some
+     * sense we are building new indexes rather than rebuilding existing ones,
+     * because the new heap won't contain any HOT chains at all, let alone
+     * broken ones, so it can't be necessary to set indcheckxmin.
+     */
+    reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
+    if (check_constraints)
+        reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
+
+    /*
+     * Ensure that the indexes have the same persistence as the parent
+     * relation.
+     */
+    if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
+        reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
+    else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
+        reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;
+
+    /* Report that we are now reindexing relations */
+    pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
+                                 PROGRESS_CLUSTER_PHASE_REBUILD_INDEX);
+
+    reindex_relation(OIDOldHeap, reindex_flags, &reindex_params);
+
+    /* Report that we are now doing clean up */
+    pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
+                                 PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP);
+
+    /*
+     * If the relation being rebuilt is pg_class, swap_relation_files()
+     * couldn't update pg_class's own pg_class entry (check comments in
+     * swap_relation_files()), thus relfrozenxid was not updated. That's
+     * annoying because a potential reason for doing a VACUUM FULL is an
+     * imminent or actual anti-wraparound shutdown. So, now that we can
+     * access the new relation using its indices, update relfrozenxid.
+     * pg_class doesn't have a toast relation, so we don't need to update the
+     * corresponding toast relation. Note that there's little point in moving
+     * all relfrozenxid updates here since swap_relation_files() needs to
+     * write to pg_class for non-mapped relations anyway.
+ */ + if (OIDOldHeap == RelationRelationId) + { + Relation relRelation; + HeapTuple reltup; + Form_pg_class relform; + + relRelation = table_open(RelationRelationId, RowExclusiveLock); + + reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap)); + if (!HeapTupleIsValid(reltup)) + elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap); + relform = (Form_pg_class) GETSTRUCT(reltup); + + relform->relfrozenxid = frozenXid; + relform->relminmxid = cutoffMulti; + + CatalogTupleUpdate(relRelation, &reltup->t_self, reltup); + + table_close(relRelation, RowExclusiveLock); + } + + /* Destroy new heap with old filenumber */ + object.classId = RelationRelationId; + object.objectId = OIDNewHeap; + object.objectSubId = 0; + + /* + * The new relation is local to our transaction and we know nothing + * depends on it, so DROP_RESTRICT should be OK. + */ + performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL); + + /* performDeletion does CommandCounterIncrement at end */ + + /* + * Now we must remove any relation mapping entries that we set up for the + * transient table, as well as its toast table and toast index if any. If + * we fail to do this before commit, the relmapper will complain about new + * permanent map entries being added post-bootstrap. + */ + for (i = 0; OidIsValid(mapped_tables[i]); i++) + RelationMapRemoveMapping(mapped_tables[i]); + + /* + * At this point, everything is kosher except that, if we did toast swap + * by links, the toast table's name corresponds to the transient table. + * The name is irrelevant to the backend because it's referenced by OID, + * but users looking at the catalogs could be confused. Rename it to + * prevent this problem. + * + * Note no lock required on the relation, because we already hold an + * exclusive lock on it. + */ + if (!swap_toast_by_content) + { + Relation newrel; + + newrel = table_open(OIDOldHeap, NoLock); + if (OidIsValid(newrel->rd_rel->reltoastrelid)) + { + Oid toastidx; + char NewToastName[NAMEDATALEN]; + + /* Get the associated valid index to be renamed */ + toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid, + NoLock); + + /* rename the toast table ... */ + snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u", + OIDOldHeap); + RenameRelationInternal(newrel->rd_rel->reltoastrelid, + NewToastName, true, false); + + /* ... and its valid index too. */ + snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index", + OIDOldHeap); + + RenameRelationInternal(toastidx, + NewToastName, true, true); + + /* + * Reset the relrewrite for the toast. The command-counter + * increment is required here as we are about to update the tuple + * that is updated as part of RenameRelationInternal. + */ + CommandCounterIncrement(); + ResetRelRewrite(newrel->rd_rel->reltoastrelid); + } + relation_close(newrel, NoLock); + } + + /* if it's not a catalog table, clear any missing attribute settings */ + if (!is_system_catalog) + { + Relation newrel; + + newrel = table_open(OIDOldHeap, NoLock); + RelationClearMissing(newrel); + relation_close(newrel, NoLock); + } +} + + +/* + * Get a list of tables that the current user owns and + * have indisclustered set. Return the list in a List * of RelToCluster + * (stored in the specified memory context), each one giving the tableOid + * and the indexOid on which the table is already clustered. 
+ */
+static List *
+get_tables_to_cluster(MemoryContext cluster_context)
+{
+    Relation    indRelation;
+    TableScanDesc scan;
+    ScanKeyData entry;
+    HeapTuple   indexTuple;
+    Form_pg_index index;
+    MemoryContext old_context;
+    List       *rtcs = NIL;
+
+    /*
+     * Get all indexes that have indisclustered set and are owned by
+     * appropriate user.
+     */
+    indRelation = table_open(IndexRelationId, AccessShareLock);
+    ScanKeyInit(&entry,
+                Anum_pg_index_indisclustered,
+                BTEqualStrategyNumber, F_BOOLEQ,
+                BoolGetDatum(true));
+    scan = table_beginscan_catalog(indRelation, 1, &entry);
+    while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+    {
+        RelToCluster *rtc;
+
+        index = (Form_pg_index) GETSTRUCT(indexTuple);
+
+        if (!object_ownercheck(RelationRelationId, index->indrelid, GetUserId()))
+            continue;
+
+        /* Use a permanent memory context for the result list */
+        old_context = MemoryContextSwitchTo(cluster_context);
+
+        rtc = (RelToCluster *) palloc(sizeof(RelToCluster));
+        rtc->tableOid = index->indrelid;
+        rtc->indexOid = index->indexrelid;
+        rtcs = lappend(rtcs, rtc);
+
+        MemoryContextSwitchTo(old_context);
+    }
+    table_endscan(scan);
+
+    relation_close(indRelation, AccessShareLock);
+
+    return rtcs;
+}
+
+/*
+ * Given an index on a partitioned table, return a list of RelToCluster for
+ * the leaf tables/indexes of all its children.
+ *
+ * Like expand_vacuum_rel, but here caller must hold AccessExclusiveLock
+ * on the table containing the index.
+ */
+static List *
+get_tables_to_cluster_partitioned(MemoryContext cluster_context, Oid indexOid)
+{
+    List       *inhoids;
+    ListCell   *lc;
+    List       *rtcs = NIL;
+    MemoryContext old_context;
+
+    /* Do not lock the children until they're processed */
+    inhoids = find_all_inheritors(indexOid, NoLock, NULL);
+
+    foreach(lc, inhoids)
+    {
+        Oid         indexrelid = lfirst_oid(lc);
+        Oid         relid = IndexGetRelation(indexrelid, false);
+        RelToCluster *rtc;
+
+        /* consider only leaf indexes */
+        if (get_rel_relkind(indexrelid) != RELKIND_INDEX)
+            continue;
+
+        /* Silently skip partitions which the user has no access to. */
+        if (!object_ownercheck(RelationRelationId, relid, GetUserId()) &&
+            (!object_ownercheck(DatabaseRelationId, MyDatabaseId, GetUserId()) ||
+             IsSharedRelation(relid)))
+            continue;
+
+        /* Use a permanent memory context for the result list */
+        old_context = MemoryContextSwitchTo(cluster_context);
+
+        rtc = (RelToCluster *) palloc(sizeof(RelToCluster));
+        rtc->tableOid = relid;
+        rtc->indexOid = indexrelid;
+        rtcs = lappend(rtcs, rtc);
+
+        MemoryContextSwitchTo(old_context);
+    }
+
+    return rtcs;
+}
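
The entry points in this file are exported through commands/cluster.h, so other backend code can drive the same machinery. A minimal sketch, assuming only the declarations visible above (the helper name cluster_one_table is hypothetical): cluster_rel() takes its own AccessExclusiveLock via try_relation_open(), so the caller passes bare OIDs, and passing InvalidOid for the index selects the VACUUM FULL-style physical rewrite instead of an index-ordered one.

#include "postgres.h"
#include "commands/cluster.h"

/*
 * Hypothetical helper, not part of the patch above: rewrite one table
 * through cluster_rel().  A valid indexOid gives CLUSTER behavior; an
 * InvalidOid indexOid gives the VACUUM FULL (physical-order) path.
 */
static void
cluster_one_table(Oid tableOid, Oid indexOid, bool verbose)
{
    ClusterParams params = {0};

    params.options = (verbose ? CLUOPT_VERBOSE : 0);

    /* cluster_rel() opens and locks the relation itself */
    cluster_rel(tableOid, indexOid, &params);
}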
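rebuild_relation() above composes three reusable steps: make_new_heap(), a table-AM copy of the data, and finish_heap_swap(). The sketch below shows that pipeline for a caller that fills the transient heap itself; the load step is elided, the name rewrite_table_sketch is hypothetical, and the RecentXmin / ReadNextMultiXactId() horizons are an assumption that is only appropriate for a heap whose contents are freshly written (CLUSTER itself derives its cutoffs with vacuum_get_cutoffs(), as copy_table_data() shows).

#include "postgres.h"

#include "access/multixact.h"
#include "access/table.h"
#include "catalog/catalog.h"
#include "commands/cluster.h"
#include "storage/lockdefs.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"

/* Hypothetical sketch of the make_new_heap()/finish_heap_swap() pipeline. */
static void
rewrite_table_sketch(Oid tableOid)
{
    Relation    rel = table_open(tableOid, AccessExclusiveLock);
    Oid         tablespace = rel->rd_rel->reltablespace;
    Oid         accessMethod = rel->rd_rel->relam;
    char        persistence = rel->rd_rel->relpersistence;
    bool        is_catalog = IsSystemRelation(rel);
    Oid         newHeapOid;

    /* close the relcache entry but keep the lock, as rebuild_relation() does */
    table_close(rel, NoLock);

    /* transient heap inherits tablespace, AM, and persistence */
    newHeapOid = make_new_heap(tableOid, tablespace, accessMethod,
                               persistence, AccessExclusiveLock);

    /* ... fill newHeapOid here (table AM copy, tuple inserts, ...) ... */

    /* swap files, rebuild indexes, drop the transient heap */
    finish_heap_swap(tableOid, newHeapOid,
                     is_catalog,
                     false,      /* swap TOAST by links, not by content */
                     true,       /* re-verify CHECK constraints */
                     true,       /* is_internal */
                     RecentXmin,             /* assumed freeze horizon */
                     ReadNextMultiXactId(),  /* assumed multixact horizon */
                     persistence);
}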
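Finally, the validation/marking pair used by the single-table path: check_index_is_clusterable() rejects indexes whose AM is not amclusterable, partial indexes, and invalid indexes, while mark_index_clustered() flips indisclustered in pg_index. A sketch mirroring what cluster() does for an explicitly named index (the wrapper name set_clustered_index is hypothetical; ALTER TABLE ... CLUSTER ON performs essentially this sequence).

#include "postgres.h"
#include "commands/cluster.h"
#include "storage/lockdefs.h"
#include "utils/rel.h"

/* Hypothetical wrapper: validate an index, then record it as clustered. */
static void
set_clustered_index(Relation rel, Oid indexOid)
{
    /* errors out if the index cannot drive CLUSTER; takes its own lock */
    check_index_is_clusterable(rel, indexOid, AccessShareLock);

    /* set indisclustered on indexOid, clear it on the rel's other indexes */
    mark_index_clustered(rel, indexOid, false);
}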