summary refs log tree commit diff stats
path: root/src/backend/partitioning/partdesc.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/partitioning/partdesc.c')
-rw-r--r-- src/backend/partitioning/partdesc.c 462
1 files changed, 462 insertions, 0 deletions
diff --git a/src/backend/partitioning/partdesc.c b/src/backend/partitioning/partdesc.c
new file mode 100644
index 0000000..8b6e0bd
--- /dev/null
+++ b/src/backend/partitioning/partdesc.c
@@ -0,0 +1,462 @@
+/*-------------------------------------------------------------------------
+ *
+ * partdesc.c
+ * Support routines for manipulating partition descriptors
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/partitioning/partdesc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/htup_details.h"
+#include "access/table.h"
+#include "catalog/partition.h"
+#include "catalog/pg_inherits.h"
+#include "partitioning/partbounds.h"
+#include "partitioning/partdesc.h"
+#include "storage/bufmgr.h"
+#include "storage/sinval.h"
+#include "utils/builtins.h"
+#include "utils/fmgroids.h"
+#include "utils/hsearch.h"
+#include "utils/inval.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/partcache.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+
+typedef struct PartitionDirectoryData /* state of one partition directory, filled in by CreatePartitionDirectory */
+{
+ MemoryContext pdir_mcxt; /* context given to CreatePartitionDirectory; also handed to the hash table */
+ HTAB *pdir_hash; /* hash table of PartitionDirectoryEntry, keyed by an Oid */
+ bool omit_detached; /* passed to RelationGetPartitionDesc on each first lookup */
+} PartitionDirectoryData;
+
+typedef struct PartitionDirectoryEntry /* one entry of pdir_hash, created by PartitionDirectoryLookup */
+{
+ Oid reloid; /* hash key (lookups pass the relation's OID); must stay the first field */
+ Relation rel; /* relation whose reference count was incremented for this entry */
+ PartitionDesc pd; /* descriptor obtained on first lookup and returned on every later one */
+} PartitionDirectoryEntry;
+
+static PartitionDesc RelationBuildPartitionDesc(Relation rel,
+ bool omit_detached); /* forward declaration; defined later in this file and called by RelationGetPartitionDesc */
+
+
+/*
+ * RelationGetPartitionDesc -- get partition descriptor, if relation is partitioned
+ *
+ * The relcache entry can carry two descriptors.  rd_partdesc lists every
+ * partition, including any that are concurrently being marked detached;
+ * rd_partdesc_nodetached leaves (some of) those out.  Alongside the latter
+ * we remember a pg_inherits.xmin value (rd_partdesc_nodetached_xmin), which
+ * is what lets us decide, for a given active snapshot, whether that cached
+ * descriptor may be reused.
+ *
+ * omit_detached asks for a descriptor that leaves out partitions being
+ * detached.  The request only has an effect when there is an active
+ * snapshot and the relation actually has such partitions.
+ *
+ * Note: partition descriptors are arranged not to be freed until the
+ * relcache entry's refcount drops to zero (see the hacks in RelationClose,
+ * RelationClearRelation, and RelationBuildPartitionDesc).  So although the
+ * result is a direct pointer into the relcache entry, callers may keep using
+ * it for as long as (a) they hold the relation open, and (b) they hold a
+ * relation lock strong enough to keep the data from becoming stale.
+ */
+PartitionDesc
+RelationGetPartitionDesc(Relation rel, bool omit_detached)
+{
+	PartitionDesc full_desc;
+
+	Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
+
+	/*
+	 * Fast path: the all-partitions descriptor is good enough whenever the
+	 * caller wants detached partitions included, or there are none to omit,
+	 * or no snapshot is active (nothing gets omitted in that case either).
+	 */
+	full_desc = rel->rd_partdesc;
+	if (likely(full_desc != NULL &&
+			   (!omit_detached ||
+				!full_desc->detached_exist ||
+				!ActiveSnapshotSet())))
+		return full_desc;
+
+	/*
+	 * Otherwise, see whether a previously built descriptor that omits
+	 * detached partitions can be reused.  It can if the pg_inherits.xmin
+	 * saved with it is not in progress according to the current active
+	 * snapshot.
+	 */
+	if (omit_detached && ActiveSnapshotSet())
+	{
+		PartitionDesc nodetached_desc = rel->rd_partdesc_nodetached;
+
+		if (nodetached_desc != NULL)
+		{
+			Assert(TransactionIdIsValid(rel->rd_partdesc_nodetached_xmin));
+
+			if (!XidInMVCCSnapshot(rel->rd_partdesc_nodetached_xmin,
+								   GetActiveSnapshot()))
+				return nodetached_desc;
+		}
+	}
+
+	/* Nothing cached is usable; build a descriptor from scratch. */
+	return RelationBuildPartitionDesc(rel, omit_detached);
+}
+
+/*
+ * RelationBuildPartitionDesc
+ *		Form rel's partition descriptor, and store in relcache entry
+ *
+ * rel is the partitioned table; omit_detached requests that partitions
+ * being detached be left out where possible.  The return value is the newly
+ * built descriptor, which has also been stored into the relcache entry.
+ *
+ * Partition descriptor is a complex structure; to avoid complicated logic to
+ * free individual elements whenever the relcache entry is flushed, we give it
+ * its own memory context, a child of CacheMemoryContext, which can easily be
+ * deleted on its own.  To avoid leaking memory in that context in case of an
+ * error partway through this function, the context is initially created as a
+ * child of CurTransactionContext and only re-parented to CacheMemoryContext
+ * at the end, when no further errors are possible.  Also, we don't make this
+ * context the current context except in very brief code sections, out of fear
+ * that some of our callees allocate memory on their own which would be leaked
+ * permanently.
+ *
+ * Two kinds of descriptor can result.  If omit_detached was requested, an
+ * active snapshot exists, and the catalog scan reported a detached partition
+ * together with a valid pg_inherits.xmin for it, the descriptor is saved in
+ * rel->rd_partdesc_nodetached (context rel->rd_pddcxt) and that xmin is
+ * recorded in rel->rd_partdesc_nodetached_xmin.  In every other case it is
+ * saved in rel->rd_partdesc (context rel->rd_pdcxt).
+ */
+static PartitionDesc
+RelationBuildPartitionDesc(Relation rel, bool omit_detached)
+{
+	PartitionDesc partdesc;
+	PartitionBoundInfo boundinfo = NULL;
+	List	   *inhoids;
+	PartitionBoundSpec **boundspecs = NULL;
+	Oid		   *oids = NULL;
+	bool	   *is_leaf = NULL;
+	bool		detached_exist;
+	bool		is_omit;
+	TransactionId detached_xmin;
+	ListCell   *cell;
+	int			i,
+				nparts;
+	PartitionKey key = RelationGetPartitionKey(rel);
+	MemoryContext new_pdcxt;
+	MemoryContext oldcxt;
+	int		   *mapping = NULL;
+
+	/*
+	 * Get partition oids from pg_inherits.  This uses a single snapshot to
+	 * fetch the list of children, so while more children may be getting added
+	 * concurrently, whatever this function returns will be accurate as of
+	 * some well-defined point in time.
+	 */
+	detached_exist = false;
+	detached_xmin = InvalidTransactionId;
+	inhoids = find_inheritance_children_extended(RelationGetRelid(rel),
+												 omit_detached, NoLock,
+												 &detached_exist,
+												 &detached_xmin);
+
+	nparts = list_length(inhoids);
+
+	/* Allocate working arrays for OIDs, leaf flags, and boundspecs. */
+	if (nparts > 0)
+	{
+		oids = (Oid *) palloc(nparts * sizeof(Oid));
+		is_leaf = (bool *) palloc(nparts * sizeof(bool));
+		boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *));
+	}
+
+	/* Collect bound spec nodes for each partition. */
+	i = 0;
+	foreach(cell, inhoids)
+	{
+		Oid			inhrelid = lfirst_oid(cell);
+		HeapTuple	tuple;
+		PartitionBoundSpec *boundspec = NULL;
+
+		/* Try fetching the tuple from the catcache, for speed. */
+		tuple = SearchSysCache1(RELOID, inhrelid);
+		if (HeapTupleIsValid(tuple))
+		{
+			Datum		datum;
+			bool		isnull;
+
+			datum = SysCacheGetAttr(RELOID, tuple,
+									Anum_pg_class_relpartbound,
+									&isnull);
+			if (!isnull)
+				boundspec = stringToNode(TextDatumGetCString(datum));
+			ReleaseSysCache(tuple);
+		}
+
+		/*
+		 * The system cache may be out of date; if so, we may find no pg_class
+		 * tuple or an old one where relpartbound is NULL.  In that case, try
+		 * the table directly.  We can't just AcceptInvalidationMessages() and
+		 * retry the system cache lookup because it's possible that a
+		 * concurrent ATTACH PARTITION operation has removed itself from the
+		 * ProcArray but not yet added invalidation messages to the shared
+		 * queue; InvalidateSystemCaches() would work, but seems excessive.
+		 *
+		 * Note that this algorithm assumes that PartitionBoundSpec we manage
+		 * to fetch is the right one -- so this is only good enough for
+		 * concurrent ATTACH PARTITION, not concurrent DETACH PARTITION or
+		 * some hypothetical operation that changes the partition bounds.
+		 */
+		if (boundspec == NULL)
+		{
+			Relation	pg_class;
+			SysScanDesc scan;
+			ScanKeyData skey[1];
+
+			pg_class = table_open(RelationRelationId, AccessShareLock);
+			ScanKeyInit(&skey[0],
+						Anum_pg_class_oid,
+						BTEqualStrategyNumber, F_OIDEQ,
+						ObjectIdGetDatum(inhrelid));
+			scan = systable_beginscan(pg_class, ClassOidIndexId, true,
+									  NULL, 1, skey);
+			tuple = systable_getnext(scan);
+
+			/*
+			 * The scan can come back empty (no pg_class row for this OID).
+			 * Don't hand a NULL tuple to heap_getattr; leave boundspec NULL
+			 * so that the sanity check below reports the problem.
+			 */
+			if (HeapTupleIsValid(tuple))
+			{
+				Datum		datum;
+				bool		isnull;
+
+				datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
+									 RelationGetDescr(pg_class), &isnull);
+				if (!isnull)
+					boundspec = stringToNode(TextDatumGetCString(datum));
+			}
+			systable_endscan(scan);
+			table_close(pg_class, AccessShareLock);
+		}
+
+		/* Sanity checks. */
+		if (!boundspec)
+			elog(ERROR, "missing relpartbound for relation %u", inhrelid);
+		if (!IsA(boundspec, PartitionBoundSpec))
+			elog(ERROR, "invalid relpartbound for relation %u", inhrelid);
+
+		/*
+		 * If the PartitionBoundSpec says this is the default partition, its
+		 * OID should match pg_partitioned_table.partdefid; if not, the
+		 * catalog is corrupt.
+		 */
+		if (boundspec->is_default)
+		{
+			Oid			partdefid;
+
+			partdefid = get_default_partition_oid(RelationGetRelid(rel));
+			if (partdefid != inhrelid)
+				elog(ERROR, "expected partdefid %u, but got %u",
+					 inhrelid, partdefid);
+		}
+
+		/* Save results. */
+		oids[i] = inhrelid;
+		is_leaf[i] = (get_rel_relkind(inhrelid) != RELKIND_PARTITIONED_TABLE);
+		boundspecs[i] = boundspec;
+		++i;
+	}
+
+	/*
+	 * Create PartitionBoundInfo and mapping, working in the caller's context.
+	 * This could fail, but we haven't done any damage if so.
+	 */
+	if (nparts > 0)
+		boundinfo = partition_bounds_create(boundspecs, nparts, key, &mapping);
+
+	/*
+	 * Now build the actual relcache partition descriptor, copying all the
+	 * data into a new, small context.  As per above comment, we don't make
+	 * this a long-lived context until it's finished.
+	 */
+	new_pdcxt = AllocSetContextCreate(CurTransactionContext,
+									  "partition descriptor",
+									  ALLOCSET_SMALL_SIZES);
+	MemoryContextCopyAndSetIdentifier(new_pdcxt,
+									  RelationGetRelationName(rel));
+
+	partdesc = (PartitionDescData *)
+		MemoryContextAllocZero(new_pdcxt, sizeof(PartitionDescData));
+	partdesc->nparts = nparts;
+	partdesc->detached_exist = detached_exist;
+	/* If there are no partitions, the rest of the partdesc can stay zero */
+	if (nparts > 0)
+	{
+		oldcxt = MemoryContextSwitchTo(new_pdcxt);
+		partdesc->boundinfo = partition_bounds_copy(boundinfo, key);
+		partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid));
+		partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool));
+
+		/*
+		 * Assign OIDs from the original array into mapped indexes of the
+		 * result array.  The order of OIDs in the former is defined by the
+		 * catalog scan that retrieved them, whereas that in the latter is
+		 * defined by canonicalized representation of the partition bounds.
+		 * Also save leaf-ness of each partition.
+		 */
+		for (i = 0; i < nparts; i++)
+		{
+			int			index = mapping[i];
+
+			partdesc->oids[index] = oids[i];
+			partdesc->is_leaf[index] = is_leaf[i];
+		}
+		MemoryContextSwitchTo(oldcxt);
+	}
+
+	/*
+	 * Are we working with the partdesc that omits the detached partition, or
+	 * the one that includes it?
+	 *
+	 * Note that if a partition was found by the catalog's scan to have been
+	 * detached, but the pg_inherits tuple saying so was not visible to the
+	 * active snapshot (find_inheritance_children_extended will not have set
+	 * detached_xmin in that case), we consider there to be no "omittable"
+	 * detached partitions.
+	 */
+	is_omit = omit_detached && detached_exist && ActiveSnapshotSet() &&
+		TransactionIdIsValid(detached_xmin);
+
+	/*
+	 * We have a fully valid partdesc.  Reparent it so that it has the right
+	 * lifespan.
+	 */
+	MemoryContextSetParent(new_pdcxt, CacheMemoryContext);
+
+	/*
+	 * Store it into relcache.
+	 *
+	 * But first, a kluge: if there's an old context for this type of
+	 * descriptor, it contains an old partition descriptor that may still be
+	 * referenced somewhere.  Preserve it, while not leaking it, by
+	 * reattaching it as a child context of the new one.  Eventually it will
+	 * get dropped by either RelationClose or RelationClearRelation.  (We keep
+	 * the regular partdesc in rd_pdcxt, and the partdesc-excluding-
+	 * detached-partitions in rd_pddcxt.)
+	 */
+	if (is_omit)
+	{
+		if (rel->rd_pddcxt != NULL)
+			MemoryContextSetParent(rel->rd_pddcxt, new_pdcxt);
+		rel->rd_pddcxt = new_pdcxt;
+		rel->rd_partdesc_nodetached = partdesc;
+
+		/*
+		 * For partdescs built excluding detached partitions, which we save
+		 * separately, we also record the pg_inherits.xmin of the detached
+		 * partition that was omitted; this informs a future potential user of
+		 * such a cached partdesc to only use it after cross-checking that the
+		 * xmin is indeed visible to the snapshot it is going to be working
+		 * with.
+		 */
+		Assert(TransactionIdIsValid(detached_xmin));
+		rel->rd_partdesc_nodetached_xmin = detached_xmin;
+	}
+	else
+	{
+		if (rel->rd_pdcxt != NULL)
+			MemoryContextSetParent(rel->rd_pdcxt, new_pdcxt);
+		rel->rd_pdcxt = new_pdcxt;
+		rel->rd_partdesc = partdesc;
+	}
+
+	return partdesc;
+}
+
+/*
+ * CreatePartitionDirectory
+ *		Create a new partition directory object.
+ *
+ * The directory struct is allocated while mcxt is the current memory
+ * context, and mcxt is also handed to the hash table as its context.
+ * omit_detached is remembered and later passed along to
+ * RelationGetPartitionDesc by PartitionDirectoryLookup.
+ */
+PartitionDirectory
+CreatePartitionDirectory(MemoryContext mcxt, bool omit_detached)
+{
+	PartitionDirectory result;
+	HASHCTL		hash_ctl;
+	MemoryContext saved_cxt;
+
+	saved_cxt = MemoryContextSwitchTo(mcxt);
+
+	result = palloc(sizeof(PartitionDirectoryData));
+	result->pdir_mcxt = mcxt;
+	result->omit_detached = omit_detached;
+
+	/* Entries are keyed by an Oid, compared as a binary blob. */
+	hash_ctl.hcxt = mcxt;
+	hash_ctl.keysize = sizeof(Oid);
+	hash_ctl.entrysize = sizeof(PartitionDirectoryEntry);
+	result->pdir_hash = hash_create("partition directory", 256, &hash_ctl,
+									HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+	MemoryContextSwitchTo(saved_cxt);
+
+	return result;
+}
+
+/*
+ * PartitionDirectoryLookup
+ *		Look up the partition descriptor for a relation in the directory.
+ *
+ * This exists so that every lookup of a given relation through one
+ * PartitionDirectory yields the same PartitionDesc.  Concurrent DDL may
+ * cause PartitionDescs built at different times to reflect different views
+ * of the catalog state, but for as long as the same PartitionDirectory is
+ * in use, any particular OID always maps to the descriptor obtained the
+ * first time it was looked up.
+ */
+PartitionDesc
+PartitionDirectoryLookup(PartitionDirectory pdir, Relation rel)
+{
+	Oid			lookup_oid = RelationGetRelid(rel);
+	bool		already_present;
+	PartitionDirectoryEntry *entry;
+
+	entry = hash_search(pdir->pdir_hash, &lookup_oid, HASH_ENTER,
+						&already_present);
+
+	/* Seen before: hand back the descriptor remembered on first lookup. */
+	if (already_present)
+		return entry->pd;
+
+	/*
+	 * First lookup for this relation.  Pin the relation with a reference
+	 * count so that the PartitionDesc we are about to point at can't get
+	 * destroyed; DestroyPartitionDirectory drops the count again.
+	 */
+	RelationIncrementReferenceCount(rel);
+	entry->rel = rel;
+	entry->pd = RelationGetPartitionDesc(rel, pdir->omit_detached);
+	Assert(entry->pd != NULL);
+
+	return entry->pd;
+}
+
+/*
+ * DestroyPartitionDirectory
+ *		Destroy a partition directory.
+ *
+ * Walks every entry and drops the relation reference count taken by
+ * PartitionDirectoryLookup.  No memory is freed by this function itself.
+ */
+void
+DestroyPartitionDirectory(PartitionDirectory pdir)
+{
+	HASH_SEQ_STATUS scan;
+	PartitionDirectoryEntry *entry;
+
+	hash_seq_init(&scan, pdir->pdir_hash);
+	for (entry = hash_seq_search(&scan);
+		 entry != NULL;
+		 entry = hash_seq_search(&scan))
+	{
+		RelationDecrementReferenceCount(entry->rel);
+	}
+}
+
+/*
+ * get_default_oid_from_partdesc
+ *
+ * Return the OID of the default partition recorded in the given partition
+ * descriptor.  InvalidOid is returned when partdesc is NULL, when it has no
+ * bound info, or when the bound info has no default partition.
+ */
+Oid
+get_default_oid_from_partdesc(PartitionDesc partdesc)
+{
+	/* No descriptor, or one without bounds: no default partition. */
+	if (partdesc == NULL || partdesc->boundinfo == NULL)
+		return InvalidOid;
+
+	if (!partition_bound_has_default(partdesc->boundinfo))
+		return InvalidOid;
+
+	/* default_index is the default partition's slot in the oids array. */
+	return partdesc->oids[partdesc->boundinfo->default_index];
+}