summaryrefslogtreecommitdiffstats
path: root/src/include/storage/predicate_internals.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/include/storage/predicate_internals.h')
-rw-r--r--src/include/storage/predicate_internals.h493
1 files changed, 493 insertions, 0 deletions
diff --git a/src/include/storage/predicate_internals.h b/src/include/storage/predicate_internals.h
new file mode 100644
index 0000000..104f560
--- /dev/null
+++ b/src/include/storage/predicate_internals.h
@@ -0,0 +1,493 @@
+/*-------------------------------------------------------------------------
+ *
+ * predicate_internals.h
+ * POSTGRES internal predicate locking definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/predicate_internals.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PREDICATE_INTERNALS_H
+#define PREDICATE_INTERNALS_H
+
+#include "storage/lock.h"
+#include "storage/lwlock.h"
+
+/*
+ * Commit number.
+ */
+typedef uint64 SerCommitSeqNo;
+
+/*
+ * Reserved commit sequence numbers:
+ * - 0 is reserved to indicate a non-existent SLRU entry; it cannot be
+ * used as a SerCommitSeqNo, even an invalid one
+ * - InvalidSerCommitSeqNo is used to indicate a transaction that
+ * hasn't committed yet, so use a number greater than all valid
+ * ones to make comparison do the expected thing
+ * - RecoverySerCommitSeqNo is used to refer to transactions that
+ * happened before a crash/recovery, since we restart the sequence
+ * at that point. It's earlier than all normal sequence numbers,
+ * and is only used by recovered prepared transactions
+ */
+#define InvalidSerCommitSeqNo ((SerCommitSeqNo) PG_UINT64_MAX)
+#define RecoverySerCommitSeqNo ((SerCommitSeqNo) 1)
+#define FirstNormalSerCommitSeqNo ((SerCommitSeqNo) 2)
+
+/*
+ * The SERIALIZABLEXACT struct contains information needed for each
+ * serializable database transaction to support SSI techniques.
+ *
+ * A home-grown list is maintained in shared memory to manage these.
+ * An entry is used when the serializable transaction acquires a snapshot.
+ * Unless the transaction is rolled back, this entry must generally remain
+ * until all concurrent transactions have completed. (There are special
+ * optimizations for READ ONLY transactions which often allow them to be
+ * cleaned up earlier.) A transaction which is rolled back is cleaned up
+ * as soon as possible.
+ *
+ * Eligibility for cleanup of committed transactions is generally determined
+ * by comparing the transaction's finishedBefore field to
+ * SxactGlobalXmin.
+ */
+typedef struct SERIALIZABLEXACT
+{
+ VirtualTransactionId vxid; /* The executing process always has one of
+ * these. */
+
+ /*
+ * We use two numbers to track the order that transactions commit. Before
+ * commit, a transaction is marked as prepared, and prepareSeqNo is set.
+ * Shortly after commit, it's marked as committed, and commitSeqNo is set.
+ * This doesn't give a strict commit order, but these two values together
+ * are good enough for us, as we can always err on the safe side and
+ * assume that there's a conflict, if we can't be sure of the exact
+ * ordering of two commits.
+ *
+ * Note that a transaction is marked as prepared for a short period during
+ * commit processing, even if two-phase commit is not used. But with
+ * two-phase commit, a transaction can stay in prepared state for some
+ * time.
+ */
+ SerCommitSeqNo prepareSeqNo;
+ SerCommitSeqNo commitSeqNo;
+
+ /* these values are not both interesting at the same time */
+ union
+ {
+ SerCommitSeqNo earliestOutConflictCommit; /* when committed with
+ * conflict out */
+ SerCommitSeqNo lastCommitBeforeSnapshot; /* when not committed or
+ * no conflict out */
+ } SeqNo;
+ SHM_QUEUE outConflicts; /* list of write transactions whose data we
+ * couldn't read. */
+ SHM_QUEUE inConflicts; /* list of read transactions which couldn't
+ * see our write. */
+ SHM_QUEUE predicateLocks; /* list of associated PREDICATELOCK objects */
+ SHM_QUEUE finishedLink; /* list link in
+ * FinishedSerializableTransactions */
+
+ /*
+ * perXactPredicateListLock is only used in parallel queries: it protects
+ * this SERIALIZABLEXACT's predicate lock list against other workers of
+ * the same session.
+ */
+ LWLock perXactPredicateListLock;
+
+ /*
+ * for r/o transactions: list of concurrent r/w transactions that we could
+ * potentially have conflicts with, and vice versa for r/w transactions
+ */
+ SHM_QUEUE possibleUnsafeConflicts;
+
+ TransactionId topXid; /* top level xid for the transaction, if one
+ * exists; else invalid */
+ TransactionId finishedBefore; /* invalid means still running; else the
+ * struct expires when no serializable
+ * xids are before this. */
+ TransactionId xmin; /* the transaction's snapshot xmin */
+ uint32 flags; /* OR'd combination of values defined below */
+ int pid; /* pid of associated process */
+} SERIALIZABLEXACT;
+
+#define SXACT_FLAG_COMMITTED 0x00000001 /* already committed */
+#define SXACT_FLAG_PREPARED 0x00000002 /* about to commit */
+#define SXACT_FLAG_ROLLED_BACK 0x00000004 /* already rolled back */
+#define SXACT_FLAG_DOOMED 0x00000008 /* will roll back */
+/*
+ * The following flag actually means that the flagged transaction has a
+ * conflict out *to a transaction which committed ahead of it*. It's hard
+ * to get that into a name of a reasonable length.
+ */
+#define SXACT_FLAG_CONFLICT_OUT 0x00000010
+#define SXACT_FLAG_READ_ONLY 0x00000020
+#define SXACT_FLAG_DEFERRABLE_WAITING 0x00000040
+#define SXACT_FLAG_RO_SAFE 0x00000080
+#define SXACT_FLAG_RO_UNSAFE 0x00000100
+#define SXACT_FLAG_SUMMARY_CONFLICT_IN 0x00000200
+#define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400
+/*
+ * The following flag means the transaction has been partially released
+ * already, but is being preserved because parallel workers might have a
+ * reference to it. It'll be recycled by the leader at end-of-transaction.
+ */
+#define SXACT_FLAG_PARTIALLY_RELEASED 0x00000800
+
+/*
+ * The following types are used to provide an ad hoc list for holding
+ * SERIALIZABLEXACT objects. An HTAB is overkill, since there is no need to
+ * access these by key -- there are direct pointers to these objects where
+ * needed. If a shared memory list is created, these types can probably be
+ * eliminated in favor of using the general solution.
+ */
+typedef struct PredXactListElementData
+{
+ SHM_QUEUE link;
+ SERIALIZABLEXACT sxact;
+} PredXactListElementData;
+
+typedef struct PredXactListElementData *PredXactListElement;
+
+#define PredXactListElementDataSize \
+ ((Size)MAXALIGN(sizeof(PredXactListElementData)))
+
+typedef struct PredXactListData
+{
+ SHM_QUEUE availableList;
+ SHM_QUEUE activeList;
+
+ /*
+ * These global variables are maintained when registering and cleaning up
+ * serializable transactions. They must be global across all backends,
+ * but are not needed outside the predicate.c source file. Protected by
+ * SerializableXactHashLock.
+ */
+ TransactionId SxactGlobalXmin; /* global xmin for active serializable
+ * transactions */
+ int SxactGlobalXminCount; /* how many active serializable
+ * transactions have this xmin */
+ int WritableSxactCount; /* how many non-read-only serializable
+ * transactions are active */
+ SerCommitSeqNo LastSxactCommitSeqNo; /* a strictly monotonically
+ * increasing number for commits
+ * of serializable transactions */
+ /* Protected by SerializableXactHashLock. */
+ SerCommitSeqNo CanPartialClearThrough; /* can clear predicate locks and
+ * inConflicts for committed
+ * transactions through this seq
+ * no */
+ /* Protected by SerializableFinishedListLock. */
+ SerCommitSeqNo HavePartialClearedThrough; /* have cleared through this
+ * seq no */
+ SERIALIZABLEXACT *OldCommittedSxact; /* shared copy of dummy sxact */
+
+ PredXactListElement element;
+} PredXactListData;
+
+typedef struct PredXactListData *PredXactList;
+
+#define PredXactListDataSize \
+ ((Size)MAXALIGN(sizeof(PredXactListData)))
+
+
+/*
+ * The following types are used to provide lists of rw-conflicts between
+ * pairs of transactions. Since exactly the same information is needed,
+ * they are also used to record possible unsafe transaction relationships
+ * for purposes of identifying safe snapshots for read-only transactions.
+ *
+ * When a RWConflictData is not in use to record either type of relationship
+ * between a pair of transactions, it is kept on an "available" list. The
+ * outLink field is used for maintaining that list.
+ */
+typedef struct RWConflictData
+{
+ SHM_QUEUE outLink; /* link for list of conflicts out from a sxact */
+ SHM_QUEUE inLink; /* link for list of conflicts in to a sxact */
+ SERIALIZABLEXACT *sxactOut;
+ SERIALIZABLEXACT *sxactIn;
+} RWConflictData;
+
+typedef struct RWConflictData *RWConflict;
+
+#define RWConflictDataSize \
+ ((Size)MAXALIGN(sizeof(RWConflictData)))
+
+typedef struct RWConflictPoolHeaderData
+{
+ SHM_QUEUE availableList;
+ RWConflict element;
+} RWConflictPoolHeaderData;
+
+typedef struct RWConflictPoolHeaderData *RWConflictPoolHeader;
+
+#define RWConflictPoolHeaderDataSize \
+ ((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData)))
+
+
+/*
+ * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable
+ * transaction or any of its subtransactions.
+ */
+typedef struct SERIALIZABLEXIDTAG
+{
+ TransactionId xid;
+} SERIALIZABLEXIDTAG;
+
+/*
+ * The SERIALIZABLEXID struct provides a link from a TransactionId for a
+ * serializable transaction to the related SERIALIZABLEXACT record, even if
+ * the transaction has completed and its connection has been closed.
+ *
+ * These are created as new top level transaction IDs are first assigned to
+ * transactions which are participating in predicate locking. This may
+ * never happen for a particular transaction if it doesn't write anything.
+ * They are removed with their related serializable transaction objects.
+ *
+ * The SubTransGetTopmostTransaction method is used where necessary to get
+ * from an XID which might be from a subtransaction to the top level XID.
+ */
+typedef struct SERIALIZABLEXID
+{
+ /* hash key */
+ SERIALIZABLEXIDTAG tag;
+
+ /* data */
+ SERIALIZABLEXACT *myXact; /* pointer to the top level transaction data */
+} SERIALIZABLEXID;
+
+
+/*
+ * The PREDICATELOCKTARGETTAG struct identifies a database object which can
+ * be the target of predicate locks.
+ *
+ * Note that the hash function being used doesn't properly respect tag
+ * length -- if the length of the structure isn't a multiple of four bytes it
+ * will go to a four byte boundary past the end of the tag. If you change
+ * this struct, make sure any slack space is initialized, so that any random
+ * bytes in the middle or at the end are not included in the hash.
+ *
+ * TODO SSI: If we always use the same fields for the same type of value, we
+ * should rename these. Holding off until it's clear there are no exceptions.
+ * Since indexes are relations with blocks and tuples, it's looking likely that
+ * the rename will be possible. If not, we may need to divide the last field
+ * and use part of it for a target type, so that we know how to interpret the
+ * data..
+ */
+typedef struct PREDICATELOCKTARGETTAG
+{
+ uint32 locktag_field1; /* a 32-bit ID field */
+ uint32 locktag_field2; /* a 32-bit ID field */
+ uint32 locktag_field3; /* a 32-bit ID field */
+ uint32 locktag_field4; /* a 32-bit ID field */
+} PREDICATELOCKTARGETTAG;
+
+/*
+ * The PREDICATELOCKTARGET struct represents a database object on which there
+ * are predicate locks.
+ *
+ * A hash list of these objects is maintained in shared memory. An entry is
+ * added when a predicate lock is requested on an object which doesn't
+ * already have one. An entry is removed when the last lock is removed from
+ * its list.
+ */
+typedef struct PREDICATELOCKTARGET
+{
+ /* hash key */
+ PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
+
+ /* data */
+ SHM_QUEUE predicateLocks; /* list of PREDICATELOCK objects assoc. with
+ * predicate lock target */
+} PREDICATELOCKTARGET;
+
+
+/*
+ * The PREDICATELOCKTAG struct identifies an individual predicate lock.
+ *
+ * It is the combination of predicate lock target (which is a lockable
+ * object) and a serializable transaction which has acquired a lock on that
+ * target.
+ */
+typedef struct PREDICATELOCKTAG
+{
+ PREDICATELOCKTARGET *myTarget;
+ SERIALIZABLEXACT *myXact;
+} PREDICATELOCKTAG;
+
+/*
+ * The PREDICATELOCK struct represents an individual lock.
+ *
+ * An entry can be created here when the related database object is read, or
+ * by promotion of multiple finer-grained targets. All entries related to a
+ * serializable transaction are removed when that serializable transaction is
+ * cleaned up. Entries can also be removed when they are combined into a
+ * single coarser-grained lock entry.
+ */
+typedef struct PREDICATELOCK
+{
+ /* hash key */
+ PREDICATELOCKTAG tag; /* unique identifier of lock */
+
+ /* data */
+ SHM_QUEUE targetLink; /* list link in PREDICATELOCKTARGET's list of
+ * predicate locks */
+ SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of
+ * predicate locks */
+ SerCommitSeqNo commitSeqNo; /* only used for summarized predicate locks */
+} PREDICATELOCK;
+
+
+/*
+ * The LOCALPREDICATELOCK struct represents a local copy of data which is
+ * also present in the PREDICATELOCK table, organized for fast access without
+ * needing to acquire a LWLock. It is strictly for optimization.
+ *
+ * Each serializable transaction creates its own local hash table to hold a
+ * collection of these. This information is used to determine when a number
+ * of fine-grained locks should be promoted to a single coarser-grained lock.
+ * The information is maintained more-or-less in parallel to the
+ * PREDICATELOCK data, but because this data is not protected by locks and is
+ * only used in an optimization heuristic, it is allowed to drift in a few
+ * corner cases where maintaining exact data would be expensive.
+ *
+ * The hash table is created when the serializable transaction acquires its
+ * snapshot, and its memory is released upon completion of the transaction.
+ */
+typedef struct LOCALPREDICATELOCK
+{
+ /* hash key */
+ PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
+
+ /* data */
+ bool held; /* is lock held, or just its children? */
+ int childLocks; /* number of child locks currently held */
+} LOCALPREDICATELOCK;
+
+
+/*
+ * The types of predicate locks which can be acquired.
+ */
+typedef enum PredicateLockTargetType
+{
+ PREDLOCKTAG_RELATION,
+ PREDLOCKTAG_PAGE,
+ PREDLOCKTAG_TUPLE
+ /* TODO SSI: Other types may be needed for index locking */
+} PredicateLockTargetType;
+
+
+/*
+ * This structure is used to quickly capture a copy of all predicate
+ * locks. This is currently used only by the pg_lock_status function,
+ * which in turn is used by the pg_locks view.
+ */
+typedef struct PredicateLockData
+{
+ int nelements;
+ PREDICATELOCKTARGETTAG *locktags;
+ SERIALIZABLEXACT *xacts;
+} PredicateLockData;
+
+
+/*
+ * These macros define how we map logical IDs of lockable objects into the
+ * physical fields of PREDICATELOCKTARGETTAG. Use these to set up values,
+ * rather than accessing the fields directly. Note multiple eval of target!
+ */
+#define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = (reloid), \
+ (locktag).locktag_field3 = InvalidBlockNumber, \
+ (locktag).locktag_field4 = InvalidOffsetNumber)
+
+#define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = (reloid), \
+ (locktag).locktag_field3 = (blocknum), \
+ (locktag).locktag_field4 = InvalidOffsetNumber)
+
+#define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = (reloid), \
+ (locktag).locktag_field3 = (blocknum), \
+ (locktag).locktag_field4 = (offnum))
+
+#define GET_PREDICATELOCKTARGETTAG_DB(locktag) \
+ ((Oid) (locktag).locktag_field1)
+#define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \
+ ((Oid) (locktag).locktag_field2)
+#define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \
+ ((BlockNumber) (locktag).locktag_field3)
+#define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \
+ ((OffsetNumber) (locktag).locktag_field4)
+#define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \
+ (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \
+ (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE : \
+ PREDLOCKTAG_RELATION))
+
+/*
+ * Two-phase commit statefile records. There are two types: for each
+ * transaction, we generate one per-transaction record and a variable
+ * number of per-predicate-lock records.
+ */
+typedef enum TwoPhasePredicateRecordType
+{
+ TWOPHASEPREDICATERECORD_XACT,
+ TWOPHASEPREDICATERECORD_LOCK
+} TwoPhasePredicateRecordType;
+
+/*
+ * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not
+ * much is needed because most of it not meaningful for a recovered
+ * prepared transaction.
+ *
+ * In particular, we do not record the in and out conflict lists for a
+ * prepared transaction because the associated SERIALIZABLEXACTs will
+ * not be available after recovery. Instead, we simply record the
+ * existence of each type of conflict by setting the transaction's
+ * summary conflict in/out flag.
+ */
+typedef struct TwoPhasePredicateXactRecord
+{
+ TransactionId xmin;
+ uint32 flags;
+} TwoPhasePredicateXactRecord;
+
+/* Per-lock state */
+typedef struct TwoPhasePredicateLockRecord
+{
+ PREDICATELOCKTARGETTAG target;
+ uint32 filler; /* to avoid length change in back-patched fix */
+} TwoPhasePredicateLockRecord;
+
+typedef struct TwoPhasePredicateRecord
+{
+ TwoPhasePredicateRecordType type;
+ union
+ {
+ TwoPhasePredicateXactRecord xactRecord;
+ TwoPhasePredicateLockRecord lockRecord;
+ } data;
+} TwoPhasePredicateRecord;
+
+/*
+ * Define a macro to use for an "empty" SERIALIZABLEXACT reference.
+ */
+#define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL)
+
+
+/*
+ * Function definitions for functions needing awareness of predicate
+ * locking internals.
+ */
+extern PredicateLockData *GetPredicateLockStatusData(void);
+extern int GetSafeSnapshotBlockingPids(int blocked_pid,
+ int *output, int output_size);
+
+#endif /* PREDICATE_INTERNALS_H */