diff options
Diffstat (limited to 'src/include/storage/predicate_internals.h')
-rw-r--r-- | src/include/storage/predicate_internals.h | 493 |
1 files changed, 493 insertions, 0 deletions
diff --git a/src/include/storage/predicate_internals.h b/src/include/storage/predicate_internals.h new file mode 100644 index 0000000..104f560 --- /dev/null +++ b/src/include/storage/predicate_internals.h @@ -0,0 +1,493 @@ +/*------------------------------------------------------------------------- + * + * predicate_internals.h + * POSTGRES internal predicate locking definitions. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/predicate_internals.h + * + *------------------------------------------------------------------------- + */ +#ifndef PREDICATE_INTERNALS_H +#define PREDICATE_INTERNALS_H + +#include "storage/lock.h" +#include "storage/lwlock.h" + +/* + * Commit number. + */ +typedef uint64 SerCommitSeqNo; + +/* + * Reserved commit sequence numbers: + * - 0 is reserved to indicate a non-existent SLRU entry; it cannot be + * used as a SerCommitSeqNo, even an invalid one + * - InvalidSerCommitSeqNo is used to indicate a transaction that + * hasn't committed yet, so use a number greater than all valid + * ones to make comparison do the expected thing + * - RecoverySerCommitSeqNo is used to refer to transactions that + * happened before a crash/recovery, since we restart the sequence + * at that point. It's earlier than all normal sequence numbers, + * and is only used by recovered prepared transactions + */ +#define InvalidSerCommitSeqNo ((SerCommitSeqNo) PG_UINT64_MAX) +#define RecoverySerCommitSeqNo ((SerCommitSeqNo) 1) +#define FirstNormalSerCommitSeqNo ((SerCommitSeqNo) 2) + +/* + * The SERIALIZABLEXACT struct contains information needed for each + * serializable database transaction to support SSI techniques. + * + * A home-grown list is maintained in shared memory to manage these. + * An entry is used when the serializable transaction acquires a snapshot. + * Unless the transaction is rolled back, this entry must generally remain + * until all concurrent transactions have completed. (There are special + * optimizations for READ ONLY transactions which often allow them to be + * cleaned up earlier.) A transaction which is rolled back is cleaned up + * as soon as possible. + * + * Eligibility for cleanup of committed transactions is generally determined + * by comparing the transaction's finishedBefore field to + * SxactGlobalXmin. + */ +typedef struct SERIALIZABLEXACT +{ + VirtualTransactionId vxid; /* The executing process always has one of + * these. */ + + /* + * We use two numbers to track the order that transactions commit. Before + * commit, a transaction is marked as prepared, and prepareSeqNo is set. + * Shortly after commit, it's marked as committed, and commitSeqNo is set. + * This doesn't give a strict commit order, but these two values together + * are good enough for us, as we can always err on the safe side and + * assume that there's a conflict, if we can't be sure of the exact + * ordering of two commits. + * + * Note that a transaction is marked as prepared for a short period during + * commit processing, even if two-phase commit is not used. But with + * two-phase commit, a transaction can stay in prepared state for some + * time. + */ + SerCommitSeqNo prepareSeqNo; + SerCommitSeqNo commitSeqNo; + + /* these values are not both interesting at the same time */ + union + { + SerCommitSeqNo earliestOutConflictCommit; /* when committed with + * conflict out */ + SerCommitSeqNo lastCommitBeforeSnapshot; /* when not committed or + * no conflict out */ + } SeqNo; + SHM_QUEUE outConflicts; /* list of write transactions whose data we + * couldn't read. */ + SHM_QUEUE inConflicts; /* list of read transactions which couldn't + * see our write. */ + SHM_QUEUE predicateLocks; /* list of associated PREDICATELOCK objects */ + SHM_QUEUE finishedLink; /* list link in + * FinishedSerializableTransactions */ + + /* + * perXactPredicateListLock is only used in parallel queries: it protects + * this SERIALIZABLEXACT's predicate lock list against other workers of + * the same session. + */ + LWLock perXactPredicateListLock; + + /* + * for r/o transactions: list of concurrent r/w transactions that we could + * potentially have conflicts with, and vice versa for r/w transactions + */ + SHM_QUEUE possibleUnsafeConflicts; + + TransactionId topXid; /* top level xid for the transaction, if one + * exists; else invalid */ + TransactionId finishedBefore; /* invalid means still running; else the + * struct expires when no serializable + * xids are before this. */ + TransactionId xmin; /* the transaction's snapshot xmin */ + uint32 flags; /* OR'd combination of values defined below */ + int pid; /* pid of associated process */ +} SERIALIZABLEXACT; + +#define SXACT_FLAG_COMMITTED 0x00000001 /* already committed */ +#define SXACT_FLAG_PREPARED 0x00000002 /* about to commit */ +#define SXACT_FLAG_ROLLED_BACK 0x00000004 /* already rolled back */ +#define SXACT_FLAG_DOOMED 0x00000008 /* will roll back */ +/* + * The following flag actually means that the flagged transaction has a + * conflict out *to a transaction which committed ahead of it*. It's hard + * to get that into a name of a reasonable length. + */ +#define SXACT_FLAG_CONFLICT_OUT 0x00000010 +#define SXACT_FLAG_READ_ONLY 0x00000020 +#define SXACT_FLAG_DEFERRABLE_WAITING 0x00000040 +#define SXACT_FLAG_RO_SAFE 0x00000080 +#define SXACT_FLAG_RO_UNSAFE 0x00000100 +#define SXACT_FLAG_SUMMARY_CONFLICT_IN 0x00000200 +#define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400 +/* + * The following flag means the transaction has been partially released + * already, but is being preserved because parallel workers might have a + * reference to it. It'll be recycled by the leader at end-of-transaction. + */ +#define SXACT_FLAG_PARTIALLY_RELEASED 0x00000800 + +/* + * The following types are used to provide an ad hoc list for holding + * SERIALIZABLEXACT objects. An HTAB is overkill, since there is no need to + * access these by key -- there are direct pointers to these objects where + * needed. If a shared memory list is created, these types can probably be + * eliminated in favor of using the general solution. + */ +typedef struct PredXactListElementData +{ + SHM_QUEUE link; + SERIALIZABLEXACT sxact; +} PredXactListElementData; + +typedef struct PredXactListElementData *PredXactListElement; + +#define PredXactListElementDataSize \ + ((Size)MAXALIGN(sizeof(PredXactListElementData))) + +typedef struct PredXactListData +{ + SHM_QUEUE availableList; + SHM_QUEUE activeList; + + /* + * These global variables are maintained when registering and cleaning up + * serializable transactions. They must be global across all backends, + * but are not needed outside the predicate.c source file. Protected by + * SerializableXactHashLock. + */ + TransactionId SxactGlobalXmin; /* global xmin for active serializable + * transactions */ + int SxactGlobalXminCount; /* how many active serializable + * transactions have this xmin */ + int WritableSxactCount; /* how many non-read-only serializable + * transactions are active */ + SerCommitSeqNo LastSxactCommitSeqNo; /* a strictly monotonically + * increasing number for commits + * of serializable transactions */ + /* Protected by SerializableXactHashLock. */ + SerCommitSeqNo CanPartialClearThrough; /* can clear predicate locks and + * inConflicts for committed + * transactions through this seq + * no */ + /* Protected by SerializableFinishedListLock. */ + SerCommitSeqNo HavePartialClearedThrough; /* have cleared through this + * seq no */ + SERIALIZABLEXACT *OldCommittedSxact; /* shared copy of dummy sxact */ + + PredXactListElement element; +} PredXactListData; + +typedef struct PredXactListData *PredXactList; + +#define PredXactListDataSize \ + ((Size)MAXALIGN(sizeof(PredXactListData))) + + +/* + * The following types are used to provide lists of rw-conflicts between + * pairs of transactions. Since exactly the same information is needed, + * they are also used to record possible unsafe transaction relationships + * for purposes of identifying safe snapshots for read-only transactions. + * + * When a RWConflictData is not in use to record either type of relationship + * between a pair of transactions, it is kept on an "available" list. The + * outLink field is used for maintaining that list. + */ +typedef struct RWConflictData +{ + SHM_QUEUE outLink; /* link for list of conflicts out from a sxact */ + SHM_QUEUE inLink; /* link for list of conflicts in to a sxact */ + SERIALIZABLEXACT *sxactOut; + SERIALIZABLEXACT *sxactIn; +} RWConflictData; + +typedef struct RWConflictData *RWConflict; + +#define RWConflictDataSize \ + ((Size)MAXALIGN(sizeof(RWConflictData))) + +typedef struct RWConflictPoolHeaderData +{ + SHM_QUEUE availableList; + RWConflict element; +} RWConflictPoolHeaderData; + +typedef struct RWConflictPoolHeaderData *RWConflictPoolHeader; + +#define RWConflictPoolHeaderDataSize \ + ((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData))) + + +/* + * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable + * transaction or any of its subtransactions. + */ +typedef struct SERIALIZABLEXIDTAG +{ + TransactionId xid; +} SERIALIZABLEXIDTAG; + +/* + * The SERIALIZABLEXID struct provides a link from a TransactionId for a + * serializable transaction to the related SERIALIZABLEXACT record, even if + * the transaction has completed and its connection has been closed. + * + * These are created as new top level transaction IDs are first assigned to + * transactions which are participating in predicate locking. This may + * never happen for a particular transaction if it doesn't write anything. + * They are removed with their related serializable transaction objects. + * + * The SubTransGetTopmostTransaction method is used where necessary to get + * from an XID which might be from a subtransaction to the top level XID. + */ +typedef struct SERIALIZABLEXID +{ + /* hash key */ + SERIALIZABLEXIDTAG tag; + + /* data */ + SERIALIZABLEXACT *myXact; /* pointer to the top level transaction data */ +} SERIALIZABLEXID; + + +/* + * The PREDICATELOCKTARGETTAG struct identifies a database object which can + * be the target of predicate locks. + * + * Note that the hash function being used doesn't properly respect tag + * length -- if the length of the structure isn't a multiple of four bytes it + * will go to a four byte boundary past the end of the tag. If you change + * this struct, make sure any slack space is initialized, so that any random + * bytes in the middle or at the end are not included in the hash. + * + * TODO SSI: If we always use the same fields for the same type of value, we + * should rename these. Holding off until it's clear there are no exceptions. + * Since indexes are relations with blocks and tuples, it's looking likely that + * the rename will be possible. If not, we may need to divide the last field + * and use part of it for a target type, so that we know how to interpret the + * data.. + */ +typedef struct PREDICATELOCKTARGETTAG +{ + uint32 locktag_field1; /* a 32-bit ID field */ + uint32 locktag_field2; /* a 32-bit ID field */ + uint32 locktag_field3; /* a 32-bit ID field */ + uint32 locktag_field4; /* a 32-bit ID field */ +} PREDICATELOCKTARGETTAG; + +/* + * The PREDICATELOCKTARGET struct represents a database object on which there + * are predicate locks. + * + * A hash list of these objects is maintained in shared memory. An entry is + * added when a predicate lock is requested on an object which doesn't + * already have one. An entry is removed when the last lock is removed from + * its list. + */ +typedef struct PREDICATELOCKTARGET +{ + /* hash key */ + PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */ + + /* data */ + SHM_QUEUE predicateLocks; /* list of PREDICATELOCK objects assoc. with + * predicate lock target */ +} PREDICATELOCKTARGET; + + +/* + * The PREDICATELOCKTAG struct identifies an individual predicate lock. + * + * It is the combination of predicate lock target (which is a lockable + * object) and a serializable transaction which has acquired a lock on that + * target. + */ +typedef struct PREDICATELOCKTAG +{ + PREDICATELOCKTARGET *myTarget; + SERIALIZABLEXACT *myXact; +} PREDICATELOCKTAG; + +/* + * The PREDICATELOCK struct represents an individual lock. + * + * An entry can be created here when the related database object is read, or + * by promotion of multiple finer-grained targets. All entries related to a + * serializable transaction are removed when that serializable transaction is + * cleaned up. Entries can also be removed when they are combined into a + * single coarser-grained lock entry. + */ +typedef struct PREDICATELOCK +{ + /* hash key */ + PREDICATELOCKTAG tag; /* unique identifier of lock */ + + /* data */ + SHM_QUEUE targetLink; /* list link in PREDICATELOCKTARGET's list of + * predicate locks */ + SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of + * predicate locks */ + SerCommitSeqNo commitSeqNo; /* only used for summarized predicate locks */ +} PREDICATELOCK; + + +/* + * The LOCALPREDICATELOCK struct represents a local copy of data which is + * also present in the PREDICATELOCK table, organized for fast access without + * needing to acquire a LWLock. It is strictly for optimization. + * + * Each serializable transaction creates its own local hash table to hold a + * collection of these. This information is used to determine when a number + * of fine-grained locks should be promoted to a single coarser-grained lock. + * The information is maintained more-or-less in parallel to the + * PREDICATELOCK data, but because this data is not protected by locks and is + * only used in an optimization heuristic, it is allowed to drift in a few + * corner cases where maintaining exact data would be expensive. + * + * The hash table is created when the serializable transaction acquires its + * snapshot, and its memory is released upon completion of the transaction. + */ +typedef struct LOCALPREDICATELOCK +{ + /* hash key */ + PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */ + + /* data */ + bool held; /* is lock held, or just its children? */ + int childLocks; /* number of child locks currently held */ +} LOCALPREDICATELOCK; + + +/* + * The types of predicate locks which can be acquired. + */ +typedef enum PredicateLockTargetType +{ + PREDLOCKTAG_RELATION, + PREDLOCKTAG_PAGE, + PREDLOCKTAG_TUPLE + /* TODO SSI: Other types may be needed for index locking */ +} PredicateLockTargetType; + + +/* + * This structure is used to quickly capture a copy of all predicate + * locks. This is currently used only by the pg_lock_status function, + * which in turn is used by the pg_locks view. + */ +typedef struct PredicateLockData +{ + int nelements; + PREDICATELOCKTARGETTAG *locktags; + SERIALIZABLEXACT *xacts; +} PredicateLockData; + + +/* + * These macros define how we map logical IDs of lockable objects into the + * physical fields of PREDICATELOCKTARGETTAG. Use these to set up values, + * rather than accessing the fields directly. Note multiple eval of target! + */ +#define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = InvalidBlockNumber, \ + (locktag).locktag_field4 = InvalidOffsetNumber) + +#define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = (blocknum), \ + (locktag).locktag_field4 = InvalidOffsetNumber) + +#define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = (blocknum), \ + (locktag).locktag_field4 = (offnum)) + +#define GET_PREDICATELOCKTARGETTAG_DB(locktag) \ + ((Oid) (locktag).locktag_field1) +#define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \ + ((Oid) (locktag).locktag_field2) +#define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \ + ((BlockNumber) (locktag).locktag_field3) +#define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \ + ((OffsetNumber) (locktag).locktag_field4) +#define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \ + (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \ + (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE : \ + PREDLOCKTAG_RELATION)) + +/* + * Two-phase commit statefile records. There are two types: for each + * transaction, we generate one per-transaction record and a variable + * number of per-predicate-lock records. + */ +typedef enum TwoPhasePredicateRecordType +{ + TWOPHASEPREDICATERECORD_XACT, + TWOPHASEPREDICATERECORD_LOCK +} TwoPhasePredicateRecordType; + +/* + * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not + * much is needed because most of it not meaningful for a recovered + * prepared transaction. + * + * In particular, we do not record the in and out conflict lists for a + * prepared transaction because the associated SERIALIZABLEXACTs will + * not be available after recovery. Instead, we simply record the + * existence of each type of conflict by setting the transaction's + * summary conflict in/out flag. + */ +typedef struct TwoPhasePredicateXactRecord +{ + TransactionId xmin; + uint32 flags; +} TwoPhasePredicateXactRecord; + +/* Per-lock state */ +typedef struct TwoPhasePredicateLockRecord +{ + PREDICATELOCKTARGETTAG target; + uint32 filler; /* to avoid length change in back-patched fix */ +} TwoPhasePredicateLockRecord; + +typedef struct TwoPhasePredicateRecord +{ + TwoPhasePredicateRecordType type; + union + { + TwoPhasePredicateXactRecord xactRecord; + TwoPhasePredicateLockRecord lockRecord; + } data; +} TwoPhasePredicateRecord; + +/* + * Define a macro to use for an "empty" SERIALIZABLEXACT reference. + */ +#define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL) + + +/* + * Function definitions for functions needing awareness of predicate + * locking internals. + */ +extern PredicateLockData *GetPredicateLockStatusData(void); +extern int GetSafeSnapshotBlockingPids(int blocked_pid, + int *output, int output_size); + +#endif /* PREDICATE_INTERNALS_H */ |