diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-16 19:46:48 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-16 19:46:48 +0000 |
commit | 311bcfc6b3acdd6fd152798c7f287ddf74fa2a98 (patch) | |
tree | 0ec307299b1dada3701e42f4ca6eda57d708261e /src/include/storage | |
parent | Initial commit. (diff) | |
download | postgresql-15-311bcfc6b3acdd6fd152798c7f287ddf74fa2a98.tar.xz postgresql-15-311bcfc6b3acdd6fd152798c7f287ddf74fa2a98.zip |
Adding upstream version 15.4.upstream/15.4upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/include/storage')
56 files changed, 7994 insertions, 0 deletions
diff --git a/src/include/storage/.gitignore b/src/include/storage/.gitignore new file mode 100644 index 0000000..209c8be --- /dev/null +++ b/src/include/storage/.gitignore @@ -0,0 +1 @@ +/lwlocknames.h diff --git a/src/include/storage/backendid.h b/src/include/storage/backendid.h new file mode 100644 index 0000000..93d5b50 --- /dev/null +++ b/src/include/storage/backendid.h @@ -0,0 +1,37 @@ +/*------------------------------------------------------------------------- + * + * backendid.h + * POSTGRES backend id communication definitions + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/backendid.h + * + *------------------------------------------------------------------------- + */ +#ifndef BACKENDID_H +#define BACKENDID_H + +/* ---------------- + * -cim 8/17/90 + * ---------------- + */ +typedef int BackendId; /* unique currently active backend identifier */ + +#define InvalidBackendId (-1) + +extern PGDLLIMPORT BackendId MyBackendId; /* backend id of this backend */ + +/* backend id of our parallel session leader, or InvalidBackendId if none */ +extern PGDLLIMPORT BackendId ParallelLeaderBackendId; + +/* + * The BackendId to use for our session's temp relations is normally our own, + * but parallel workers should use their leader's ID. + */ +#define BackendIdForTempRelations() \ + (ParallelLeaderBackendId == InvalidBackendId ? MyBackendId : ParallelLeaderBackendId) + +#endif /* BACKENDID_H */ diff --git a/src/include/storage/barrier.h b/src/include/storage/barrier.h new file mode 100644 index 0000000..57d2c52 --- /dev/null +++ b/src/include/storage/barrier.h @@ -0,0 +1,46 @@ +/*------------------------------------------------------------------------- + * + * barrier.h + * Barriers for synchronizing cooperating processes. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/barrier.h + * + *------------------------------------------------------------------------- + */ +#ifndef BARRIER_H +#define BARRIER_H + +/* + * For the header previously known as "barrier.h", please include + * "port/atomics.h", which deals with atomics, compiler barriers and memory + * barriers. + */ + +#include "storage/condition_variable.h" +#include "storage/spin.h" + +typedef struct Barrier +{ + slock_t mutex; + int phase; /* phase counter */ + int participants; /* the number of participants attached */ + int arrived; /* the number of participants that have + * arrived */ + int elected; /* highest phase elected */ + bool static_party; /* used only for assertions */ + ConditionVariable condition_variable; +} Barrier; + +extern void BarrierInit(Barrier *barrier, int num_workers); +extern bool BarrierArriveAndWait(Barrier *barrier, uint32 wait_event_info); +extern bool BarrierArriveAndDetach(Barrier *barrier); +extern bool BarrierArriveAndDetachExceptLast(Barrier *barrier); +extern int BarrierAttach(Barrier *barrier); +extern bool BarrierDetach(Barrier *barrier); +extern int BarrierPhase(Barrier *barrier); +extern int BarrierParticipants(Barrier *barrier); + +#endif /* BARRIER_H */ diff --git a/src/include/storage/block.h b/src/include/storage/block.h new file mode 100644 index 0000000..d756e3f --- /dev/null +++ b/src/include/storage/block.h @@ -0,0 +1,115 @@ +/*------------------------------------------------------------------------- + * + * block.h + * POSTGRES disk block definitions. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/block.h + * + *------------------------------------------------------------------------- + */ +#ifndef BLOCK_H +#define BLOCK_H + +/* + * BlockNumber: + * + * each data file (heap or index) is divided into postgres disk blocks + * (which may be thought of as the unit of i/o -- a postgres buffer + * contains exactly one disk block). the blocks are numbered + * sequentially, 0 to 0xFFFFFFFE. + * + * InvalidBlockNumber is the same thing as P_NEW in bufmgr.h. + * + * the access methods, the buffer manager and the storage manager are + * more or less the only pieces of code that should be accessing disk + * blocks directly. + */ +typedef uint32 BlockNumber; + +#define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF) + +#define MaxBlockNumber ((BlockNumber) 0xFFFFFFFE) + +/* + * BlockId: + * + * this is a storage type for BlockNumber. in other words, this type + * is used for on-disk structures (e.g., in HeapTupleData) whereas + * BlockNumber is the type on which calculations are performed (e.g., + * in access method code). + * + * there doesn't appear to be any reason to have separate types except + * for the fact that BlockIds can be SHORTALIGN'd (and therefore any + * structures that contains them, such as ItemPointerData, can also be + * SHORTALIGN'd). this is an important consideration for reducing the + * space requirements of the line pointer (ItemIdData) array on each + * page and the header of each heap or index tuple, so it doesn't seem + * wise to change this without good reason. + */ +typedef struct BlockIdData +{ + uint16 bi_hi; + uint16 bi_lo; +} BlockIdData; + +typedef BlockIdData *BlockId; /* block identifier */ + +/* ---------------- + * support macros + * ---------------- + */ + +/* + * BlockNumberIsValid + * True iff blockNumber is valid. + */ +#define BlockNumberIsValid(blockNumber) \ + ((BlockNumber) (blockNumber) != InvalidBlockNumber) + +/* + * BlockIdIsValid + * True iff the block identifier is valid. + */ +#define BlockIdIsValid(blockId) \ + PointerIsValid(blockId) + +/* + * BlockIdSet + * Sets a block identifier to the specified value. + */ +#define BlockIdSet(blockId, blockNumber) \ +( \ + (blockId)->bi_hi = (blockNumber) >> 16, \ + (blockId)->bi_lo = (blockNumber) & 0xffff \ +) + +/* + * BlockIdCopy + * Copy a block identifier. + */ +#define BlockIdCopy(toBlockId, fromBlockId) \ +( \ + (toBlockId)->bi_hi = (fromBlockId)->bi_hi, \ + (toBlockId)->bi_lo = (fromBlockId)->bi_lo \ +) + +/* + * BlockIdEquals + * Check for block number equality. + */ +#define BlockIdEquals(blockId1, blockId2) \ + ((blockId1)->bi_hi == (blockId2)->bi_hi && \ + (blockId1)->bi_lo == (blockId2)->bi_lo) + +/* + * BlockIdGetBlockNumber + * Retrieve the block number from a block identifier. + */ +#define BlockIdGetBlockNumber(blockId) \ + ((((BlockNumber) (blockId)->bi_hi) << 16) | ((BlockNumber) (blockId)->bi_lo)) + +#endif /* BLOCK_H */ diff --git a/src/include/storage/buf.h b/src/include/storage/buf.h new file mode 100644 index 0000000..aec01ca --- /dev/null +++ b/src/include/storage/buf.h @@ -0,0 +1,46 @@ +/*------------------------------------------------------------------------- + * + * buf.h + * Basic buffer manager data types. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/buf.h + * + *------------------------------------------------------------------------- + */ +#ifndef BUF_H +#define BUF_H + +/* + * Buffer identifiers. + * + * Zero is invalid, positive is the index of a shared buffer (1..NBuffers), + * negative is the index of a local buffer (-1 .. -NLocBuffer). + */ +typedef int Buffer; + +#define InvalidBuffer 0 + +/* + * BufferIsInvalid + * True iff the buffer is invalid. + */ +#define BufferIsInvalid(buffer) ((buffer) == InvalidBuffer) + +/* + * BufferIsLocal + * True iff the buffer is local (not visible to other backends). + */ +#define BufferIsLocal(buffer) ((buffer) < 0) + +/* + * Buffer access strategy objects. + * + * BufferAccessStrategyData is private to freelist.c + */ +typedef struct BufferAccessStrategyData *BufferAccessStrategy; + +#endif /* BUF_H */ diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h new file mode 100644 index 0000000..a17e7b2 --- /dev/null +++ b/src/include/storage/buf_internals.h @@ -0,0 +1,345 @@ +/*------------------------------------------------------------------------- + * + * buf_internals.h + * Internal definitions for buffer manager and the buffer replacement + * strategy. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/buf_internals.h + * + *------------------------------------------------------------------------- + */ +#ifndef BUFMGR_INTERNALS_H +#define BUFMGR_INTERNALS_H + +#include "port/atomics.h" +#include "storage/buf.h" +#include "storage/bufmgr.h" +#include "storage/condition_variable.h" +#include "storage/latch.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" +#include "storage/smgr.h" +#include "storage/spin.h" +#include "utils/relcache.h" + +/* + * Buffer state is a single 32-bit variable where following data is combined. + * + * - 18 bits refcount + * - 4 bits usage count + * - 10 bits of flags + * + * Combining these values allows to perform some operations without locking + * the buffer header, by modifying them together with a CAS loop. + * + * The definition of buffer state components is below. + */ +#define BUF_REFCOUNT_ONE 1 +#define BUF_REFCOUNT_MASK ((1U << 18) - 1) +#define BUF_USAGECOUNT_MASK 0x003C0000U +#define BUF_USAGECOUNT_ONE (1U << 18) +#define BUF_USAGECOUNT_SHIFT 18 +#define BUF_FLAG_MASK 0xFFC00000U + +/* Get refcount and usagecount from buffer state */ +#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK) +#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT) + +/* + * Flags for buffer descriptors + * + * Note: BM_TAG_VALID essentially means that there is a buffer hashtable + * entry associated with the buffer's tag. + */ +#define BM_LOCKED (1U << 22) /* buffer header is locked */ +#define BM_DIRTY (1U << 23) /* data needs writing */ +#define BM_VALID (1U << 24) /* data is valid */ +#define BM_TAG_VALID (1U << 25) /* tag is assigned */ +#define BM_IO_IN_PROGRESS (1U << 26) /* read or write in progress */ +#define BM_IO_ERROR (1U << 27) /* previous I/O failed */ +#define BM_JUST_DIRTIED (1U << 28) /* dirtied since write started */ +#define BM_PIN_COUNT_WAITER (1U << 29) /* have waiter for sole pin */ +#define BM_CHECKPOINT_NEEDED (1U << 30) /* must write for checkpoint */ +#define BM_PERMANENT (1U << 31) /* permanent buffer (not unlogged, + * or init fork) */ +/* + * The maximum allowed value of usage_count represents a tradeoff between + * accuracy and speed of the clock-sweep buffer management algorithm. A + * large value (comparable to NBuffers) would approximate LRU semantics. + * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of + * clock sweeps to find a free buffer, so in practice we don't want the + * value to be very large. + */ +#define BM_MAX_USAGE_COUNT 5 + +/* + * Buffer tag identifies which disk block the buffer contains. + * + * Note: the BufferTag data must be sufficient to determine where to write the + * block, without reference to pg_class or pg_tablespace entries. It's + * possible that the backend flushing the buffer doesn't even believe the + * relation is visible yet (its xact may have started before the xact that + * created the rel). The storage manager must be able to cope anyway. + * + * Note: if there's any pad bytes in the struct, INIT_BUFFERTAG will have + * to be fixed to zero them, since this struct is used as a hash key. + */ +typedef struct buftag +{ + RelFileNode rnode; /* physical relation identifier */ + ForkNumber forkNum; + BlockNumber blockNum; /* blknum relative to begin of reln */ +} BufferTag; + +#define CLEAR_BUFFERTAG(a) \ +( \ + (a).rnode.spcNode = InvalidOid, \ + (a).rnode.dbNode = InvalidOid, \ + (a).rnode.relNode = InvalidOid, \ + (a).forkNum = InvalidForkNumber, \ + (a).blockNum = InvalidBlockNumber \ +) + +#define INIT_BUFFERTAG(a,xx_rnode,xx_forkNum,xx_blockNum) \ +( \ + (a).rnode = (xx_rnode), \ + (a).forkNum = (xx_forkNum), \ + (a).blockNum = (xx_blockNum) \ +) + +#define BUFFERTAGS_EQUAL(a,b) \ +( \ + RelFileNodeEquals((a).rnode, (b).rnode) && \ + (a).blockNum == (b).blockNum && \ + (a).forkNum == (b).forkNum \ +) + +/* + * The shared buffer mapping table is partitioned to reduce contention. + * To determine which partition lock a given tag requires, compute the tag's + * hash code with BufTableHashCode(), then apply BufMappingPartitionLock(). + * NB: NUM_BUFFER_PARTITIONS must be a power of 2! + */ +#define BufTableHashPartition(hashcode) \ + ((hashcode) % NUM_BUFFER_PARTITIONS) +#define BufMappingPartitionLock(hashcode) \ + (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \ + BufTableHashPartition(hashcode)].lock) +#define BufMappingPartitionLockByIndex(i) \ + (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock) + +/* + * BufferDesc -- shared descriptor/state data for a single shared buffer. + * + * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change + * tag, state or wait_backend_pgprocno fields. In general, buffer header lock + * is a spinlock which is combined with flags, refcount and usagecount into + * single atomic variable. This layout allow us to do some operations in a + * single atomic operation, without actually acquiring and releasing spinlock; + * for instance, increase or decrease refcount. buf_id field never changes + * after initialization, so does not need locking. freeNext is protected by + * the buffer_strategy_lock not buffer header lock. The LWLock can take care + * of itself. The buffer header lock is *not* used to control access to the + * data in the buffer! + * + * It's assumed that nobody changes the state field while buffer header lock + * is held. Thus buffer header lock holder can do complex updates of the + * state variable in single write, simultaneously with lock release (cleaning + * BM_LOCKED flag). On the other hand, updating of state without holding + * buffer header lock is restricted to CAS, which insure that BM_LOCKED flag + * is not set. Atomic increment/decrement, OR/AND etc. are not allowed. + * + * An exception is that if we have the buffer pinned, its tag can't change + * underneath us, so we can examine the tag without locking the buffer header. + * Also, in places we do one-time reads of the flags without bothering to + * lock the buffer header; this is generally for situations where we don't + * expect the flag bit being tested to be changing. + * + * We can't physically remove items from a disk page if another backend has + * the buffer pinned. Hence, a backend may need to wait for all other pins + * to go away. This is signaled by storing its own pgprocno into + * wait_backend_pgprocno and setting flag bit BM_PIN_COUNT_WAITER. At present, + * there can be only one such waiter per buffer. + * + * We use this same struct for local buffer headers, but the locks are not + * used and not all of the flag bits are useful either. To avoid unnecessary + * overhead, manipulations of the state field should be done without actual + * atomic operations (i.e. only pg_atomic_read_u32() and + * pg_atomic_unlocked_write_u32()). + * + * Be careful to avoid increasing the size of the struct when adding or + * reordering members. Keeping it below 64 bytes (the most common CPU + * cache line size) is fairly important for performance. + * + * Per-buffer I/O condition variables are currently kept outside this struct in + * a separate array. They could be moved in here and still fit within that + * limit on common systems, but for now that is not done. + */ +typedef struct BufferDesc +{ + BufferTag tag; /* ID of page contained in buffer */ + int buf_id; /* buffer's index number (from 0) */ + + /* state of the tag, containing flags, refcount and usagecount */ + pg_atomic_uint32 state; + + int wait_backend_pgprocno; /* backend of pin-count waiter */ + int freeNext; /* link in freelist chain */ + LWLock content_lock; /* to lock access to buffer contents */ +} BufferDesc; + +/* + * Concurrent access to buffer headers has proven to be more efficient if + * they're cache line aligned. So we force the start of the BufferDescriptors + * array to be on a cache line boundary and force the elements to be cache + * line sized. + * + * XXX: As this is primarily matters in highly concurrent workloads which + * probably all are 64bit these days, and the space wastage would be a bit + * more noticeable on 32bit systems, we don't force the stride to be cache + * line sized on those. If somebody does actual performance testing, we can + * reevaluate. + * + * Note that local buffer descriptors aren't forced to be aligned - as there's + * no concurrent access to those it's unlikely to be beneficial. + * + * We use a 64-byte cache line size here, because that's the most common + * size. Making it bigger would be a waste of memory. Even if running on a + * platform with either 32 or 128 byte line sizes, it's good to align to + * boundaries and avoid false sharing. + */ +#define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 64 : 1) + +typedef union BufferDescPadded +{ + BufferDesc bufferdesc; + char pad[BUFFERDESC_PAD_TO_SIZE]; +} BufferDescPadded; + +#define GetBufferDescriptor(id) (&BufferDescriptors[(id)].bufferdesc) +#define GetLocalBufferDescriptor(id) (&LocalBufferDescriptors[(id)]) + +#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1) + +#define BufferDescriptorGetIOCV(bdesc) \ + (&(BufferIOCVArray[(bdesc)->buf_id]).cv) +#define BufferDescriptorGetContentLock(bdesc) \ + ((LWLock*) (&(bdesc)->content_lock)) + +extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray; + +/* + * The freeNext field is either the index of the next freelist entry, + * or one of these special values: + */ +#define FREENEXT_END_OF_LIST (-1) +#define FREENEXT_NOT_IN_LIST (-2) + +/* + * Functions for acquiring/releasing a shared buffer header's spinlock. Do + * not apply these to local buffers! + */ +extern uint32 LockBufHdr(BufferDesc *desc); +#define UnlockBufHdr(desc, s) \ + do { \ + pg_write_barrier(); \ + pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \ + } while (0) + + +/* + * The PendingWriteback & WritebackContext structure are used to keep + * information about pending flush requests to be issued to the OS. + */ +typedef struct PendingWriteback +{ + /* could store different types of pending flushes here */ + BufferTag tag; +} PendingWriteback; + +/* struct forward declared in bufmgr.h */ +typedef struct WritebackContext +{ + /* pointer to the max number of writeback requests to coalesce */ + int *max_pending; + + /* current number of pending writeback requests */ + int nr_pending; + + /* pending requests */ + PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]; +} WritebackContext; + +/* in buf_init.c */ +extern PGDLLIMPORT BufferDescPadded *BufferDescriptors; +extern PGDLLIMPORT WritebackContext BackendWritebackContext; + +/* in localbuf.c */ +extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors; + +/* in bufmgr.c */ + +/* + * Structure to sort buffers per file on checkpoints. + * + * This structure is allocated per buffer in shared memory, so it should be + * kept as small as possible. + */ +typedef struct CkptSortItem +{ + Oid tsId; + Oid relNode; + ForkNumber forkNum; + BlockNumber blockNum; + int buf_id; +} CkptSortItem; + +extern PGDLLIMPORT CkptSortItem *CkptBufferIds; + +/* + * Internal buffer management routines + */ +/* bufmgr.c */ +extern void WritebackContextInit(WritebackContext *context, int *max_pending); +extern void IssuePendingWritebacks(WritebackContext *context); +extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag); + +/* freelist.c */ +extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy, + uint32 *buf_state); +extern void StrategyFreeBuffer(BufferDesc *buf); +extern bool StrategyRejectBuffer(BufferAccessStrategy strategy, + BufferDesc *buf); + +extern int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc); +extern void StrategyNotifyBgWriter(int bgwprocno); + +extern Size StrategyShmemSize(void); +extern void StrategyInitialize(bool init); +extern bool have_free_buffer(void); + +/* buf_table.c */ +extern Size BufTableShmemSize(int size); +extern void InitBufTable(int size); +extern uint32 BufTableHashCode(BufferTag *tagPtr); +extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode); +extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id); +extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode); + +/* localbuf.c */ +extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, + ForkNumber forkNum, + BlockNumber blockNum); +extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, + BlockNumber blockNum, bool *foundPtr); +extern void MarkLocalBufferDirty(Buffer buffer); +extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum, + BlockNumber firstDelBlock); +extern void DropRelFileNodeAllLocalBuffers(RelFileNode rnode); +extern void AtEOXact_LocalBuffers(bool isCommit); + +#endif /* BUFMGR_INTERNALS_H */ diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h new file mode 100644 index 0000000..a4922d1 --- /dev/null +++ b/src/include/storage/buffile.h @@ -0,0 +1,57 @@ +/*------------------------------------------------------------------------- + * + * buffile.h + * Management of large buffered temporary files. + * + * The BufFile routines provide a partial replacement for stdio atop + * virtual file descriptors managed by fd.c. Currently they only support + * buffered access to a virtual file, without any of stdio's formatting + * features. That's enough for immediate needs, but the set of facilities + * could be expanded if necessary. + * + * BufFile also supports working with temporary files that exceed the OS + * file size limit and/or the largest offset representable in an int. + * It might be better to split that out as a separately accessible module, + * but currently we have no need for oversize temp files without buffered + * access. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/buffile.h + * + *------------------------------------------------------------------------- + */ + +#ifndef BUFFILE_H +#define BUFFILE_H + +#include "storage/fileset.h" + +/* BufFile is an opaque type whose details are not known outside buffile.c. */ + +typedef struct BufFile BufFile; + +/* + * prototypes for functions in buffile.c + */ + +extern BufFile *BufFileCreateTemp(bool interXact); +extern void BufFileClose(BufFile *file); +extern size_t BufFileRead(BufFile *file, void *ptr, size_t size); +extern void BufFileWrite(BufFile *file, void *ptr, size_t size); +extern int BufFileSeek(BufFile *file, int fileno, off_t offset, int whence); +extern void BufFileTell(BufFile *file, int *fileno, off_t *offset); +extern int BufFileSeekBlock(BufFile *file, long blknum); +extern int64 BufFileSize(BufFile *file); +extern long BufFileAppend(BufFile *target, BufFile *source); + +extern BufFile *BufFileCreateFileSet(FileSet *fileset, const char *name); +extern void BufFileExportFileSet(BufFile *file); +extern BufFile *BufFileOpenFileSet(FileSet *fileset, const char *name, + int mode, bool missing_ok); +extern void BufFileDeleteFileSet(FileSet *fileset, const char *name, + bool missing_ok); +extern void BufFileTruncateFileSet(BufFile *file, int fileno, off_t offset); + +#endif /* BUFFILE_H */ diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h new file mode 100644 index 0000000..5839140 --- /dev/null +++ b/src/include/storage/bufmgr.h @@ -0,0 +1,297 @@ +/*------------------------------------------------------------------------- + * + * bufmgr.h + * POSTGRES buffer manager definitions. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/bufmgr.h + * + *------------------------------------------------------------------------- + */ +#ifndef BUFMGR_H +#define BUFMGR_H + +#include "storage/block.h" +#include "storage/buf.h" +#include "storage/bufpage.h" +#include "storage/relfilenode.h" +#include "utils/relcache.h" +#include "utils/snapmgr.h" + +typedef void *Block; + +/* Possible arguments for GetAccessStrategy() */ +typedef enum BufferAccessStrategyType +{ + BAS_NORMAL, /* Normal random access */ + BAS_BULKREAD, /* Large read-only scan (hint bit updates are + * ok) */ + BAS_BULKWRITE, /* Large multi-block write (e.g. COPY IN) */ + BAS_VACUUM /* VACUUM */ +} BufferAccessStrategyType; + +/* Possible modes for ReadBufferExtended() */ +typedef enum +{ + RBM_NORMAL, /* Normal read */ + RBM_ZERO_AND_LOCK, /* Don't read from disk, caller will + * initialize. Also locks the page. */ + RBM_ZERO_AND_CLEANUP_LOCK, /* Like RBM_ZERO_AND_LOCK, but locks the page + * in "cleanup" mode */ + RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */ + RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL + * replay; otherwise same as RBM_NORMAL */ +} ReadBufferMode; + +/* + * Type returned by PrefetchBuffer(). + */ +typedef struct PrefetchBufferResult +{ + Buffer recent_buffer; /* If valid, a hit (recheck needed!) */ + bool initiated_io; /* If true, a miss resulting in async I/O */ +} PrefetchBufferResult; + +/* forward declared, to avoid having to expose buf_internals.h here */ +struct WritebackContext; + +/* forward declared, to avoid including smgr.h here */ +struct SMgrRelationData; + +/* in globals.c ... this duplicates miscadmin.h */ +extern PGDLLIMPORT int NBuffers; + +/* in bufmgr.c */ +extern PGDLLIMPORT bool zero_damaged_pages; +extern PGDLLIMPORT int bgwriter_lru_maxpages; +extern PGDLLIMPORT double bgwriter_lru_multiplier; +extern PGDLLIMPORT bool track_io_timing; +extern PGDLLIMPORT int effective_io_concurrency; +extern PGDLLIMPORT int maintenance_io_concurrency; + +extern PGDLLIMPORT int checkpoint_flush_after; +extern PGDLLIMPORT int backend_flush_after; +extern PGDLLIMPORT int bgwriter_flush_after; + +/* in buf_init.c */ +extern PGDLLIMPORT char *BufferBlocks; + +/* in localbuf.c */ +extern PGDLLIMPORT int NLocBuffer; +extern PGDLLIMPORT Block *LocalBufferBlockPointers; +extern PGDLLIMPORT int32 *LocalRefCount; + +/* upper limit for effective_io_concurrency */ +#define MAX_IO_CONCURRENCY 1000 + +/* special block number for ReadBuffer() */ +#define P_NEW InvalidBlockNumber /* grow the file to get a new page */ + +/* + * Buffer content lock modes (mode argument for LockBuffer()) + */ +#define BUFFER_LOCK_UNLOCK 0 +#define BUFFER_LOCK_SHARE 1 +#define BUFFER_LOCK_EXCLUSIVE 2 + +/* + * These routines are beaten on quite heavily, hence the macroization. + */ + +/* + * BufferIsValid + * True iff the given buffer number is valid (either as a shared + * or local buffer). + * + * Note: For a long time this was defined the same as BufferIsPinned, + * that is it would say False if you didn't hold a pin on the buffer. + * I believe this was bogus and served only to mask logic errors. + * Code should always know whether it has a buffer reference, + * independently of the pin state. + * + * Note: For a further long time this was not quite the inverse of the + * BufferIsInvalid() macro, in that it also did sanity checks to verify + * that the buffer number was in range. Most likely, this macro was + * originally intended only to be used in assertions, but its use has + * since expanded quite a bit, and the overhead of making those checks + * even in non-assert-enabled builds can be significant. Thus, we've + * now demoted the range checks to assertions within the macro itself. + */ +#define BufferIsValid(bufnum) \ +( \ + AssertMacro((bufnum) <= NBuffers && (bufnum) >= -NLocBuffer), \ + (bufnum) != InvalidBuffer \ +) + +/* + * BufferGetBlock + * Returns a reference to a disk page image associated with a buffer. + * + * Note: + * Assumes buffer is valid. + */ +#define BufferGetBlock(buffer) \ +( \ + AssertMacro(BufferIsValid(buffer)), \ + BufferIsLocal(buffer) ? \ + LocalBufferBlockPointers[-(buffer) - 1] \ + : \ + (Block) (BufferBlocks + ((Size) ((buffer) - 1)) * BLCKSZ) \ +) + +/* + * BufferGetPageSize + * Returns the page size within a buffer. + * + * Notes: + * Assumes buffer is valid. + * + * The buffer can be a raw disk block and need not contain a valid + * (formatted) disk page. + */ +/* XXX should dig out of buffer descriptor */ +#define BufferGetPageSize(buffer) \ +( \ + AssertMacro(BufferIsValid(buffer)), \ + (Size)BLCKSZ \ +) + +/* + * BufferGetPage + * Returns the page associated with a buffer. + * + * When this is called as part of a scan, there may be a need for a nearby + * call to TestForOldSnapshot(). See the definition of that for details. + */ +#define BufferGetPage(buffer) ((Page)BufferGetBlock(buffer)) + +/* + * prototypes for functions in bufmgr.c + */ +extern PrefetchBufferResult PrefetchSharedBuffer(struct SMgrRelationData *smgr_reln, + ForkNumber forkNum, + BlockNumber blockNum); +extern PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, + BlockNumber blockNum); +extern bool ReadRecentBuffer(RelFileNode rnode, ForkNumber forkNum, + BlockNumber blockNum, Buffer recent_buffer); +extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum); +extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, + BlockNumber blockNum, ReadBufferMode mode, + BufferAccessStrategy strategy); +extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode, + ForkNumber forkNum, BlockNumber blockNum, + ReadBufferMode mode, BufferAccessStrategy strategy, + bool permanent); +extern void ReleaseBuffer(Buffer buffer); +extern void UnlockReleaseBuffer(Buffer buffer); +extern void MarkBufferDirty(Buffer buffer); +extern void IncrBufferRefCount(Buffer buffer); +extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, + BlockNumber blockNum); + +extern void InitBufferPool(void); +extern void InitBufferPoolAccess(void); +extern void AtEOXact_Buffers(bool isCommit); +extern void PrintBufferLeakWarning(Buffer buffer); +extern void CheckPointBuffers(int flags); +extern BlockNumber BufferGetBlockNumber(Buffer buffer); +extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, + ForkNumber forkNum); +extern void FlushOneBuffer(Buffer buffer); +extern void FlushRelationBuffers(Relation rel); +extern void FlushRelationsAllBuffers(struct SMgrRelationData **smgrs, int nrels); +extern void CreateAndCopyRelationData(RelFileNode src_rnode, + RelFileNode dst_rnode, + bool permanent); +extern void FlushDatabaseBuffers(Oid dbid); +extern void DropRelFileNodeBuffers(struct SMgrRelationData *smgr_reln, ForkNumber *forkNum, + int nforks, BlockNumber *firstDelBlock); +extern void DropRelFileNodesAllBuffers(struct SMgrRelationData **smgr_reln, int nnodes); +extern void DropDatabaseBuffers(Oid dbid); + +#define RelationGetNumberOfBlocks(reln) \ + RelationGetNumberOfBlocksInFork(reln, MAIN_FORKNUM) + +extern bool BufferIsPermanent(Buffer buffer); +extern XLogRecPtr BufferGetLSNAtomic(Buffer buffer); + +#ifdef NOT_USED +extern void PrintPinnedBufs(void); +#endif +extern Size BufferShmemSize(void); +extern void BufferGetTag(Buffer buffer, RelFileNode *rnode, + ForkNumber *forknum, BlockNumber *blknum); + +extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std); + +extern void UnlockBuffers(void); +extern void LockBuffer(Buffer buffer, int mode); +extern bool ConditionalLockBuffer(Buffer buffer); +extern void LockBufferForCleanup(Buffer buffer); +extern bool ConditionalLockBufferForCleanup(Buffer buffer); +extern bool IsBufferCleanupOK(Buffer buffer); +extern bool HoldingBufferPinThatDelaysRecovery(void); + +extern void AbortBufferIO(void); + +extern void BufmgrCommit(void); +extern bool BgBufferSync(struct WritebackContext *wb_context); + +extern void AtProcExit_LocalBuffers(void); + +extern void TestForOldSnapshot_impl(Snapshot snapshot, Relation relation); + +/* in freelist.c */ +extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype); +extern void FreeAccessStrategy(BufferAccessStrategy strategy); + + +/* inline functions */ + +/* + * Although this header file is nominally backend-only, certain frontend + * programs like pg_waldump include it. For compilers that emit static + * inline functions even when they're unused, that leads to unsatisfied + * external references; hence hide these with #ifndef FRONTEND. + */ + +#ifndef FRONTEND + +/* + * Check whether the given snapshot is too old to have safely read the given + * page from the given table. If so, throw a "snapshot too old" error. + * + * This test generally needs to be performed after every BufferGetPage() call + * that is executed as part of a scan. It is not needed for calls made for + * modifying the page (for example, to position to the right place to insert a + * new index tuple or for vacuuming). It may also be omitted where calls to + * lower-level functions will have already performed the test. + * + * Note that a NULL snapshot argument is allowed and causes a fast return + * without error; this is to support call sites which can be called from + * either scans or index modification areas. + * + * For best performance, keep the tests that are fastest and/or most likely to + * exclude a page from old snapshot testing near the front. + */ +static inline void +TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page) +{ + Assert(relation != NULL); + + if (old_snapshot_threshold >= 0 + && (snapshot) != NULL + && ((snapshot)->snapshot_type == SNAPSHOT_MVCC + || (snapshot)->snapshot_type == SNAPSHOT_TOAST) + && !XLogRecPtrIsInvalid((snapshot)->lsn) + && PageGetLSN(page) > (snapshot)->lsn) + TestForOldSnapshot_impl(snapshot, relation); +} + +#endif /* FRONTEND */ + +#endif /* BUFMGR_H */ diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h new file mode 100644 index 0000000..e9f253f --- /dev/null +++ b/src/include/storage/bufpage.h @@ -0,0 +1,457 @@ +/*------------------------------------------------------------------------- + * + * bufpage.h + * Standard POSTGRES buffer page definitions. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/bufpage.h + * + *------------------------------------------------------------------------- + */ +#ifndef BUFPAGE_H +#define BUFPAGE_H + +#include "access/xlogdefs.h" +#include "storage/block.h" +#include "storage/item.h" +#include "storage/off.h" + +/* + * A postgres disk page is an abstraction layered on top of a postgres + * disk block (which is simply a unit of i/o, see block.h). + * + * specifically, while a disk block can be unformatted, a postgres + * disk page is always a slotted page of the form: + * + * +----------------+---------------------------------+ + * | PageHeaderData | linp1 linp2 linp3 ... | + * +-----------+----+---------------------------------+ + * | ... linpN | | + * +-----------+--------------------------------------+ + * | ^ pd_lower | + * | | + * | v pd_upper | + * +-------------+------------------------------------+ + * | | tupleN ... | + * +-------------+------------------+-----------------+ + * | ... tuple3 tuple2 tuple1 | "special space" | + * +--------------------------------+-----------------+ + * ^ pd_special + * + * a page is full when nothing can be added between pd_lower and + * pd_upper. + * + * all blocks written out by an access method must be disk pages. + * + * EXCEPTIONS: + * + * obviously, a page is not formatted before it is initialized by + * a call to PageInit. + * + * NOTES: + * + * linp1..N form an ItemId (line pointer) array. ItemPointers point + * to a physical block number and a logical offset (line pointer + * number) within that block/page. Note that OffsetNumbers + * conventionally start at 1, not 0. + * + * tuple1..N are added "backwards" on the page. Since an ItemPointer + * offset is used to access an ItemId entry rather than an actual + * byte-offset position, tuples can be physically shuffled on a page + * whenever the need arises. This indirection also keeps crash recovery + * relatively simple, because the low-level details of page space + * management can be controlled by standard buffer page code during + * logging, and during recovery. + * + * AM-generic per-page information is kept in PageHeaderData. + * + * AM-specific per-page data (if any) is kept in the area marked "special + * space"; each AM has an "opaque" structure defined somewhere that is + * stored as the page trailer. an access method should always + * initialize its pages with PageInit and then set its own opaque + * fields. + */ + +typedef Pointer Page; + + +/* + * location (byte offset) within a page. + * + * note that this is actually limited to 2^15 because we have limited + * ItemIdData.lp_off and ItemIdData.lp_len to 15 bits (see itemid.h). + */ +typedef uint16 LocationIndex; + + +/* + * For historical reasons, the 64-bit LSN value is stored as two 32-bit + * values. + */ +typedef struct +{ + uint32 xlogid; /* high bits */ + uint32 xrecoff; /* low bits */ +} PageXLogRecPtr; + +#define PageXLogRecPtrGet(val) \ + ((uint64) (val).xlogid << 32 | (val).xrecoff) +#define PageXLogRecPtrSet(ptr, lsn) \ + ((ptr).xlogid = (uint32) ((lsn) >> 32), (ptr).xrecoff = (uint32) (lsn)) + +/* + * disk page organization + * + * space management information generic to any page + * + * pd_lsn - identifies xlog record for last change to this page. + * pd_checksum - page checksum, if set. + * pd_flags - flag bits. + * pd_lower - offset to start of free space. + * pd_upper - offset to end of free space. + * pd_special - offset to start of special space. + * pd_pagesize_version - size in bytes and page layout version number. + * pd_prune_xid - oldest XID among potentially prunable tuples on page. + * + * The LSN is used by the buffer manager to enforce the basic rule of WAL: + * "thou shalt write xlog before data". A dirty buffer cannot be dumped + * to disk until xlog has been flushed at least as far as the page's LSN. + * + * pd_checksum stores the page checksum, if it has been set for this page; + * zero is a valid value for a checksum. If a checksum is not in use then + * we leave the field unset. This will typically mean the field is zero + * though non-zero values may also be present if databases have been + * pg_upgraded from releases prior to 9.3, when the same byte offset was + * used to store the current timelineid when the page was last updated. + * Note that there is no indication on a page as to whether the checksum + * is valid or not, a deliberate design choice which avoids the problem + * of relying on the page contents to decide whether to verify it. Hence + * there are no flag bits relating to checksums. + * + * pd_prune_xid is a hint field that helps determine whether pruning will be + * useful. It is currently unused in index pages. + * + * The page version number and page size are packed together into a single + * uint16 field. This is for historical reasons: before PostgreSQL 7.3, + * there was no concept of a page version number, and doing it this way + * lets us pretend that pre-7.3 databases have page version number zero. + * We constrain page sizes to be multiples of 256, leaving the low eight + * bits available for a version number. + * + * Minimum possible page size is perhaps 64B to fit page header, opaque space + * and a minimal tuple; of course, in reality you want it much bigger, so + * the constraint on pagesize mod 256 is not an important restriction. + * On the high end, we can only support pages up to 32KB because lp_off/lp_len + * are 15 bits. + */ + +typedef struct PageHeaderData +{ + /* XXX LSN is member of *any* block, not only page-organized ones */ + PageXLogRecPtr pd_lsn; /* LSN: next byte after last byte of xlog + * record for last change to this page */ + uint16 pd_checksum; /* checksum */ + uint16 pd_flags; /* flag bits, see below */ + LocationIndex pd_lower; /* offset to start of free space */ + LocationIndex pd_upper; /* offset to end of free space */ + LocationIndex pd_special; /* offset to start of special space */ + uint16 pd_pagesize_version; + TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */ + ItemIdData pd_linp[FLEXIBLE_ARRAY_MEMBER]; /* line pointer array */ +} PageHeaderData; + +typedef PageHeaderData *PageHeader; + +/* + * pd_flags contains the following flag bits. Undefined bits are initialized + * to zero and may be used in the future. + * + * PD_HAS_FREE_LINES is set if there are any LP_UNUSED line pointers before + * pd_lower. This should be considered a hint rather than the truth, since + * changes to it are not WAL-logged. + * + * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the + * page for its new tuple version; this suggests that a prune is needed. + * Again, this is just a hint. + */ +#define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */ +#define PD_PAGE_FULL 0x0002 /* not enough free space for new tuple? */ +#define PD_ALL_VISIBLE 0x0004 /* all tuples on page are visible to + * everyone */ + +#define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */ + +/* + * Page layout version number 0 is for pre-7.3 Postgres releases. + * Releases 7.3 and 7.4 use 1, denoting a new HeapTupleHeader layout. + * Release 8.0 uses 2; it changed the HeapTupleHeader layout again. + * Release 8.1 uses 3; it redefined HeapTupleHeader infomask bits. + * Release 8.3 uses 4; it changed the HeapTupleHeader layout again, and + * added the pd_flags field (by stealing some bits from pd_tli), + * as well as adding the pd_prune_xid field (which enlarges the header). + * + * As of Release 9.3, the checksum version must also be considered when + * handling pages. + */ +#define PG_PAGE_LAYOUT_VERSION 4 +#define PG_DATA_CHECKSUM_VERSION 1 + +/* ---------------------------------------------------------------- + * page support macros + * ---------------------------------------------------------------- + */ + +/* + * PageIsValid + * True iff page is valid. + */ +#define PageIsValid(page) PointerIsValid(page) + +/* + * line pointer(s) do not count as part of header + */ +#define SizeOfPageHeaderData (offsetof(PageHeaderData, pd_linp)) + +/* + * PageIsEmpty + * returns true iff no itemid has been allocated on the page + */ +#define PageIsEmpty(page) \ + (((PageHeader) (page))->pd_lower <= SizeOfPageHeaderData) + +/* + * PageIsNew + * returns true iff page has not been initialized (by PageInit) + */ +#define PageIsNew(page) (((PageHeader) (page))->pd_upper == 0) + +/* + * PageGetItemId + * Returns an item identifier of a page. + */ +#define PageGetItemId(page, offsetNumber) \ + ((ItemId) (&((PageHeader) (page))->pd_linp[(offsetNumber) - 1])) + +/* + * PageGetContents + * To be used in cases where the page does not contain line pointers. + * + * Note: prior to 8.3 this was not guaranteed to yield a MAXALIGN'd result. + * Now it is. Beware of old code that might think the offset to the contents + * is just SizeOfPageHeaderData rather than MAXALIGN(SizeOfPageHeaderData). + */ +#define PageGetContents(page) \ + ((char *) (page) + MAXALIGN(SizeOfPageHeaderData)) + +/* ---------------- + * macros to access page size info + * ---------------- + */ + +/* + * PageSizeIsValid + * True iff the page size is valid. + */ +#define PageSizeIsValid(pageSize) ((pageSize) == BLCKSZ) + +/* + * PageGetPageSize + * Returns the page size of a page. + * + * this can only be called on a formatted page (unlike + * BufferGetPageSize, which can be called on an unformatted page). + * however, it can be called on a page that is not stored in a buffer. + */ +#define PageGetPageSize(page) \ + ((Size) (((PageHeader) (page))->pd_pagesize_version & (uint16) 0xFF00)) + +/* + * PageGetPageLayoutVersion + * Returns the page layout version of a page. + */ +#define PageGetPageLayoutVersion(page) \ + (((PageHeader) (page))->pd_pagesize_version & 0x00FF) + +/* + * PageSetPageSizeAndVersion + * Sets the page size and page layout version number of a page. + * + * We could support setting these two values separately, but there's + * no real need for it at the moment. + */ +#define PageSetPageSizeAndVersion(page, size, version) \ +( \ + AssertMacro(((size) & 0xFF00) == (size)), \ + AssertMacro(((version) & 0x00FF) == (version)), \ + ((PageHeader) (page))->pd_pagesize_version = (size) | (version) \ +) + +/* ---------------- + * page special data macros + * ---------------- + */ +/* + * PageGetSpecialSize + * Returns size of special space on a page. + */ +#define PageGetSpecialSize(page) \ + ((uint16) (PageGetPageSize(page) - ((PageHeader)(page))->pd_special)) + +/* + * Using assertions, validate that the page special pointer is OK. + * + * This is intended to catch use of the pointer before page initialization. + * It is implemented as a function due to the limitations of the MSVC + * compiler, which choked on doing all these tests within another macro. We + * return true so that AssertMacro() can be used while still getting the + * specifics from the macro failure within this function. + */ +static inline bool +PageValidateSpecialPointer(Page page) +{ + Assert(PageIsValid(page)); + Assert(((PageHeader) (page))->pd_special <= BLCKSZ); + Assert(((PageHeader) (page))->pd_special >= SizeOfPageHeaderData); + + return true; +} + +/* + * PageGetSpecialPointer + * Returns pointer to special space on a page. + */ +#define PageGetSpecialPointer(page) \ +( \ + AssertMacro(PageValidateSpecialPointer(page)), \ + (char *) ((char *) (page) + ((PageHeader) (page))->pd_special) \ +) + +/* + * PageGetItem + * Retrieves an item on the given page. + * + * Note: + * This does not change the status of any of the resources passed. + * The semantics may change in the future. + */ +#define PageGetItem(page, itemId) \ +( \ + AssertMacro(PageIsValid(page)), \ + AssertMacro(ItemIdHasStorage(itemId)), \ + (Item)(((char *)(page)) + ItemIdGetOffset(itemId)) \ +) + +/* + * PageGetMaxOffsetNumber + * Returns the maximum offset number used by the given page. + * Since offset numbers are 1-based, this is also the number + * of items on the page. + * + * NOTE: if the page is not initialized (pd_lower == 0), we must + * return zero to ensure sane behavior. Accept double evaluation + * of the argument so that we can ensure this. + */ +#define PageGetMaxOffsetNumber(page) \ + (((PageHeader) (page))->pd_lower <= SizeOfPageHeaderData ? 0 : \ + ((((PageHeader) (page))->pd_lower - SizeOfPageHeaderData) \ + / sizeof(ItemIdData))) + +/* + * Additional macros for access to page headers. (Beware multiple evaluation + * of the arguments!) + */ +#define PageGetLSN(page) \ + PageXLogRecPtrGet(((PageHeader) (page))->pd_lsn) +#define PageSetLSN(page, lsn) \ + PageXLogRecPtrSet(((PageHeader) (page))->pd_lsn, lsn) + +#define PageHasFreeLinePointers(page) \ + (((PageHeader) (page))->pd_flags & PD_HAS_FREE_LINES) +#define PageSetHasFreeLinePointers(page) \ + (((PageHeader) (page))->pd_flags |= PD_HAS_FREE_LINES) +#define PageClearHasFreeLinePointers(page) \ + (((PageHeader) (page))->pd_flags &= ~PD_HAS_FREE_LINES) + +#define PageIsFull(page) \ + (((PageHeader) (page))->pd_flags & PD_PAGE_FULL) +#define PageSetFull(page) \ + (((PageHeader) (page))->pd_flags |= PD_PAGE_FULL) +#define PageClearFull(page) \ + (((PageHeader) (page))->pd_flags &= ~PD_PAGE_FULL) + +#define PageIsAllVisible(page) \ + (((PageHeader) (page))->pd_flags & PD_ALL_VISIBLE) +#define PageSetAllVisible(page) \ + (((PageHeader) (page))->pd_flags |= PD_ALL_VISIBLE) +#define PageClearAllVisible(page) \ + (((PageHeader) (page))->pd_flags &= ~PD_ALL_VISIBLE) + +#define PageSetPrunable(page, xid) \ +do { \ + Assert(TransactionIdIsNormal(xid)); \ + if (!TransactionIdIsValid(((PageHeader) (page))->pd_prune_xid) || \ + TransactionIdPrecedes(xid, ((PageHeader) (page))->pd_prune_xid)) \ + ((PageHeader) (page))->pd_prune_xid = (xid); \ +} while (0) +#define PageClearPrunable(page) \ + (((PageHeader) (page))->pd_prune_xid = InvalidTransactionId) + + +/* ---------------------------------------------------------------- + * extern declarations + * ---------------------------------------------------------------- + */ + +/* flags for PageAddItemExtended() */ +#define PAI_OVERWRITE (1 << 0) +#define PAI_IS_HEAP (1 << 1) + +/* flags for PageIsVerifiedExtended() */ +#define PIV_LOG_WARNING (1 << 0) +#define PIV_REPORT_STAT (1 << 1) + +#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap) \ + PageAddItemExtended(page, item, size, offsetNumber, \ + ((overwrite) ? PAI_OVERWRITE : 0) | \ + ((is_heap) ? PAI_IS_HEAP : 0)) + +#define PageIsVerified(page, blkno) \ + PageIsVerifiedExtended(page, blkno, \ + PIV_LOG_WARNING | PIV_REPORT_STAT) + +/* + * Check that BLCKSZ is a multiple of sizeof(size_t). In + * PageIsVerifiedExtended(), it is much faster to check if a page is + * full of zeroes using the native word size. Note that this assertion + * is kept within a header to make sure that StaticAssertDecl() works + * across various combinations of platforms and compilers. + */ +StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)), + "BLCKSZ has to be a multiple of sizeof(size_t)"); + +extern void PageInit(Page page, Size pageSize, Size specialSize); +extern bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags); +extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size, + OffsetNumber offsetNumber, int flags); +extern Page PageGetTempPage(Page page); +extern Page PageGetTempPageCopy(Page page); +extern Page PageGetTempPageCopySpecial(Page page); +extern void PageRestoreTempPage(Page tempPage, Page oldPage); +extern void PageRepairFragmentation(Page page); +extern void PageTruncateLinePointerArray(Page page); +extern Size PageGetFreeSpace(Page page); +extern Size PageGetFreeSpaceForMultipleTuples(Page page, int ntups); +extern Size PageGetExactFreeSpace(Page page); +extern Size PageGetHeapFreeSpace(Page page); +extern void PageIndexTupleDelete(Page page, OffsetNumber offset); +extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems); +extern void PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offset); +extern bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, + Item newtup, Size newsize); +extern char *PageSetChecksumCopy(Page page, BlockNumber blkno); +extern void PageSetChecksumInplace(Page page, BlockNumber blkno); + +#endif /* BUFPAGE_H */ diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h new file mode 100644 index 0000000..1904fab --- /dev/null +++ b/src/include/storage/checksum.h @@ -0,0 +1,24 @@ +/*------------------------------------------------------------------------- + * + * checksum.h + * Checksum implementation for data pages. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/checksum.h + * + *------------------------------------------------------------------------- + */ +#ifndef CHECKSUM_H +#define CHECKSUM_H + +#include "storage/block.h" + +/* + * Compute the checksum for a Postgres page. The page must be aligned on a + * 4-byte boundary. + */ +extern uint16 pg_checksum_page(char *page, BlockNumber blkno); + +#endif /* CHECKSUM_H */ diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h new file mode 100644 index 0000000..015f0f1 --- /dev/null +++ b/src/include/storage/checksum_impl.h @@ -0,0 +1,215 @@ +/*------------------------------------------------------------------------- + * + * checksum_impl.h + * Checksum implementation for data pages. + * + * This file exists for the benefit of external programs that may wish to + * check Postgres page checksums. They can #include this to get the code + * referenced by storage/checksum.h. (Note: you may need to redefine + * Assert() as empty to compile this successfully externally.) + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/checksum_impl.h + * + *------------------------------------------------------------------------- + */ + +/* + * The algorithm used to checksum pages is chosen for very fast calculation. + * Workloads where the database working set fits into OS file cache but not + * into shared buffers can read in pages at a very fast pace and the checksum + * algorithm itself can become the largest bottleneck. + * + * The checksum algorithm itself is based on the FNV-1a hash (FNV is shorthand + * for Fowler/Noll/Vo). The primitive of a plain FNV-1a hash folds in data 1 + * byte at a time according to the formula: + * + * hash = (hash ^ value) * FNV_PRIME + * + * FNV-1a algorithm is described at http://www.isthe.com/chongo/tech/comp/fnv/ + * + * PostgreSQL doesn't use FNV-1a hash directly because it has bad mixing of + * high bits - high order bits in input data only affect high order bits in + * output data. To resolve this we xor in the value prior to multiplication + * shifted right by 17 bits. The number 17 was chosen because it doesn't + * have common denominator with set bit positions in FNV_PRIME and empirically + * provides the fastest mixing for high order bits of final iterations quickly + * avalanche into lower positions. For performance reasons we choose to combine + * 4 bytes at a time. The actual hash formula used as the basis is: + * + * hash = (hash ^ value) * FNV_PRIME ^ ((hash ^ value) >> 17) + * + * The main bottleneck in this calculation is the multiplication latency. To + * hide the latency and to make use of SIMD parallelism multiple hash values + * are calculated in parallel. The page is treated as a 32 column two + * dimensional array of 32 bit values. Each column is aggregated separately + * into a partial checksum. Each partial checksum uses a different initial + * value (offset basis in FNV terminology). The initial values actually used + * were chosen randomly, as the values themselves don't matter as much as that + * they are different and don't match anything in real data. After initializing + * partial checksums each value in the column is aggregated according to the + * above formula. Finally two more iterations of the formula are performed with + * value 0 to mix the bits of the last value added. + * + * The partial checksums are then folded together using xor to form a single + * 32-bit checksum. The caller can safely reduce the value to 16 bits + * using modulo 2^16-1. That will cause a very slight bias towards lower + * values but this is not significant for the performance of the + * checksum. + * + * The algorithm choice was based on what instructions are available in SIMD + * instruction sets. This meant that a fast and good algorithm needed to use + * multiplication as the main mixing operator. The simplest multiplication + * based checksum primitive is the one used by FNV. The prime used is chosen + * for good dispersion of values. It has no known simple patterns that result + * in collisions. Test of 5-bit differentials of the primitive over 64bit keys + * reveals no differentials with 3 or more values out of 100000 random keys + * colliding. Avalanche test shows that only high order bits of the last word + * have a bias. Tests of 1-4 uncorrelated bit errors, stray 0 and 0xFF bytes, + * overwriting page from random position to end with 0 bytes, and overwriting + * random segments of page with 0x00, 0xFF and random data all show optimal + * 2e-16 false positive rate within margin of error. + * + * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer + * multiplication instruction. As of 2013 the corresponding instruction is + * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32). + * Vectorization requires a compiler to do the vectorization for us. For recent + * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough + * to achieve vectorization. + * + * The optimal amount of parallelism to use depends on CPU specific instruction + * latency, SIMD instruction width, throughput and the amount of registers + * available to hold intermediate state. Generally, more parallelism is better + * up to the point that state doesn't fit in registers and extra load-store + * instructions are needed to swap values in/out. The number chosen is a fixed + * part of the algorithm because changing the parallelism changes the checksum + * result. + * + * The parallelism number 32 was chosen based on the fact that it is the + * largest state that fits into architecturally visible x86 SSE registers while + * leaving some free registers for intermediate values. For future processors + * with 256bit vector registers this will leave some performance on the table. + * When vectorization is not available it might be beneficial to restructure + * the computation to calculate a subset of the columns at a time and perform + * multiple passes to avoid register spilling. This optimization opportunity + * is not used. Current coding also assumes that the compiler has the ability + * to unroll the inner loop to avoid loop overhead and minimize register + * spilling. For less sophisticated compilers it might be beneficial to + * manually unroll the inner loop. + */ + +#include "storage/bufpage.h" + +/* number of checksums to calculate in parallel */ +#define N_SUMS 32 +/* prime multiplier of FNV-1a hash */ +#define FNV_PRIME 16777619 + +/* Use a union so that this code is valid under strict aliasing */ +typedef union +{ + PageHeaderData phdr; + uint32 data[BLCKSZ / (sizeof(uint32) * N_SUMS)][N_SUMS]; +} PGChecksummablePage; + +/* + * Base offsets to initialize each of the parallel FNV hashes into a + * different initial state. + */ +static const uint32 checksumBaseOffsets[N_SUMS] = { + 0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A, + 0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C, + 0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA, + 0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB, + 0xE58F764B, 0x187636BC, 0x5D7B3BB1, 0xE73DE7DE, + 0x92BEC979, 0xCCA6C0B2, 0x304A0979, 0x85AA43D4, + 0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E, + 0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756 +}; + +/* + * Calculate one round of the checksum. + */ +#define CHECKSUM_COMP(checksum, value) \ +do { \ + uint32 __tmp = (checksum) ^ (value); \ + (checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17); \ +} while (0) + +/* + * Block checksum algorithm. The page must be adequately aligned + * (at least on 4-byte boundary). + */ +static uint32 +pg_checksum_block(const PGChecksummablePage *page) +{ + uint32 sums[N_SUMS]; + uint32 result = 0; + uint32 i, + j; + + /* ensure that the size is compatible with the algorithm */ + Assert(sizeof(PGChecksummablePage) == BLCKSZ); + + /* initialize partial checksums to their corresponding offsets */ + memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); + + /* main checksum calculation */ + for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++) + for (j = 0; j < N_SUMS; j++) + CHECKSUM_COMP(sums[j], page->data[i][j]); + + /* finally add in two rounds of zeroes for additional mixing */ + for (i = 0; i < 2; i++) + for (j = 0; j < N_SUMS; j++) + CHECKSUM_COMP(sums[j], 0); + + /* xor fold partial checksums together */ + for (i = 0; i < N_SUMS; i++) + result ^= sums[i]; + + return result; +} + +/* + * Compute the checksum for a Postgres page. + * + * The page must be adequately aligned (at least on a 4-byte boundary). + * Beware also that the checksum field of the page is transiently zeroed. + * + * The checksum includes the block number (to detect the case where a page is + * somehow moved to a different location), the page header (excluding the + * checksum itself), and the page data. + */ +uint16 +pg_checksum_page(char *page, BlockNumber blkno) +{ + PGChecksummablePage *cpage = (PGChecksummablePage *) page; + uint16 save_checksum; + uint32 checksum; + + /* We only calculate the checksum for properly-initialized pages */ + Assert(!PageIsNew(&cpage->phdr)); + + /* + * Save pd_checksum and temporarily set it to zero, so that the checksum + * calculation isn't affected by the old checksum stored on the page. + * Restore it after, because actually updating the checksum is NOT part of + * the API of this function. + */ + save_checksum = cpage->phdr.pd_checksum; + cpage->phdr.pd_checksum = 0; + checksum = pg_checksum_block(cpage); + cpage->phdr.pd_checksum = save_checksum; + + /* Mix in the block number to detect transposed pages */ + checksum ^= blkno; + + /* + * Reduce to a uint16 (to fit in the pd_checksum field) with an offset of + * one. That avoids checksums of zero, which seems like a good idea. + */ + return (uint16) ((checksum % 65535) + 1); +} diff --git a/src/include/storage/condition_variable.h b/src/include/storage/condition_variable.h new file mode 100644 index 0000000..e89175e --- /dev/null +++ b/src/include/storage/condition_variable.h @@ -0,0 +1,73 @@ +/*------------------------------------------------------------------------- + * + * condition_variable.h + * Condition variables + * + * A condition variable is a method of waiting until a certain condition + * becomes true. Conventionally, a condition variable supports three + * operations: (1) sleep; (2) signal, which wakes up one process sleeping + * on the condition variable; and (3) broadcast, which wakes up every + * process sleeping on the condition variable. In our implementation, + * condition variables put a process into an interruptible sleep (so it + * can be canceled prior to the fulfillment of the condition) and do not + * use pointers internally (so that they are safe to use within DSMs). + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/condition_variable.h + * + *------------------------------------------------------------------------- + */ +#ifndef CONDITION_VARIABLE_H +#define CONDITION_VARIABLE_H + +#include "storage/proclist_types.h" +#include "storage/spin.h" + +typedef struct +{ + slock_t mutex; /* spinlock protecting the wakeup list */ + proclist_head wakeup; /* list of wake-able processes */ +} ConditionVariable; + +/* + * Pad a condition variable to a power-of-two size so that an array of + * condition variables does not cross a cache line boundary. + */ +#define CV_MINIMAL_SIZE (sizeof(ConditionVariable) <= 16 ? 16 : 32) +typedef union ConditionVariableMinimallyPadded +{ + ConditionVariable cv; + char pad[CV_MINIMAL_SIZE]; +} ConditionVariableMinimallyPadded; + +/* Initialize a condition variable. */ +extern void ConditionVariableInit(ConditionVariable *cv); + +/* + * To sleep on a condition variable, a process should use a loop which first + * checks the condition, exiting the loop if it is met, and then calls + * ConditionVariableSleep. Spurious wakeups are possible, but should be + * infrequent. After exiting the loop, ConditionVariableCancelSleep must + * be called to ensure that the process is no longer in the wait list for + * the condition variable. + */ +extern void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info); +extern bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, + uint32 wait_event_info); +extern void ConditionVariableCancelSleep(void); + +/* + * Optionally, ConditionVariablePrepareToSleep can be called before entering + * the test-and-sleep loop described above. Doing so is more efficient if + * at least one sleep is needed, whereas not doing so is more efficient when + * no sleep is needed because the test condition is true the first time. + */ +extern void ConditionVariablePrepareToSleep(ConditionVariable *cv); + +/* Wake up a single waiter (via signal) or all waiters (via broadcast). */ +extern void ConditionVariableSignal(ConditionVariable *cv); +extern void ConditionVariableBroadcast(ConditionVariable *cv); + +#endif /* CONDITION_VARIABLE_H */ diff --git a/src/include/storage/copydir.h b/src/include/storage/copydir.h new file mode 100644 index 0000000..50a26ed --- /dev/null +++ b/src/include/storage/copydir.h @@ -0,0 +1,19 @@ +/*------------------------------------------------------------------------- + * + * copydir.h + * Copy a directory. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/copydir.h + * + *------------------------------------------------------------------------- + */ +#ifndef COPYDIR_H +#define COPYDIR_H + +extern void copydir(char *fromdir, char *todir, bool recurse); +extern void copy_file(char *fromfile, char *tofile); + +#endif /* COPYDIR_H */ diff --git a/src/include/storage/dsm.h b/src/include/storage/dsm.h new file mode 100644 index 0000000..4dd6af2 --- /dev/null +++ b/src/include/storage/dsm.h @@ -0,0 +1,64 @@ +/*------------------------------------------------------------------------- + * + * dsm.h + * manage dynamic shared memory segments + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/dsm.h + * + *------------------------------------------------------------------------- + */ +#ifndef DSM_H +#define DSM_H + +#include "storage/dsm_impl.h" + +typedef struct dsm_segment dsm_segment; + +#define DSM_CREATE_NULL_IF_MAXSEGMENTS 0x0001 + +/* A sentinel value for an invalid DSM handle. */ +#define DSM_HANDLE_INVALID 0 + +/* Startup and shutdown functions. */ +struct PGShmemHeader; /* avoid including pg_shmem.h */ +extern void dsm_cleanup_using_control_segment(dsm_handle old_control_handle); +extern void dsm_postmaster_startup(struct PGShmemHeader *); +extern void dsm_backend_shutdown(void); +extern void dsm_detach_all(void); + +extern size_t dsm_estimate_size(void); +extern void dsm_shmem_init(void); + +#ifdef EXEC_BACKEND +extern void dsm_set_control_handle(dsm_handle h); +#endif + +/* Functions that create or remove mappings. */ +extern dsm_segment *dsm_create(Size size, int flags); +extern dsm_segment *dsm_attach(dsm_handle h); +extern void dsm_detach(dsm_segment *seg); + +/* Resource management functions. */ +extern void dsm_pin_mapping(dsm_segment *seg); +extern void dsm_unpin_mapping(dsm_segment *seg); +extern void dsm_pin_segment(dsm_segment *seg); +extern void dsm_unpin_segment(dsm_handle h); +extern dsm_segment *dsm_find_mapping(dsm_handle h); + +/* Informational functions. */ +extern void *dsm_segment_address(dsm_segment *seg); +extern Size dsm_segment_map_length(dsm_segment *seg); +extern dsm_handle dsm_segment_handle(dsm_segment *seg); + +/* Cleanup hooks. */ +typedef void (*on_dsm_detach_callback) (dsm_segment *, Datum arg); +extern void on_dsm_detach(dsm_segment *seg, + on_dsm_detach_callback function, Datum arg); +extern void cancel_on_dsm_detach(dsm_segment *seg, + on_dsm_detach_callback function, Datum arg); +extern void reset_on_dsm_detach(void); + +#endif /* DSM_H */ diff --git a/src/include/storage/dsm_impl.h b/src/include/storage/dsm_impl.h new file mode 100644 index 0000000..c51584d --- /dev/null +++ b/src/include/storage/dsm_impl.h @@ -0,0 +1,76 @@ +/*------------------------------------------------------------------------- + * + * dsm_impl.h + * low-level dynamic shared memory primitives + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/dsm_impl.h + * + *------------------------------------------------------------------------- + */ +#ifndef DSM_IMPL_H +#define DSM_IMPL_H + +/* Dynamic shared memory implementations. */ +#define DSM_IMPL_POSIX 1 +#define DSM_IMPL_SYSV 2 +#define DSM_IMPL_WINDOWS 3 +#define DSM_IMPL_MMAP 4 + +/* + * Determine which dynamic shared memory implementations will be supported + * on this platform, and which one will be the default. + */ +#ifdef WIN32 +#define USE_DSM_WINDOWS +#define DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE DSM_IMPL_WINDOWS +#else +#ifdef HAVE_SHM_OPEN +#define USE_DSM_POSIX +#define DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE DSM_IMPL_POSIX +#endif +#define USE_DSM_SYSV +#ifndef DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE +#define DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE DSM_IMPL_SYSV +#endif +#define USE_DSM_MMAP +#endif + +/* GUC. */ +extern PGDLLIMPORT int dynamic_shared_memory_type; +extern PGDLLIMPORT int min_dynamic_shared_memory; + +/* + * Directory for on-disk state. + * + * This is used by all implementations for crash recovery and by the mmap + * implementation for storage. + */ +#define PG_DYNSHMEM_DIR "pg_dynshmem" +#define PG_DYNSHMEM_MMAP_FILE_PREFIX "mmap." + +/* A "name" for a dynamic shared memory segment. */ +typedef uint32 dsm_handle; + +/* All the shared-memory operations we know about. */ +typedef enum +{ + DSM_OP_CREATE, + DSM_OP_ATTACH, + DSM_OP_DETACH, + DSM_OP_DESTROY +} dsm_op; + +/* Create, attach to, detach from, resize, or destroy a segment. */ +extern bool dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, Size *mapped_size, + int elevel); + +/* Implementation-dependent actions required to keep segment until shutdown. */ +extern void dsm_impl_pin_segment(dsm_handle handle, void *impl_private, + void **impl_private_pm_handle); +extern void dsm_impl_unpin_segment(dsm_handle handle, void **impl_private); + +#endif /* DSM_IMPL_H */ diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h new file mode 100644 index 0000000..69549b0 --- /dev/null +++ b/src/include/storage/fd.h @@ -0,0 +1,198 @@ +/*------------------------------------------------------------------------- + * + * fd.h + * Virtual file descriptor definitions. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/fd.h + * + *------------------------------------------------------------------------- + */ + +/* + * calls: + * + * File {Close, Read, Write, Size, Sync} + * {Path Name Open, Allocate, Free} File + * + * These are NOT JUST RENAMINGS OF THE UNIX ROUTINES. + * Use them for all file activity... + * + * File fd; + * fd = PathNameOpenFile("foo", O_RDONLY); + * + * AllocateFile(); + * FreeFile(); + * + * Use AllocateFile, not fopen, if you need a stdio file (FILE*); then + * use FreeFile, not fclose, to close it. AVOID using stdio for files + * that you intend to hold open for any length of time, since there is + * no way for them to share kernel file descriptors with other files. + * + * Likewise, use AllocateDir/FreeDir, not opendir/closedir, to allocate + * open directories (DIR*), and OpenTransientFile/CloseTransientFile for an + * unbuffered file descriptor. + * + * If you really can't use any of the above, at least call AcquireExternalFD + * or ReserveExternalFD to report any file descriptors that are held for any + * length of time. Failure to do so risks unnecessary EMFILE errors. + */ +#ifndef FD_H +#define FD_H + +#include <dirent.h> + +typedef enum RecoveryInitSyncMethod +{ + RECOVERY_INIT_SYNC_METHOD_FSYNC, + RECOVERY_INIT_SYNC_METHOD_SYNCFS +} RecoveryInitSyncMethod; + +struct iovec; /* avoid including port/pg_iovec.h here */ + +typedef int File; + + +/* GUC parameter */ +extern PGDLLIMPORT int max_files_per_process; +extern PGDLLIMPORT bool data_sync_retry; +extern PGDLLIMPORT int recovery_init_sync_method; + +/* + * This is private to fd.c, but exported for save/restore_backend_variables() + */ +extern PGDLLIMPORT int max_safe_fds; + +/* + * On Windows, we have to interpret EACCES as possibly meaning the same as + * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform, + * that's what you get. Ugh. This code is designed so that we don't + * actually believe these cases are okay without further evidence (namely, + * a pending fsync request getting canceled ... see ProcessSyncRequests). + */ +#ifndef WIN32 +#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT) +#else +#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT || (err) == EACCES) +#endif + +/* + * O_DIRECT is not standard, but almost every Unix has it. We translate it + * to the appropriate Windows flag in src/port/open.c. We simulate it with + * fcntl(F_NOCACHE) on macOS inside fd.c's open() wrapper. We use the name + * PG_O_DIRECT rather than defining O_DIRECT in that case (probably not a good + * idea on a Unix). + */ +#if defined(O_DIRECT) +#define PG_O_DIRECT O_DIRECT +#elif defined(F_NOCACHE) +#define PG_O_DIRECT 0x80000000 +#define PG_O_DIRECT_USE_F_NOCACHE +#else +#define PG_O_DIRECT 0 +#endif + +/* + * prototypes for functions in fd.c + */ + +/* Operations on virtual Files --- equivalent to Unix kernel file ops */ +extern File PathNameOpenFile(const char *fileName, int fileFlags); +extern File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode); +extern File OpenTemporaryFile(bool interXact); +extern void FileClose(File file); +extern int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info); +extern int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info); +extern int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info); +extern int FileSync(File file, uint32 wait_event_info); +extern off_t FileSize(File file); +extern int FileTruncate(File file, off_t offset, uint32 wait_event_info); +extern void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info); +extern char *FilePathName(File file); +extern int FileGetRawDesc(File file); +extern int FileGetRawFlags(File file); +extern mode_t FileGetRawMode(File file); + +/* Operations used for sharing named temporary files */ +extern File PathNameCreateTemporaryFile(const char *name, bool error_on_failure); +extern File PathNameOpenTemporaryFile(const char *path, int mode); +extern bool PathNameDeleteTemporaryFile(const char *name, bool error_on_failure); +extern void PathNameCreateTemporaryDir(const char *base, const char *name); +extern void PathNameDeleteTemporaryDir(const char *name); +extern void TempTablespacePath(char *path, Oid tablespace); + +/* Operations that allow use of regular stdio --- USE WITH CAUTION */ +extern FILE *AllocateFile(const char *name, const char *mode); +extern int FreeFile(FILE *file); + +/* Operations that allow use of pipe streams (popen/pclose) */ +extern FILE *OpenPipeStream(const char *command, const char *mode); +extern int ClosePipeStream(FILE *file); + +/* Operations to allow use of the <dirent.h> library routines */ +extern DIR *AllocateDir(const char *dirname); +extern struct dirent *ReadDir(DIR *dir, const char *dirname); +extern struct dirent *ReadDirExtended(DIR *dir, const char *dirname, + int elevel); +extern int FreeDir(DIR *dir); + +/* Operations to allow use of a plain kernel FD, with automatic cleanup */ +extern int OpenTransientFile(const char *fileName, int fileFlags); +extern int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode); +extern int CloseTransientFile(int fd); + +/* If you've really really gotta have a plain kernel FD, use this */ +extern int BasicOpenFile(const char *fileName, int fileFlags); +extern int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode); + +/* Use these for other cases, and also for long-lived BasicOpenFile FDs */ +extern bool AcquireExternalFD(void); +extern void ReserveExternalFD(void); +extern void ReleaseExternalFD(void); + +/* Make a directory with default permissions */ +extern int MakePGDirectory(const char *directoryName); + +/* Miscellaneous support routines */ +extern void InitFileAccess(void); +extern void InitTemporaryFileAccess(void); +extern void set_max_safe_fds(void); +extern void closeAllVfds(void); +extern void SetTempTablespaces(Oid *tableSpaces, int numSpaces); +extern bool TempTablespacesAreSet(void); +extern int GetTempTablespaces(Oid *tableSpaces, int numSpaces); +extern Oid GetNextTempTableSpace(void); +extern void AtEOXact_Files(bool isCommit); +extern void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, + SubTransactionId parentSubid); +extern void RemovePgTempFiles(void); +extern void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, + bool unlink_all); +extern bool looks_like_temp_rel_name(const char *name); + +extern int pg_fsync(int fd); +extern int pg_fsync_no_writethrough(int fd); +extern int pg_fsync_writethrough(int fd); +extern int pg_fdatasync(int fd); +extern void pg_flush_data(int fd, off_t offset, off_t amount); +extern ssize_t pg_pwritev_with_retry(int fd, + const struct iovec *iov, + int iovcnt, + off_t offset); +extern int pg_truncate(const char *path, off_t length); +extern void fsync_fname(const char *fname, bool isdir); +extern int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel); +extern int durable_rename(const char *oldfile, const char *newfile, int loglevel); +extern int durable_unlink(const char *fname, int loglevel); +extern int durable_rename_excl(const char *oldfile, const char *newfile, int loglevel); +extern void SyncDataDirectory(void); +extern int data_sync_elevel(int elevel); + +/* Filename components */ +#define PG_TEMP_FILES_DIR "pgsql_tmp" +#define PG_TEMP_FILE_PREFIX "pgsql_tmp" + +#endif /* FD_H */ diff --git a/src/include/storage/fileset.h b/src/include/storage/fileset.h new file mode 100644 index 0000000..ad37884 --- /dev/null +++ b/src/include/storage/fileset.h @@ -0,0 +1,40 @@ +/*------------------------------------------------------------------------- + * + * fileset.h + * Management of named temporary files. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/fileset.h + * + *------------------------------------------------------------------------- + */ + +#ifndef FILESET_H +#define FILESET_H + +#include "storage/fd.h" + +/* + * A set of temporary files. + */ +typedef struct FileSet +{ + pid_t creator_pid; /* PID of the creating process */ + uint32 number; /* per-PID identifier */ + int ntablespaces; /* number of tablespaces to use */ + Oid tablespaces[8]; /* OIDs of tablespaces to use. Assumes that + * it's rare that there more than temp + * tablespaces. */ +} FileSet; + +extern void FileSetInit(FileSet *fileset); +extern File FileSetCreate(FileSet *fileset, const char *name); +extern File FileSetOpen(FileSet *fileset, const char *name, + int mode); +extern bool FileSetDelete(FileSet *fileset, const char *name, + bool error_on_failure); +extern void FileSetDeleteAll(FileSet *fileset); + +#endif diff --git a/src/include/storage/freespace.h b/src/include/storage/freespace.h new file mode 100644 index 0000000..dcc40eb --- /dev/null +++ b/src/include/storage/freespace.h @@ -0,0 +1,39 @@ +/*------------------------------------------------------------------------- + * + * freespace.h + * POSTGRES free space map for quickly finding free space in relations + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/freespace.h + * + *------------------------------------------------------------------------- + */ +#ifndef FREESPACE_H_ +#define FREESPACE_H_ + +#include "storage/block.h" +#include "storage/relfilenode.h" +#include "utils/relcache.h" + +/* prototypes for public functions in freespace.c */ +extern Size GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk); +extern BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded); +extern BlockNumber RecordAndGetPageWithFreeSpace(Relation rel, + BlockNumber oldPage, + Size oldSpaceAvail, + Size spaceNeeded); +extern void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, + Size spaceAvail); +extern void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk, + Size spaceAvail); + +extern BlockNumber FreeSpaceMapPrepareTruncateRel(Relation rel, + BlockNumber nblocks); +extern void FreeSpaceMapVacuum(Relation rel); +extern void FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, + BlockNumber end); + +#endif /* FREESPACE_H_ */ diff --git a/src/include/storage/fsm_internals.h b/src/include/storage/fsm_internals.h new file mode 100644 index 0000000..a6f8372 --- /dev/null +++ b/src/include/storage/fsm_internals.h @@ -0,0 +1,72 @@ +/*------------------------------------------------------------------------- + * + * fsm_internals.h + * internal functions for free space map + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/fsm_internals.h + * + *------------------------------------------------------------------------- + */ +#ifndef FSM_INTERNALS_H +#define FSM_INTERNALS_H + +#include "storage/buf.h" +#include "storage/bufpage.h" + +/* + * Structure of a FSM page. See src/backend/storage/freespace/README for + * details. + */ +typedef struct +{ + /* + * fsm_search_avail() tries to spread the load of multiple backends by + * returning different pages to different backends in a round-robin + * fashion. fp_next_slot points to the next slot to be returned (assuming + * there's enough space on it for the request). It's defined as an int, + * because it's updated without an exclusive lock. uint16 would be more + * appropriate, but int is more likely to be atomically + * fetchable/storable. + */ + int fp_next_slot; + + /* + * fp_nodes contains the binary tree, stored in array. The first + * NonLeafNodesPerPage elements are upper nodes, and the following + * LeafNodesPerPage elements are leaf nodes. Unused nodes are zero. + */ + uint8 fp_nodes[FLEXIBLE_ARRAY_MEMBER]; +} FSMPageData; + +typedef FSMPageData *FSMPage; + +/* + * Number of non-leaf and leaf nodes, and nodes in total, on an FSM page. + * These definitions are internal to fsmpage.c. + */ +#define NodesPerPage (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \ + offsetof(FSMPageData, fp_nodes)) + +#define NonLeafNodesPerPage (BLCKSZ / 2 - 1) +#define LeafNodesPerPage (NodesPerPage - NonLeafNodesPerPage) + +/* + * Number of FSM "slots" on a FSM page. This is what should be used + * outside fsmpage.c. + */ +#define SlotsPerFSMPage LeafNodesPerPage + +/* Prototypes for functions in fsmpage.c */ +extern int fsm_search_avail(Buffer buf, uint8 min_cat, bool advancenext, + bool exclusive_lock_held); +extern uint8 fsm_get_avail(Page page, int slot); +extern uint8 fsm_get_max_avail(Page page); +extern bool fsm_set_avail(Page page, int slot, uint8 value); +extern bool fsm_truncate_avail(Page page, int nslots); +extern bool fsm_rebuild_page(Page page); + +#endif /* FSM_INTERNALS_H */ diff --git a/src/include/storage/indexfsm.h b/src/include/storage/indexfsm.h new file mode 100644 index 0000000..04c1a05 --- /dev/null +++ b/src/include/storage/indexfsm.h @@ -0,0 +1,26 @@ +/*------------------------------------------------------------------------- + * + * indexfsm.h + * POSTGRES free space map for quickly finding an unused page in index + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/indexfsm.h + * + *------------------------------------------------------------------------- + */ +#ifndef INDEXFSM_H_ +#define INDEXFSM_H_ + +#include "storage/block.h" +#include "utils/relcache.h" + +extern BlockNumber GetFreeIndexPage(Relation rel); +extern void RecordFreeIndexPage(Relation rel, BlockNumber page); +extern void RecordUsedIndexPage(Relation rel, BlockNumber page); + +extern void IndexFreeSpaceMapVacuum(Relation rel); + +#endif /* INDEXFSM_H_ */ diff --git a/src/include/storage/ipc.h b/src/include/storage/ipc.h new file mode 100644 index 0000000..fade4db --- /dev/null +++ b/src/include/storage/ipc.h @@ -0,0 +1,84 @@ +/*------------------------------------------------------------------------- + * + * ipc.h + * POSTGRES inter-process communication definitions. + * + * This file is misnamed, as it no longer has much of anything directly + * to do with IPC. The functionality here is concerned with managing + * exit-time cleanup for either a postmaster or a backend. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/ipc.h + * + *------------------------------------------------------------------------- + */ +#ifndef IPC_H +#define IPC_H + +typedef void (*pg_on_exit_callback) (int code, Datum arg); +typedef void (*shmem_startup_hook_type) (void); + +/*---------- + * API for handling cleanup that must occur during either ereport(ERROR) + * or ereport(FATAL) exits from a block of code. (Typical examples are + * undoing transient changes to shared-memory state.) + * + * PG_ENSURE_ERROR_CLEANUP(cleanup_function, arg); + * { + * ... code that might throw ereport(ERROR) or ereport(FATAL) ... + * } + * PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg); + * + * where the cleanup code is in a function declared per pg_on_exit_callback. + * The Datum value "arg" can carry any information the cleanup function + * needs. + * + * This construct ensures that cleanup_function() will be called during + * either ERROR or FATAL exits. It will not be called on successful + * exit from the controlled code. (If you want it to happen then too, + * call the function yourself from just after the construct.) + * + * Note: the macro arguments are multiply evaluated, so avoid side-effects. + *---------- + */ +#define PG_ENSURE_ERROR_CLEANUP(cleanup_function, arg) \ + do { \ + before_shmem_exit(cleanup_function, arg); \ + PG_TRY() + +#define PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg) \ + cancel_before_shmem_exit(cleanup_function, arg); \ + PG_CATCH(); \ + { \ + cancel_before_shmem_exit(cleanup_function, arg); \ + cleanup_function (0, arg); \ + PG_RE_THROW(); \ + } \ + PG_END_TRY(); \ + } while (0) + + +/* ipc.c */ +extern PGDLLIMPORT bool proc_exit_inprogress; +extern PGDLLIMPORT bool shmem_exit_inprogress; + +extern void proc_exit(int code) pg_attribute_noreturn(); +extern void shmem_exit(int code); +extern void on_proc_exit(pg_on_exit_callback function, Datum arg); +extern void on_shmem_exit(pg_on_exit_callback function, Datum arg); +extern void before_shmem_exit(pg_on_exit_callback function, Datum arg); +extern void cancel_before_shmem_exit(pg_on_exit_callback function, Datum arg); +extern void on_exit_reset(void); +extern void check_on_shmem_exit_lists_are_empty(void); + +/* ipci.c */ +extern PGDLLIMPORT shmem_startup_hook_type shmem_startup_hook; + +extern Size CalculateShmemSize(int *num_semaphores); +extern void CreateSharedMemoryAndSemaphores(void); +extern void InitializeShmemGUCs(void); + +#endif /* IPC_H */ diff --git a/src/include/storage/item.h b/src/include/storage/item.h new file mode 100644 index 0000000..6f3eaeb --- /dev/null +++ b/src/include/storage/item.h @@ -0,0 +1,19 @@ +/*------------------------------------------------------------------------- + * + * item.h + * POSTGRES disk item definitions. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/item.h + * + *------------------------------------------------------------------------- + */ +#ifndef ITEM_H +#define ITEM_H + +typedef Pointer Item; + +#endif /* ITEM_H */ diff --git a/src/include/storage/itemid.h b/src/include/storage/itemid.h new file mode 100644 index 0000000..e33637f --- /dev/null +++ b/src/include/storage/itemid.h @@ -0,0 +1,184 @@ +/*------------------------------------------------------------------------- + * + * itemid.h + * Standard POSTGRES buffer page item identifier/line pointer definitions. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/itemid.h + * + *------------------------------------------------------------------------- + */ +#ifndef ITEMID_H +#define ITEMID_H + +/* + * A line pointer on a buffer page. See buffer page definitions and comments + * for an explanation of how line pointers are used. + * + * In some cases a line pointer is "in use" but does not have any associated + * storage on the page. By convention, lp_len == 0 in every line pointer + * that does not have storage, independently of its lp_flags state. + */ +typedef struct ItemIdData +{ + unsigned lp_off:15, /* offset to tuple (from start of page) */ + lp_flags:2, /* state of line pointer, see below */ + lp_len:15; /* byte length of tuple */ +} ItemIdData; + +typedef ItemIdData *ItemId; + +/* + * lp_flags has these possible states. An UNUSED line pointer is available + * for immediate re-use, the other states are not. + */ +#define LP_UNUSED 0 /* unused (should always have lp_len=0) */ +#define LP_NORMAL 1 /* used (should always have lp_len>0) */ +#define LP_REDIRECT 2 /* HOT redirect (should have lp_len=0) */ +#define LP_DEAD 3 /* dead, may or may not have storage */ + +/* + * Item offsets and lengths are represented by these types when + * they're not actually stored in an ItemIdData. + */ +typedef uint16 ItemOffset; +typedef uint16 ItemLength; + + +/* ---------------- + * support macros + * ---------------- + */ + +/* + * ItemIdGetLength + */ +#define ItemIdGetLength(itemId) \ + ((itemId)->lp_len) + +/* + * ItemIdGetOffset + */ +#define ItemIdGetOffset(itemId) \ + ((itemId)->lp_off) + +/* + * ItemIdGetFlags + */ +#define ItemIdGetFlags(itemId) \ + ((itemId)->lp_flags) + +/* + * ItemIdGetRedirect + * In a REDIRECT pointer, lp_off holds offset number for next line pointer + */ +#define ItemIdGetRedirect(itemId) \ + ((itemId)->lp_off) + +/* + * ItemIdIsValid + * True iff item identifier is valid. + * This is a pretty weak test, probably useful only in Asserts. + */ +#define ItemIdIsValid(itemId) PointerIsValid(itemId) + +/* + * ItemIdIsUsed + * True iff item identifier is in use. + */ +#define ItemIdIsUsed(itemId) \ + ((itemId)->lp_flags != LP_UNUSED) + +/* + * ItemIdIsNormal + * True iff item identifier is in state NORMAL. + */ +#define ItemIdIsNormal(itemId) \ + ((itemId)->lp_flags == LP_NORMAL) + +/* + * ItemIdIsRedirected + * True iff item identifier is in state REDIRECT. + */ +#define ItemIdIsRedirected(itemId) \ + ((itemId)->lp_flags == LP_REDIRECT) + +/* + * ItemIdIsDead + * True iff item identifier is in state DEAD. + */ +#define ItemIdIsDead(itemId) \ + ((itemId)->lp_flags == LP_DEAD) + +/* + * ItemIdHasStorage + * True iff item identifier has associated storage. + */ +#define ItemIdHasStorage(itemId) \ + ((itemId)->lp_len != 0) + +/* + * ItemIdSetUnused + * Set the item identifier to be UNUSED, with no storage. + * Beware of multiple evaluations of itemId! + */ +#define ItemIdSetUnused(itemId) \ +( \ + (itemId)->lp_flags = LP_UNUSED, \ + (itemId)->lp_off = 0, \ + (itemId)->lp_len = 0 \ +) + +/* + * ItemIdSetNormal + * Set the item identifier to be NORMAL, with the specified storage. + * Beware of multiple evaluations of itemId! + */ +#define ItemIdSetNormal(itemId, off, len) \ +( \ + (itemId)->lp_flags = LP_NORMAL, \ + (itemId)->lp_off = (off), \ + (itemId)->lp_len = (len) \ +) + +/* + * ItemIdSetRedirect + * Set the item identifier to be REDIRECT, with the specified link. + * Beware of multiple evaluations of itemId! + */ +#define ItemIdSetRedirect(itemId, link) \ +( \ + (itemId)->lp_flags = LP_REDIRECT, \ + (itemId)->lp_off = (link), \ + (itemId)->lp_len = 0 \ +) + +/* + * ItemIdSetDead + * Set the item identifier to be DEAD, with no storage. + * Beware of multiple evaluations of itemId! + */ +#define ItemIdSetDead(itemId) \ +( \ + (itemId)->lp_flags = LP_DEAD, \ + (itemId)->lp_off = 0, \ + (itemId)->lp_len = 0 \ +) + +/* + * ItemIdMarkDead + * Set the item identifier to be DEAD, keeping its existing storage. + * + * Note: in indexes, this is used as if it were a hint-bit mechanism; + * we trust that multiple processors can do this in parallel and get + * the same result. + */ +#define ItemIdMarkDead(itemId) \ +( \ + (itemId)->lp_flags = LP_DEAD \ +) + +#endif /* ITEMID_H */ diff --git a/src/include/storage/itemptr.h b/src/include/storage/itemptr.h new file mode 100644 index 0000000..81947bc --- /dev/null +++ b/src/include/storage/itemptr.h @@ -0,0 +1,208 @@ +/*------------------------------------------------------------------------- + * + * itemptr.h + * POSTGRES disk item pointer definitions. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/itemptr.h + * + *------------------------------------------------------------------------- + */ +#ifndef ITEMPTR_H +#define ITEMPTR_H + +#include "storage/block.h" +#include "storage/off.h" + +/* + * ItemPointer: + * + * This is a pointer to an item within a disk page of a known file + * (for example, a cross-link from an index to its parent table). + * ip_blkid tells us which block, ip_posid tells us which entry in + * the linp (ItemIdData) array we want. + * + * Note: because there is an item pointer in each tuple header and index + * tuple header on disk, it's very important not to waste space with + * structure padding bytes. The struct is designed to be six bytes long + * (it contains three int16 fields) but a few compilers will pad it to + * eight bytes unless coerced. We apply appropriate persuasion where + * possible. If your compiler can't be made to play along, you'll waste + * lots of space. + */ +typedef struct ItemPointerData +{ + BlockIdData ip_blkid; + OffsetNumber ip_posid; +} + +/* If compiler understands packed and aligned pragmas, use those */ +#if defined(pg_attribute_packed) && defined(pg_attribute_aligned) + pg_attribute_packed() + pg_attribute_aligned(2) +#endif +ItemPointerData; + +typedef ItemPointerData *ItemPointer; + +/* ---------------- + * special values used in heap tuples (t_ctid) + * ---------------- + */ + +/* + * If a heap tuple holds a speculative insertion token rather than a real + * TID, ip_posid is set to SpecTokenOffsetNumber, and the token is stored in + * ip_blkid. SpecTokenOffsetNumber must be higher than MaxOffsetNumber, so + * that it can be distinguished from a valid offset number in a regular item + * pointer. + */ +#define SpecTokenOffsetNumber 0xfffe + +/* + * When a tuple is moved to a different partition by UPDATE, the t_ctid of + * the old tuple version is set to this magic value. + */ +#define MovedPartitionsOffsetNumber 0xfffd +#define MovedPartitionsBlockNumber InvalidBlockNumber + + +/* ---------------- + * support macros + * ---------------- + */ + +/* + * ItemPointerIsValid + * True iff the disk item pointer is not NULL. + */ +#define ItemPointerIsValid(pointer) \ + ((bool) (PointerIsValid(pointer) && ((pointer)->ip_posid != 0))) + +/* + * ItemPointerGetBlockNumberNoCheck + * Returns the block number of a disk item pointer. + */ +#define ItemPointerGetBlockNumberNoCheck(pointer) \ +( \ + BlockIdGetBlockNumber(&(pointer)->ip_blkid) \ +) + +/* + * ItemPointerGetBlockNumber + * As above, but verifies that the item pointer looks valid. + */ +#define ItemPointerGetBlockNumber(pointer) \ +( \ + AssertMacro(ItemPointerIsValid(pointer)), \ + ItemPointerGetBlockNumberNoCheck(pointer) \ +) + +/* + * ItemPointerGetOffsetNumberNoCheck + * Returns the offset number of a disk item pointer. + */ +#define ItemPointerGetOffsetNumberNoCheck(pointer) \ +( \ + (pointer)->ip_posid \ +) + +/* + * ItemPointerGetOffsetNumber + * As above, but verifies that the item pointer looks valid. + */ +#define ItemPointerGetOffsetNumber(pointer) \ +( \ + AssertMacro(ItemPointerIsValid(pointer)), \ + ItemPointerGetOffsetNumberNoCheck(pointer) \ +) + +/* + * ItemPointerSet + * Sets a disk item pointer to the specified block and offset. + */ +#define ItemPointerSet(pointer, blockNumber, offNum) \ +( \ + AssertMacro(PointerIsValid(pointer)), \ + BlockIdSet(&((pointer)->ip_blkid), blockNumber), \ + (pointer)->ip_posid = offNum \ +) + +/* + * ItemPointerSetBlockNumber + * Sets a disk item pointer to the specified block. + */ +#define ItemPointerSetBlockNumber(pointer, blockNumber) \ +( \ + AssertMacro(PointerIsValid(pointer)), \ + BlockIdSet(&((pointer)->ip_blkid), blockNumber) \ +) + +/* + * ItemPointerSetOffsetNumber + * Sets a disk item pointer to the specified offset. + */ +#define ItemPointerSetOffsetNumber(pointer, offsetNumber) \ +( \ + AssertMacro(PointerIsValid(pointer)), \ + (pointer)->ip_posid = (offsetNumber) \ +) + +/* + * ItemPointerCopy + * Copies the contents of one disk item pointer to another. + * + * Should there ever be padding in an ItemPointer this would need to be handled + * differently as it's used as hash key. + */ +#define ItemPointerCopy(fromPointer, toPointer) \ +( \ + AssertMacro(PointerIsValid(toPointer)), \ + AssertMacro(PointerIsValid(fromPointer)), \ + *(toPointer) = *(fromPointer) \ +) + +/* + * ItemPointerSetInvalid + * Sets a disk item pointer to be invalid. + */ +#define ItemPointerSetInvalid(pointer) \ +( \ + AssertMacro(PointerIsValid(pointer)), \ + BlockIdSet(&((pointer)->ip_blkid), InvalidBlockNumber), \ + (pointer)->ip_posid = InvalidOffsetNumber \ +) + +/* + * ItemPointerIndicatesMovedPartitions + * True iff the block number indicates the tuple has moved to another + * partition. + */ +#define ItemPointerIndicatesMovedPartitions(pointer) \ +( \ + ItemPointerGetOffsetNumber(pointer) == MovedPartitionsOffsetNumber && \ + ItemPointerGetBlockNumberNoCheck(pointer) == MovedPartitionsBlockNumber \ +) + +/* + * ItemPointerSetMovedPartitions + * Indicate that the item referenced by the itempointer has moved into a + * different partition. + */ +#define ItemPointerSetMovedPartitions(pointer) \ + ItemPointerSet((pointer), MovedPartitionsBlockNumber, MovedPartitionsOffsetNumber) + +/* ---------------- + * externs + * ---------------- + */ + +extern bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2); +extern int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2); +extern void ItemPointerInc(ItemPointer pointer); +extern void ItemPointerDec(ItemPointer pointer); + +#endif /* ITEMPTR_H */ diff --git a/src/include/storage/large_object.h b/src/include/storage/large_object.h new file mode 100644 index 0000000..b826a7d --- /dev/null +++ b/src/include/storage/large_object.h @@ -0,0 +1,100 @@ +/*------------------------------------------------------------------------- + * + * large_object.h + * Declarations for PostgreSQL large objects. POSTGRES 4.2 supported + * zillions of large objects (internal, external, jaquith, inversion). + * Now we only support inversion. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/large_object.h + * + *------------------------------------------------------------------------- + */ +#ifndef LARGE_OBJECT_H +#define LARGE_OBJECT_H + +#include "utils/snapshot.h" + + +/*---------- + * Data about a currently-open large object. + * + * id is the logical OID of the large object + * snapshot is the snapshot to use for read/write operations + * subid is the subtransaction that opened the desc (or currently owns it) + * offset is the current seek offset within the LO + * flags contains some flag bits + * + * NOTE: as of v11, permission checks are made when the large object is + * opened; therefore IFS_RDLOCK/IFS_WRLOCK indicate that read or write mode + * has been requested *and* the corresponding permission has been checked. + * + * NOTE: before 7.1, we also had to store references to the separate table + * and index of a specific large object. Now they all live in pg_largeobject + * and are accessed via a common relation descriptor. + *---------- + */ +typedef struct LargeObjectDesc +{ + Oid id; /* LO's identifier */ + Snapshot snapshot; /* snapshot to use */ + SubTransactionId subid; /* owning subtransaction ID */ + uint64 offset; /* current seek pointer */ + int flags; /* see flag bits below */ + +/* bits in flags: */ +#define IFS_RDLOCK (1 << 0) /* LO was opened for reading */ +#define IFS_WRLOCK (1 << 1) /* LO was opened for writing */ + +} LargeObjectDesc; + + +/* + * Each "page" (tuple) of a large object can hold this much data + * + * We could set this as high as BLCKSZ less some overhead, but it seems + * better to make it a smaller value, so that not as much space is used + * up when a page-tuple is updated. Note that the value is deliberately + * chosen large enough to trigger the tuple toaster, so that we will + * attempt to compress page tuples in-line. (But they won't be moved off + * unless the user creates a toast-table for pg_largeobject...) + * + * Also, it seems to be a smart move to make the page size be a power of 2, + * since clients will often be written to send data in power-of-2 blocks. + * This avoids unnecessary tuple updates caused by partial-page writes. + * + * NB: Changing LOBLKSIZE requires an initdb. + */ +#define LOBLKSIZE (BLCKSZ / 4) + +/* + * Maximum length in bytes for a large object. To make this larger, we'd + * have to widen pg_largeobject.pageno as well as various internal variables. + */ +#define MAX_LARGE_OBJECT_SIZE ((int64) INT_MAX * LOBLKSIZE) + + +/* + * GUC: backwards-compatibility flag to suppress LO permission checks + */ +extern PGDLLIMPORT bool lo_compat_privileges; + +/* + * Function definitions... + */ + +/* inversion stuff in inv_api.c */ +extern void close_lo_relation(bool isCommit); +extern Oid inv_create(Oid lobjId); +extern LargeObjectDesc *inv_open(Oid lobjId, int flags, MemoryContext mcxt); +extern void inv_close(LargeObjectDesc *obj_desc); +extern int inv_drop(Oid lobjId); +extern int64 inv_seek(LargeObjectDesc *obj_desc, int64 offset, int whence); +extern int64 inv_tell(LargeObjectDesc *obj_desc); +extern int inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes); +extern int inv_write(LargeObjectDesc *obj_desc, const char *buf, int nbytes); +extern void inv_truncate(LargeObjectDesc *obj_desc, int64 len); + +#endif /* LARGE_OBJECT_H */ diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h new file mode 100644 index 0000000..68ab740 --- /dev/null +++ b/src/include/storage/latch.h @@ -0,0 +1,186 @@ +/*------------------------------------------------------------------------- + * + * latch.h + * Routines for interprocess latches + * + * A latch is a boolean variable, with operations that let processes sleep + * until it is set. A latch can be set from another process, or a signal + * handler within the same process. + * + * The latch interface is a reliable replacement for the common pattern of + * using pg_usleep() or select() to wait until a signal arrives, where the + * signal handler sets a flag variable. Because on some platforms an + * incoming signal doesn't interrupt sleep, and even on platforms where it + * does there is a race condition if the signal arrives just before + * entering the sleep, the common pattern must periodically wake up and + * poll the flag variable. The pselect() system call was invented to solve + * this problem, but it is not portable enough. Latches are designed to + * overcome these limitations, allowing you to sleep without polling and + * ensuring quick response to signals from other processes. + * + * There are two kinds of latches: local and shared. A local latch is + * initialized by InitLatch, and can only be set from the same process. + * A local latch can be used to wait for a signal to arrive, by calling + * SetLatch in the signal handler. A shared latch resides in shared memory, + * and must be initialized at postmaster startup by InitSharedLatch. Before + * a shared latch can be waited on, it must be associated with a process + * with OwnLatch. Only the process owning the latch can wait on it, but any + * process can set it. + * + * There are three basic operations on a latch: + * + * SetLatch - Sets the latch + * ResetLatch - Clears the latch, allowing it to be set again + * WaitLatch - Waits for the latch to become set + * + * WaitLatch includes a provision for timeouts (which should be avoided + * when possible, as they incur extra overhead) and a provision for + * postmaster child processes to wake up immediately on postmaster death. + * See latch.c for detailed specifications for the exported functions. + * + * The correct pattern to wait for event(s) is: + * + * for (;;) + * { + * ResetLatch(); + * if (work to do) + * Do Stuff(); + * WaitLatch(); + * } + * + * It's important to reset the latch *before* checking if there's work to + * do. Otherwise, if someone sets the latch between the check and the + * ResetLatch call, you will miss it and Wait will incorrectly block. + * + * Another valid coding pattern looks like: + * + * for (;;) + * { + * if (work to do) + * Do Stuff(); // in particular, exit loop if some condition satisfied + * WaitLatch(); + * ResetLatch(); + * } + * + * This is useful to reduce latch traffic if it's expected that the loop's + * termination condition will often be satisfied in the first iteration; + * the cost is an extra loop iteration before blocking when it is not. + * What must be avoided is placing any checks for asynchronous events after + * WaitLatch and before ResetLatch, as that creates a race condition. + * + * To wake up the waiter, you must first set a global flag or something + * else that the wait loop tests in the "if (work to do)" part, and call + * SetLatch *after* that. SetLatch is designed to return quickly if the + * latch is already set. + * + * On some platforms, signals will not interrupt the latch wait primitive + * by themselves. Therefore, it is critical that any signal handler that + * is meant to terminate a WaitLatch wait calls SetLatch. + * + * Note that use of the process latch (PGPROC.procLatch) is generally better + * than an ad-hoc shared latch for signaling auxiliary processes. This is + * because generic signal handlers will call SetLatch on the process latch + * only, so using any latch other than the process latch effectively precludes + * use of any generic handler. + * + * + * WaitEventSets allow to wait for latches being set and additional events - + * postmaster dying and socket readiness of several sockets currently - at the + * same time. On many platforms using a long lived event set is more + * efficient than using WaitLatch or WaitLatchOrSocket. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/latch.h + * + *------------------------------------------------------------------------- + */ +#ifndef LATCH_H +#define LATCH_H + +#include <signal.h> + +/* + * Latch structure should be treated as opaque and only accessed through + * the public functions. It is defined here to allow embedding Latches as + * part of bigger structs. + */ +typedef struct Latch +{ + sig_atomic_t is_set; + sig_atomic_t maybe_sleeping; + bool is_shared; + int owner_pid; +#ifdef WIN32 + HANDLE event; +#endif +} Latch; + +/* + * Bitmasks for events that may wake-up WaitLatch(), WaitLatchOrSocket(), or + * WaitEventSetWait(). + */ +#define WL_LATCH_SET (1 << 0) +#define WL_SOCKET_READABLE (1 << 1) +#define WL_SOCKET_WRITEABLE (1 << 2) +#define WL_TIMEOUT (1 << 3) /* not for WaitEventSetWait() */ +#define WL_POSTMASTER_DEATH (1 << 4) +#define WL_EXIT_ON_PM_DEATH (1 << 5) +#ifdef WIN32 +#define WL_SOCKET_CONNECTED (1 << 6) +#else +/* avoid having to deal with case on platforms not requiring it */ +#define WL_SOCKET_CONNECTED WL_SOCKET_WRITEABLE +#endif +#define WL_SOCKET_CLOSED (1 << 7) +#define WL_SOCKET_MASK (WL_SOCKET_READABLE | \ + WL_SOCKET_WRITEABLE | \ + WL_SOCKET_CONNECTED | \ + WL_SOCKET_CLOSED) + +typedef struct WaitEvent +{ + int pos; /* position in the event data structure */ + uint32 events; /* triggered events */ + pgsocket fd; /* socket fd associated with event */ + void *user_data; /* pointer provided in AddWaitEventToSet */ +#ifdef WIN32 + bool reset; /* Is reset of the event required? */ +#endif +} WaitEvent; + +/* forward declaration to avoid exposing latch.c implementation details */ +typedef struct WaitEventSet WaitEventSet; + +/* + * prototypes for functions in latch.c + */ +extern void InitializeLatchSupport(void); +extern void InitLatch(Latch *latch); +extern void InitSharedLatch(Latch *latch); +extern void OwnLatch(Latch *latch); +extern void DisownLatch(Latch *latch); +extern void SetLatch(Latch *latch); +extern void ResetLatch(Latch *latch); +extern void ShutdownLatchSupport(void); + +extern WaitEventSet *CreateWaitEventSet(MemoryContext context, int nevents); +extern void FreeWaitEventSet(WaitEventSet *set); +extern int AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, + Latch *latch, void *user_data); +extern void ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch); + +extern int WaitEventSetWait(WaitEventSet *set, long timeout, + WaitEvent *occurred_events, int nevents, + uint32 wait_event_info); +extern int WaitLatch(Latch *latch, int wakeEvents, long timeout, + uint32 wait_event_info); +extern int WaitLatchOrSocket(Latch *latch, int wakeEvents, + pgsocket sock, long timeout, uint32 wait_event_info); +extern void InitializeLatchWaitSet(void); +extern int GetNumRegisteredWaitEvents(WaitEventSet *set); +extern bool WaitEventSetCanReportClosed(void); + +#endif /* LATCH_H */ diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h new file mode 100644 index 0000000..be1d2c9 --- /dev/null +++ b/src/include/storage/lmgr.h @@ -0,0 +1,115 @@ +/*------------------------------------------------------------------------- + * + * lmgr.h + * POSTGRES lock manager definitions. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/lmgr.h + * + *------------------------------------------------------------------------- + */ +#ifndef LMGR_H +#define LMGR_H + +#include "lib/stringinfo.h" +#include "storage/itemptr.h" +#include "storage/lock.h" +#include "utils/rel.h" + + +/* XactLockTableWait operations */ +typedef enum XLTW_Oper +{ + XLTW_None, + XLTW_Update, + XLTW_Delete, + XLTW_Lock, + XLTW_LockUpdated, + XLTW_InsertIndex, + XLTW_InsertIndexUnique, + XLTW_FetchUpdated, + XLTW_RecheckExclusionConstr +} XLTW_Oper; + +extern void RelationInitLockInfo(Relation relation); + +/* Lock a relation */ +extern void LockRelationOid(Oid relid, LOCKMODE lockmode); +extern void LockRelationId(LockRelId *relid, LOCKMODE lockmode); +extern bool ConditionalLockRelationOid(Oid relid, LOCKMODE lockmode); +extern void UnlockRelationId(LockRelId *relid, LOCKMODE lockmode); +extern void UnlockRelationOid(Oid relid, LOCKMODE lockmode); + +extern void LockRelation(Relation relation, LOCKMODE lockmode); +extern bool ConditionalLockRelation(Relation relation, LOCKMODE lockmode); +extern void UnlockRelation(Relation relation, LOCKMODE lockmode); +extern bool CheckRelationLockedByMe(Relation relation, LOCKMODE lockmode, + bool orstronger); +extern bool LockHasWaitersRelation(Relation relation, LOCKMODE lockmode); + +extern void LockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode); +extern void UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode); + +/* Lock a relation for extension */ +extern void LockRelationForExtension(Relation relation, LOCKMODE lockmode); +extern void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode); +extern bool ConditionalLockRelationForExtension(Relation relation, + LOCKMODE lockmode); +extern int RelationExtensionLockWaiterCount(Relation relation); + +/* Lock to recompute pg_database.datfrozenxid in the current database */ +extern void LockDatabaseFrozenIds(LOCKMODE lockmode); + +/* Lock a page (currently only used within indexes) */ +extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); +extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); +extern void UnlockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); + +/* Lock a tuple (see heap_lock_tuple before assuming you understand this) */ +extern void LockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode); +extern bool ConditionalLockTuple(Relation relation, ItemPointer tid, + LOCKMODE lockmode); +extern void UnlockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode); + +/* Lock an XID (used to wait for a transaction to finish) */ +extern void XactLockTableInsert(TransactionId xid); +extern void XactLockTableDelete(TransactionId xid); +extern void XactLockTableWait(TransactionId xid, Relation rel, + ItemPointer ctid, XLTW_Oper oper); +extern bool ConditionalXactLockTableWait(TransactionId xid); + +/* Lock VXIDs, specified by conflicting locktags */ +extern void WaitForLockers(LOCKTAG heaplocktag, LOCKMODE lockmode, bool progress); +extern void WaitForLockersMultiple(List *locktags, LOCKMODE lockmode, bool progress); + +/* Lock an XID for tuple insertion (used to wait for an insertion to finish) */ +extern uint32 SpeculativeInsertionLockAcquire(TransactionId xid); +extern void SpeculativeInsertionLockRelease(TransactionId xid); +extern void SpeculativeInsertionWait(TransactionId xid, uint32 token); + +/* Lock a general object (other than a relation) of the current database */ +extern void LockDatabaseObject(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode); +extern void UnlockDatabaseObject(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode); + +/* Lock a shared-across-databases object (other than a relation) */ +extern void LockSharedObject(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode); +extern void UnlockSharedObject(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode); + +extern void LockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode); +extern void UnlockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode); + +/* Describe a locktag for error messages */ +extern void DescribeLockTag(StringInfo buf, const LOCKTAG *tag); + +extern const char *GetLockNameFromTagType(uint16 locktag_type); + +#endif /* LMGR_H */ diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h new file mode 100644 index 0000000..e4e1495 --- /dev/null +++ b/src/include/storage/lock.h @@ -0,0 +1,616 @@ +/*------------------------------------------------------------------------- + * + * lock.h + * POSTGRES low-level lock mechanism + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/lock.h + * + *------------------------------------------------------------------------- + */ +#ifndef LOCK_H_ +#define LOCK_H_ + +#ifdef FRONTEND +#error "lock.h may not be included from frontend code" +#endif + +#include "storage/backendid.h" +#include "storage/lockdefs.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" +#include "utils/timestamp.h" + +/* struct PGPROC is declared in proc.h, but must forward-reference it */ +typedef struct PGPROC PGPROC; + +typedef struct PROC_QUEUE +{ + SHM_QUEUE links; /* head of list of PGPROC objects */ + int size; /* number of entries in list */ +} PROC_QUEUE; + +/* GUC variables */ +extern PGDLLIMPORT int max_locks_per_xact; + +#ifdef LOCK_DEBUG +extern PGDLLIMPORT int Trace_lock_oidmin; +extern PGDLLIMPORT bool Trace_locks; +extern PGDLLIMPORT bool Trace_userlocks; +extern PGDLLIMPORT int Trace_lock_table; +extern PGDLLIMPORT bool Debug_deadlocks; +#endif /* LOCK_DEBUG */ + + +/* + * Top-level transactions are identified by VirtualTransactionIDs comprising + * PGPROC fields backendId and lxid. For recovered prepared transactions, the + * LocalTransactionId is an ordinary XID; LOCKTAG_VIRTUALTRANSACTION never + * refers to that kind. These are guaranteed unique over the short term, but + * will be reused after a database restart or XID wraparound; hence they + * should never be stored on disk. + * + * Note that struct VirtualTransactionId can not be assumed to be atomically + * assignable as a whole. However, type LocalTransactionId is assumed to + * be atomically assignable, and the backend ID doesn't change often enough + * to be a problem, so we can fetch or assign the two fields separately. + * We deliberately refrain from using the struct within PGPROC, to prevent + * coding errors from trying to use struct assignment with it; instead use + * GET_VXID_FROM_PGPROC(). + */ +typedef struct +{ + BackendId backendId; /* backendId from PGPROC */ + LocalTransactionId localTransactionId; /* lxid from PGPROC */ +} VirtualTransactionId; + +#define InvalidLocalTransactionId 0 +#define LocalTransactionIdIsValid(lxid) ((lxid) != InvalidLocalTransactionId) +#define VirtualTransactionIdIsValid(vxid) \ + (LocalTransactionIdIsValid((vxid).localTransactionId)) +#define VirtualTransactionIdIsRecoveredPreparedXact(vxid) \ + ((vxid).backendId == InvalidBackendId) +#define VirtualTransactionIdEquals(vxid1, vxid2) \ + ((vxid1).backendId == (vxid2).backendId && \ + (vxid1).localTransactionId == (vxid2).localTransactionId) +#define SetInvalidVirtualTransactionId(vxid) \ + ((vxid).backendId = InvalidBackendId, \ + (vxid).localTransactionId = InvalidLocalTransactionId) +#define GET_VXID_FROM_PGPROC(vxid, proc) \ + ((vxid).backendId = (proc).backendId, \ + (vxid).localTransactionId = (proc).lxid) + +/* MAX_LOCKMODES cannot be larger than the # of bits in LOCKMASK */ +#define MAX_LOCKMODES 10 + +#define LOCKBIT_ON(lockmode) (1 << (lockmode)) +#define LOCKBIT_OFF(lockmode) (~(1 << (lockmode))) + + +/* + * This data structure defines the locking semantics associated with a + * "lock method". The semantics specify the meaning of each lock mode + * (by defining which lock modes it conflicts with). + * All of this data is constant and is kept in const tables. + * + * numLockModes -- number of lock modes (READ,WRITE,etc) that + * are defined in this lock method. Must be less than MAX_LOCKMODES. + * + * conflictTab -- this is an array of bitmasks showing lock + * mode conflicts. conflictTab[i] is a mask with the j-th bit + * turned on if lock modes i and j conflict. Lock modes are + * numbered 1..numLockModes; conflictTab[0] is unused. + * + * lockModeNames -- ID strings for debug printouts. + * + * trace_flag -- pointer to GUC trace flag for this lock method. (The + * GUC variable is not constant, but we use "const" here to denote that + * it can't be changed through this reference.) + */ +typedef struct LockMethodData +{ + int numLockModes; + const LOCKMASK *conflictTab; + const char *const *lockModeNames; + const bool *trace_flag; +} LockMethodData; + +typedef const LockMethodData *LockMethod; + +/* + * Lock methods are identified by LOCKMETHODID. (Despite the declaration as + * uint16, we are constrained to 256 lockmethods by the layout of LOCKTAG.) + */ +typedef uint16 LOCKMETHODID; + +/* These identify the known lock methods */ +#define DEFAULT_LOCKMETHOD 1 +#define USER_LOCKMETHOD 2 + +/* + * LOCKTAG is the key information needed to look up a LOCK item in the + * lock hashtable. A LOCKTAG value uniquely identifies a lockable object. + * + * The LockTagType enum defines the different kinds of objects we can lock. + * We can handle up to 256 different LockTagTypes. + */ +typedef enum LockTagType +{ + LOCKTAG_RELATION, /* whole relation */ + LOCKTAG_RELATION_EXTEND, /* the right to extend a relation */ + LOCKTAG_DATABASE_FROZEN_IDS, /* pg_database.datfrozenxid */ + LOCKTAG_PAGE, /* one page of a relation */ + LOCKTAG_TUPLE, /* one physical tuple */ + LOCKTAG_TRANSACTION, /* transaction (for waiting for xact done) */ + LOCKTAG_VIRTUALTRANSACTION, /* virtual transaction (ditto) */ + LOCKTAG_SPECULATIVE_TOKEN, /* speculative insertion Xid and token */ + LOCKTAG_OBJECT, /* non-relation database object */ + LOCKTAG_USERLOCK, /* reserved for old contrib/userlock code */ + LOCKTAG_ADVISORY /* advisory user locks */ +} LockTagType; + +#define LOCKTAG_LAST_TYPE LOCKTAG_ADVISORY + +extern PGDLLIMPORT const char *const LockTagTypeNames[]; + +/* + * The LOCKTAG struct is defined with malice aforethought to fit into 16 + * bytes with no padding. Note that this would need adjustment if we were + * to widen Oid, BlockNumber, or TransactionId to more than 32 bits. + * + * We include lockmethodid in the locktag so that a single hash table in + * shared memory can store locks of different lockmethods. + */ +typedef struct LOCKTAG +{ + uint32 locktag_field1; /* a 32-bit ID field */ + uint32 locktag_field2; /* a 32-bit ID field */ + uint32 locktag_field3; /* a 32-bit ID field */ + uint16 locktag_field4; /* a 16-bit ID field */ + uint8 locktag_type; /* see enum LockTagType */ + uint8 locktag_lockmethodid; /* lockmethod indicator */ +} LOCKTAG; + +/* + * These macros define how we map logical IDs of lockable objects into + * the physical fields of LOCKTAG. Use these to set up LOCKTAG values, + * rather than accessing the fields directly. Note multiple eval of target! + */ + +/* ID info for a relation is DB OID + REL OID; DB OID = 0 if shared */ +#define SET_LOCKTAG_RELATION(locktag,dboid,reloid) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_RELATION, \ + (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) + +/* same ID info as RELATION */ +#define SET_LOCKTAG_RELATION_EXTEND(locktag,dboid,reloid) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \ + (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) + +/* ID info for frozen IDs is DB OID */ +#define SET_LOCKTAG_DATABASE_FROZEN_IDS(locktag,dboid) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = 0, \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_DATABASE_FROZEN_IDS, \ + (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) + +/* ID info for a page is RELATION info + BlockNumber */ +#define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = (blocknum), \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_PAGE, \ + (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) + +/* ID info for a tuple is PAGE info + OffsetNumber */ +#define SET_LOCKTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = (blocknum), \ + (locktag).locktag_field4 = (offnum), \ + (locktag).locktag_type = LOCKTAG_TUPLE, \ + (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) + +/* ID info for a transaction is its TransactionId */ +#define SET_LOCKTAG_TRANSACTION(locktag,xid) \ + ((locktag).locktag_field1 = (xid), \ + (locktag).locktag_field2 = 0, \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_TRANSACTION, \ + (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) + +/* ID info for a virtual transaction is its VirtualTransactionId */ +#define SET_LOCKTAG_VIRTUALTRANSACTION(locktag,vxid) \ + ((locktag).locktag_field1 = (vxid).backendId, \ + (locktag).locktag_field2 = (vxid).localTransactionId, \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_VIRTUALTRANSACTION, \ + (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) + +/* + * ID info for a speculative insert is TRANSACTION info + + * its speculative insert counter. + */ +#define SET_LOCKTAG_SPECULATIVE_INSERTION(locktag,xid,token) \ + ((locktag).locktag_field1 = (xid), \ + (locktag).locktag_field2 = (token), \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_SPECULATIVE_TOKEN, \ + (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) + +/* + * ID info for an object is DB OID + CLASS OID + OBJECT OID + SUBID + * + * Note: object ID has same representation as in pg_depend and + * pg_description, but notice that we are constraining SUBID to 16 bits. + * Also, we use DB OID = 0 for shared objects such as tablespaces. + */ +#define SET_LOCKTAG_OBJECT(locktag,dboid,classoid,objoid,objsubid) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (classoid), \ + (locktag).locktag_field3 = (objoid), \ + (locktag).locktag_field4 = (objsubid), \ + (locktag).locktag_type = LOCKTAG_OBJECT, \ + (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) + +#define SET_LOCKTAG_ADVISORY(locktag,id1,id2,id3,id4) \ + ((locktag).locktag_field1 = (id1), \ + (locktag).locktag_field2 = (id2), \ + (locktag).locktag_field3 = (id3), \ + (locktag).locktag_field4 = (id4), \ + (locktag).locktag_type = LOCKTAG_ADVISORY, \ + (locktag).locktag_lockmethodid = USER_LOCKMETHOD) + + +/* + * Per-locked-object lock information: + * + * tag -- uniquely identifies the object being locked + * grantMask -- bitmask for all lock types currently granted on this object. + * waitMask -- bitmask for all lock types currently awaited on this object. + * procLocks -- list of PROCLOCK objects for this lock. + * waitProcs -- queue of processes waiting for this lock. + * requested -- count of each lock type currently requested on the lock + * (includes requests already granted!!). + * nRequested -- total requested locks of all types. + * granted -- count of each lock type currently granted on the lock. + * nGranted -- total granted locks of all types. + * + * Note: these counts count 1 for each backend. Internally to a backend, + * there may be multiple grabs on a particular lock, but this is not reflected + * into shared memory. + */ +typedef struct LOCK +{ + /* hash key */ + LOCKTAG tag; /* unique identifier of lockable object */ + + /* data */ + LOCKMASK grantMask; /* bitmask for lock types already granted */ + LOCKMASK waitMask; /* bitmask for lock types awaited */ + SHM_QUEUE procLocks; /* list of PROCLOCK objects assoc. with lock */ + PROC_QUEUE waitProcs; /* list of PGPROC objects waiting on lock */ + int requested[MAX_LOCKMODES]; /* counts of requested locks */ + int nRequested; /* total of requested[] array */ + int granted[MAX_LOCKMODES]; /* counts of granted locks */ + int nGranted; /* total of granted[] array */ +} LOCK; + +#define LOCK_LOCKMETHOD(lock) ((LOCKMETHODID) (lock).tag.locktag_lockmethodid) +#define LOCK_LOCKTAG(lock) ((LockTagType) (lock).tag.locktag_type) + + +/* + * We may have several different backends holding or awaiting locks + * on the same lockable object. We need to store some per-holder/waiter + * information for each such holder (or would-be holder). This is kept in + * a PROCLOCK struct. + * + * PROCLOCKTAG is the key information needed to look up a PROCLOCK item in the + * proclock hashtable. A PROCLOCKTAG value uniquely identifies the combination + * of a lockable object and a holder/waiter for that object. (We can use + * pointers here because the PROCLOCKTAG need only be unique for the lifespan + * of the PROCLOCK, and it will never outlive the lock or the proc.) + * + * Internally to a backend, it is possible for the same lock to be held + * for different purposes: the backend tracks transaction locks separately + * from session locks. However, this is not reflected in the shared-memory + * state: we only track which backend(s) hold the lock. This is OK since a + * backend can never block itself. + * + * The holdMask field shows the already-granted locks represented by this + * proclock. Note that there will be a proclock object, possibly with + * zero holdMask, for any lock that the process is currently waiting on. + * Otherwise, proclock objects whose holdMasks are zero are recycled + * as soon as convenient. + * + * releaseMask is workspace for LockReleaseAll(): it shows the locks due + * to be released during the current call. This must only be examined or + * set by the backend owning the PROCLOCK. + * + * Each PROCLOCK object is linked into lists for both the associated LOCK + * object and the owning PGPROC object. Note that the PROCLOCK is entered + * into these lists as soon as it is created, even if no lock has yet been + * granted. A PGPROC that is waiting for a lock to be granted will also be + * linked into the lock's waitProcs queue. + */ +typedef struct PROCLOCKTAG +{ + /* NB: we assume this struct contains no padding! */ + LOCK *myLock; /* link to per-lockable-object information */ + PGPROC *myProc; /* link to PGPROC of owning backend */ +} PROCLOCKTAG; + +typedef struct PROCLOCK +{ + /* tag */ + PROCLOCKTAG tag; /* unique identifier of proclock object */ + + /* data */ + PGPROC *groupLeader; /* proc's lock group leader, or proc itself */ + LOCKMASK holdMask; /* bitmask for lock types currently held */ + LOCKMASK releaseMask; /* bitmask for lock types to be released */ + SHM_QUEUE lockLink; /* list link in LOCK's list of proclocks */ + SHM_QUEUE procLink; /* list link in PGPROC's list of proclocks */ +} PROCLOCK; + +#define PROCLOCK_LOCKMETHOD(proclock) \ + LOCK_LOCKMETHOD(*((proclock).tag.myLock)) + +/* + * Each backend also maintains a local hash table with information about each + * lock it is currently interested in. In particular the local table counts + * the number of times that lock has been acquired. This allows multiple + * requests for the same lock to be executed without additional accesses to + * shared memory. We also track the number of lock acquisitions per + * ResourceOwner, so that we can release just those locks belonging to a + * particular ResourceOwner. + * + * When holding a lock taken "normally", the lock and proclock fields always + * point to the associated objects in shared memory. However, if we acquired + * the lock via the fast-path mechanism, the lock and proclock fields are set + * to NULL, since there probably aren't any such objects in shared memory. + * (If the lock later gets promoted to normal representation, we may eventually + * update our locallock's lock/proclock fields after finding the shared + * objects.) + * + * Caution: a locallock object can be left over from a failed lock acquisition + * attempt. In this case its lock/proclock fields are untrustworthy, since + * the shared lock object is neither held nor awaited, and hence is available + * to be reclaimed. If nLocks > 0 then these pointers must either be valid or + * NULL, but when nLocks == 0 they should be considered garbage. + */ +typedef struct LOCALLOCKTAG +{ + LOCKTAG lock; /* identifies the lockable object */ + LOCKMODE mode; /* lock mode for this table entry */ +} LOCALLOCKTAG; + +typedef struct LOCALLOCKOWNER +{ + /* + * Note: if owner is NULL then the lock is held on behalf of the session; + * otherwise it is held on behalf of my current transaction. + * + * Must use a forward struct reference to avoid circularity. + */ + struct ResourceOwnerData *owner; + int64 nLocks; /* # of times held by this owner */ +} LOCALLOCKOWNER; + +typedef struct LOCALLOCK +{ + /* tag */ + LOCALLOCKTAG tag; /* unique identifier of locallock entry */ + + /* data */ + uint32 hashcode; /* copy of LOCKTAG's hash value */ + LOCK *lock; /* associated LOCK object, if any */ + PROCLOCK *proclock; /* associated PROCLOCK object, if any */ + int64 nLocks; /* total number of times lock is held */ + int numLockOwners; /* # of relevant ResourceOwners */ + int maxLockOwners; /* allocated size of array */ + LOCALLOCKOWNER *lockOwners; /* dynamically resizable array */ + bool holdsStrongLockCount; /* bumped FastPathStrongRelationLocks */ + bool lockCleared; /* we read all sinval msgs for lock */ +} LOCALLOCK; + +#define LOCALLOCK_LOCKMETHOD(llock) ((llock).tag.lock.locktag_lockmethodid) +#define LOCALLOCK_LOCKTAG(llock) ((LockTagType) (llock).tag.lock.locktag_type) + + +/* + * These structures hold information passed from lmgr internals to the lock + * listing user-level functions (in lockfuncs.c). + */ + +typedef struct LockInstanceData +{ + LOCKTAG locktag; /* tag for locked object */ + LOCKMASK holdMask; /* locks held by this PGPROC */ + LOCKMODE waitLockMode; /* lock awaited by this PGPROC, if any */ + BackendId backend; /* backend ID of this PGPROC */ + LocalTransactionId lxid; /* local transaction ID of this PGPROC */ + TimestampTz waitStart; /* time at which this PGPROC started waiting + * for lock */ + int pid; /* pid of this PGPROC */ + int leaderPid; /* pid of group leader; = pid if no group */ + bool fastpath; /* taken via fastpath? */ +} LockInstanceData; + +typedef struct LockData +{ + int nelements; /* The length of the array */ + LockInstanceData *locks; /* Array of per-PROCLOCK information */ +} LockData; + +typedef struct BlockedProcData +{ + int pid; /* pid of a blocked PGPROC */ + /* Per-PROCLOCK information about PROCLOCKs of the lock the pid awaits */ + /* (these fields refer to indexes in BlockedProcsData.locks[]) */ + int first_lock; /* index of first relevant LockInstanceData */ + int num_locks; /* number of relevant LockInstanceDatas */ + /* PIDs of PGPROCs that are ahead of "pid" in the lock's wait queue */ + /* (these fields refer to indexes in BlockedProcsData.waiter_pids[]) */ + int first_waiter; /* index of first preceding waiter */ + int num_waiters; /* number of preceding waiters */ +} BlockedProcData; + +typedef struct BlockedProcsData +{ + BlockedProcData *procs; /* Array of per-blocked-proc information */ + LockInstanceData *locks; /* Array of per-PROCLOCK information */ + int *waiter_pids; /* Array of PIDs of other blocked PGPROCs */ + int nprocs; /* # of valid entries in procs[] array */ + int maxprocs; /* Allocated length of procs[] array */ + int nlocks; /* # of valid entries in locks[] array */ + int maxlocks; /* Allocated length of locks[] array */ + int npids; /* # of valid entries in waiter_pids[] array */ + int maxpids; /* Allocated length of waiter_pids[] array */ +} BlockedProcsData; + + +/* Result codes for LockAcquire() */ +typedef enum +{ + LOCKACQUIRE_NOT_AVAIL, /* lock not available, and dontWait=true */ + LOCKACQUIRE_OK, /* lock successfully acquired */ + LOCKACQUIRE_ALREADY_HELD, /* incremented count for lock already held */ + LOCKACQUIRE_ALREADY_CLEAR /* incremented count for lock already clear */ +} LockAcquireResult; + +/* Deadlock states identified by DeadLockCheck() */ +typedef enum +{ + DS_NOT_YET_CHECKED, /* no deadlock check has run yet */ + DS_NO_DEADLOCK, /* no deadlock detected */ + DS_SOFT_DEADLOCK, /* deadlock avoided by queue rearrangement */ + DS_HARD_DEADLOCK, /* deadlock, no way out but ERROR */ + DS_BLOCKED_BY_AUTOVACUUM /* no deadlock; queue blocked by autovacuum + * worker */ +} DeadLockState; + +/* + * The lockmgr's shared hash tables are partitioned to reduce contention. + * To determine which partition a given locktag belongs to, compute the tag's + * hash code with LockTagHashCode(), then apply one of these macros. + * NB: NUM_LOCK_PARTITIONS must be a power of 2! + */ +#define LockHashPartition(hashcode) \ + ((hashcode) % NUM_LOCK_PARTITIONS) +#define LockHashPartitionLock(hashcode) \ + (&MainLWLockArray[LOCK_MANAGER_LWLOCK_OFFSET + \ + LockHashPartition(hashcode)].lock) +#define LockHashPartitionLockByIndex(i) \ + (&MainLWLockArray[LOCK_MANAGER_LWLOCK_OFFSET + (i)].lock) + +/* + * The deadlock detector needs to be able to access lockGroupLeader and + * related fields in the PGPROC, so we arrange for those fields to be protected + * by one of the lock hash partition locks. Since the deadlock detector + * acquires all such locks anyway, this makes it safe for it to access these + * fields without doing anything extra. To avoid contention as much as + * possible, we map different PGPROCs to different partition locks. The lock + * used for a given lock group is determined by the group leader's pgprocno. + */ +#define LockHashPartitionLockByProc(leader_pgproc) \ + LockHashPartitionLock((leader_pgproc)->pgprocno) + +/* + * function prototypes + */ +extern void InitLocks(void); +extern LockMethod GetLocksMethodTable(const LOCK *lock); +extern LockMethod GetLockTagsMethodTable(const LOCKTAG *locktag); +extern uint32 LockTagHashCode(const LOCKTAG *locktag); +extern bool DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2); +extern LockAcquireResult LockAcquire(const LOCKTAG *locktag, + LOCKMODE lockmode, + bool sessionLock, + bool dontWait); +extern LockAcquireResult LockAcquireExtended(const LOCKTAG *locktag, + LOCKMODE lockmode, + bool sessionLock, + bool dontWait, + bool reportMemoryError, + LOCALLOCK **locallockp); +extern void AbortStrongLockAcquire(void); +extern void MarkLockClear(LOCALLOCK *locallock); +extern bool LockRelease(const LOCKTAG *locktag, + LOCKMODE lockmode, bool sessionLock); +extern void LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks); +extern void LockReleaseSession(LOCKMETHODID lockmethodid); +extern void LockReleaseCurrentOwner(LOCALLOCK **locallocks, int nlocks); +extern void LockReassignCurrentOwner(LOCALLOCK **locallocks, int nlocks); +extern bool LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode); +#ifdef USE_ASSERT_CHECKING +extern HTAB *GetLockMethodLocalHash(void); +#endif +extern bool LockHasWaiters(const LOCKTAG *locktag, + LOCKMODE lockmode, bool sessionLock); +extern VirtualTransactionId *GetLockConflicts(const LOCKTAG *locktag, + LOCKMODE lockmode, int *countp); +extern void AtPrepare_Locks(void); +extern void PostPrepare_Locks(TransactionId xid); +extern bool LockCheckConflicts(LockMethod lockMethodTable, + LOCKMODE lockmode, + LOCK *lock, PROCLOCK *proclock); +extern void GrantLock(LOCK *lock, PROCLOCK *proclock, LOCKMODE lockmode); +extern void GrantAwaitedLock(void); +extern void RemoveFromWaitQueue(PGPROC *proc, uint32 hashcode); +extern Size LockShmemSize(void); +extern LockData *GetLockStatusData(void); +extern BlockedProcsData *GetBlockerStatusData(int blocked_pid); + +extern xl_standby_lock *GetRunningTransactionLocks(int *nlocks); +extern const char *GetLockmodeName(LOCKMETHODID lockmethodid, LOCKMODE mode); + +extern void lock_twophase_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len); +extern void lock_twophase_postcommit(TransactionId xid, uint16 info, + void *recdata, uint32 len); +extern void lock_twophase_postabort(TransactionId xid, uint16 info, + void *recdata, uint32 len); +extern void lock_twophase_standby_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len); + +extern DeadLockState DeadLockCheck(PGPROC *proc); +extern PGPROC *GetBlockingAutoVacuumPgproc(void); +extern void DeadLockReport(void) pg_attribute_noreturn(); +extern void RememberSimpleDeadLock(PGPROC *proc1, + LOCKMODE lockmode, + LOCK *lock, + PGPROC *proc2); +extern void InitDeadLockChecking(void); + +extern int LockWaiterCount(const LOCKTAG *locktag); + +#ifdef LOCK_DEBUG +extern void DumpLocks(PGPROC *proc); +extern void DumpAllLocks(void); +#endif + +/* Lock a VXID (used to wait for a transaction to finish) */ +extern void VirtualXactLockTableInsert(VirtualTransactionId vxid); +extern void VirtualXactLockTableCleanup(void); +extern bool VirtualXactLock(VirtualTransactionId vxid, bool wait); + +#endif /* LOCK_H_ */ diff --git a/src/include/storage/lockdefs.h b/src/include/storage/lockdefs.h new file mode 100644 index 0000000..350ddd4 --- /dev/null +++ b/src/include/storage/lockdefs.h @@ -0,0 +1,59 @@ +/*------------------------------------------------------------------------- + * + * lockdefs.h + * Frontend exposed parts of postgres' low level lock mechanism + * + * The split between lockdefs.h and lock.h is not very principled. This file + * contains definition that have to (indirectly) be available when included by + * FRONTEND code. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/lockdefs.h + * + *------------------------------------------------------------------------- + */ +#ifndef LOCKDEFS_H_ +#define LOCKDEFS_H_ + +/* + * LOCKMODE is an integer (1..N) indicating a lock type. LOCKMASK is a bit + * mask indicating a set of held or requested lock types (the bit 1<<mode + * corresponds to a particular lock mode). + */ +typedef int LOCKMASK; +typedef int LOCKMODE; + +/* + * These are the valid values of type LOCKMODE for all the standard lock + * methods (both DEFAULT and USER). + */ + +/* NoLock is not a lock mode, but a flag value meaning "don't get a lock" */ +#define NoLock 0 + +#define AccessShareLock 1 /* SELECT */ +#define RowShareLock 2 /* SELECT FOR UPDATE/FOR SHARE */ +#define RowExclusiveLock 3 /* INSERT, UPDATE, DELETE */ +#define ShareUpdateExclusiveLock 4 /* VACUUM (non-FULL),ANALYZE, CREATE INDEX + * CONCURRENTLY */ +#define ShareLock 5 /* CREATE INDEX (WITHOUT CONCURRENTLY) */ +#define ShareRowExclusiveLock 6 /* like EXCLUSIVE MODE, but allows ROW + * SHARE */ +#define ExclusiveLock 7 /* blocks ROW SHARE/SELECT...FOR UPDATE */ +#define AccessExclusiveLock 8 /* ALTER TABLE, DROP TABLE, VACUUM FULL, + * and unqualified LOCK TABLE */ + +#define MaxLockMode 8 /* highest standard lock mode */ + + +/* WAL representation of an AccessExclusiveLock on a table */ +typedef struct xl_standby_lock +{ + TransactionId xid; /* xid of holder of AccessExclusiveLock */ + Oid dbOid; /* DB containing table */ + Oid relOid; /* OID of table */ +} xl_standby_lock; + +#endif /* LOCKDEFS_H_ */ diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h new file mode 100644 index 0000000..e03d317 --- /dev/null +++ b/src/include/storage/lwlock.h @@ -0,0 +1,206 @@ +/*------------------------------------------------------------------------- + * + * lwlock.h + * Lightweight lock manager + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/lwlock.h + * + *------------------------------------------------------------------------- + */ +#ifndef LWLOCK_H +#define LWLOCK_H + +#ifdef FRONTEND +#error "lwlock.h may not be included from frontend code" +#endif + +#include "port/atomics.h" +#include "storage/proclist_types.h" + +struct PGPROC; + +/* + * Code outside of lwlock.c should not manipulate the contents of this + * structure directly, but we have to declare it here to allow LWLocks to be + * incorporated into other data structures. + */ +typedef struct LWLock +{ + uint16 tranche; /* tranche ID */ + pg_atomic_uint32 state; /* state of exclusive/nonexclusive lockers */ + proclist_head waiters; /* list of waiting PGPROCs */ +#ifdef LOCK_DEBUG + pg_atomic_uint32 nwaiters; /* number of waiters */ + struct PGPROC *owner; /* last exclusive owner of the lock */ +#endif +} LWLock; + +/* + * In most cases, it's desirable to force each tranche of LWLocks to be aligned + * on a cache line boundary and make the array stride a power of 2. This saves + * a few cycles in indexing, but more importantly ensures that individual + * LWLocks don't cross cache line boundaries. This reduces cache contention + * problems, especially on AMD Opterons. In some cases, it's useful to add + * even more padding so that each LWLock takes up an entire cache line; this is + * useful, for example, in the main LWLock array, where the overall number of + * locks is small but some are heavily contended. + */ +#define LWLOCK_PADDED_SIZE PG_CACHE_LINE_SIZE + +/* LWLock, padded to a full cache line size */ +typedef union LWLockPadded +{ + LWLock lock; + char pad[LWLOCK_PADDED_SIZE]; +} LWLockPadded; + +extern PGDLLIMPORT LWLockPadded *MainLWLockArray; + +/* struct for storing named tranche information */ +typedef struct NamedLWLockTranche +{ + int trancheId; + char *trancheName; +} NamedLWLockTranche; + +extern PGDLLIMPORT NamedLWLockTranche *NamedLWLockTrancheArray; +extern PGDLLIMPORT int NamedLWLockTrancheRequests; + +/* Names for fixed lwlocks */ +#include "storage/lwlocknames.h" + +/* + * It's a bit odd to declare NUM_BUFFER_PARTITIONS and NUM_LOCK_PARTITIONS + * here, but we need them to figure out offsets within MainLWLockArray, and + * having this file include lock.h or bufmgr.h would be backwards. + */ + +/* Number of partitions of the shared buffer mapping hashtable */ +#define NUM_BUFFER_PARTITIONS 128 + +/* Number of partitions the shared lock tables are divided into */ +#define LOG2_NUM_LOCK_PARTITIONS 4 +#define NUM_LOCK_PARTITIONS (1 << LOG2_NUM_LOCK_PARTITIONS) + +/* Number of partitions the shared predicate lock tables are divided into */ +#define LOG2_NUM_PREDICATELOCK_PARTITIONS 4 +#define NUM_PREDICATELOCK_PARTITIONS (1 << LOG2_NUM_PREDICATELOCK_PARTITIONS) + +/* Offsets for various chunks of preallocated lwlocks. */ +#define BUFFER_MAPPING_LWLOCK_OFFSET NUM_INDIVIDUAL_LWLOCKS +#define LOCK_MANAGER_LWLOCK_OFFSET \ + (BUFFER_MAPPING_LWLOCK_OFFSET + NUM_BUFFER_PARTITIONS) +#define PREDICATELOCK_MANAGER_LWLOCK_OFFSET \ + (LOCK_MANAGER_LWLOCK_OFFSET + NUM_LOCK_PARTITIONS) +#define NUM_FIXED_LWLOCKS \ + (PREDICATELOCK_MANAGER_LWLOCK_OFFSET + NUM_PREDICATELOCK_PARTITIONS) + +typedef enum LWLockMode +{ + LW_EXCLUSIVE, + LW_SHARED, + LW_WAIT_UNTIL_FREE /* A special mode used in PGPROC->lwWaitMode, + * when waiting for lock to become free. Not + * to be used as LWLockAcquire argument */ +} LWLockMode; + + +#ifdef LOCK_DEBUG +extern PGDLLIMPORT bool Trace_lwlocks; +#endif + +extern bool LWLockAcquire(LWLock *lock, LWLockMode mode); +extern bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode); +extern bool LWLockAcquireOrWait(LWLock *lock, LWLockMode mode); +extern void LWLockRelease(LWLock *lock); +extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val); +extern void LWLockReleaseAll(void); +extern bool LWLockHeldByMe(LWLock *lock); +extern bool LWLockAnyHeldByMe(LWLock *lock, int nlocks, size_t stride); +extern bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode); + +extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval); +extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value); + +extern Size LWLockShmemSize(void); +extern void CreateLWLocks(void); +extern void InitLWLockAccess(void); + +extern const char *GetLWLockIdentifier(uint32 classId, uint16 eventId); + +/* + * Extensions (or core code) can obtain an LWLocks by calling + * RequestNamedLWLockTranche() during postmaster startup. Subsequently, + * call GetNamedLWLockTranche() to obtain a pointer to an array containing + * the number of LWLocks requested. + */ +extern void RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks); +extern LWLockPadded *GetNamedLWLockTranche(const char *tranche_name); + +/* + * There is another, more flexible method of obtaining lwlocks. First, call + * LWLockNewTrancheId just once to obtain a tranche ID; this allocates from + * a shared counter. Next, each individual process using the tranche should + * call LWLockRegisterTranche() to associate that tranche ID with a name. + * Finally, LWLockInitialize should be called just once per lwlock, passing + * the tranche ID as an argument. + * + * It may seem strange that each process using the tranche must register it + * separately, but dynamic shared memory segments aren't guaranteed to be + * mapped at the same address in all coordinating backends, so storing the + * registration in the main shared memory segment wouldn't work for that case. + */ +extern int LWLockNewTrancheId(void); +extern void LWLockRegisterTranche(int tranche_id, const char *tranche_name); +extern void LWLockInitialize(LWLock *lock, int tranche_id); + +/* + * Every tranche ID less than NUM_INDIVIDUAL_LWLOCKS is reserved; also, + * we reserve additional tranche IDs for builtin tranches not included in + * the set of individual LWLocks. A call to LWLockNewTrancheId will never + * return a value less than LWTRANCHE_FIRST_USER_DEFINED. + */ +typedef enum BuiltinTrancheIds +{ + LWTRANCHE_XACT_BUFFER = NUM_INDIVIDUAL_LWLOCKS, + LWTRANCHE_COMMITTS_BUFFER, + LWTRANCHE_SUBTRANS_BUFFER, + LWTRANCHE_MULTIXACTOFFSET_BUFFER, + LWTRANCHE_MULTIXACTMEMBER_BUFFER, + LWTRANCHE_NOTIFY_BUFFER, + LWTRANCHE_SERIAL_BUFFER, + LWTRANCHE_WAL_INSERT, + LWTRANCHE_BUFFER_CONTENT, + LWTRANCHE_REPLICATION_ORIGIN_STATE, + LWTRANCHE_REPLICATION_SLOT_IO, + LWTRANCHE_LOCK_FASTPATH, + LWTRANCHE_BUFFER_MAPPING, + LWTRANCHE_LOCK_MANAGER, + LWTRANCHE_PREDICATE_LOCK_MANAGER, + LWTRANCHE_PARALLEL_HASH_JOIN, + LWTRANCHE_PARALLEL_QUERY_DSA, + LWTRANCHE_PER_SESSION_DSA, + LWTRANCHE_PER_SESSION_RECORD_TYPE, + LWTRANCHE_PER_SESSION_RECORD_TYPMOD, + LWTRANCHE_SHARED_TUPLESTORE, + LWTRANCHE_SHARED_TIDBITMAP, + LWTRANCHE_PARALLEL_APPEND, + LWTRANCHE_PER_XACT_PREDICATE_LIST, + LWTRANCHE_PGSTATS_DSA, + LWTRANCHE_PGSTATS_HASH, + LWTRANCHE_PGSTATS_DATA, + LWTRANCHE_FIRST_USER_DEFINED +} BuiltinTrancheIds; + +/* + * Prior to PostgreSQL 9.4, we used an enum type called LWLockId to refer + * to LWLocks. New code should instead use LWLock *. However, for the + * convenience of third-party code, we include the following typedef. + */ +typedef LWLock *LWLockId; + +#endif /* LWLOCK_H */ diff --git a/src/include/storage/md.h b/src/include/storage/md.h new file mode 100644 index 0000000..ffffa40 --- /dev/null +++ b/src/include/storage/md.h @@ -0,0 +1,52 @@ +/*------------------------------------------------------------------------- + * + * md.h + * magnetic disk storage manager public interface declarations. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/md.h + * + *------------------------------------------------------------------------- + */ +#ifndef MD_H +#define MD_H + +#include "storage/block.h" +#include "storage/relfilenode.h" +#include "storage/smgr.h" +#include "storage/sync.h" + +/* md storage manager functionality */ +extern void mdinit(void); +extern void mdopen(SMgrRelation reln); +extern void mdclose(SMgrRelation reln, ForkNumber forknum); +extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern bool mdexists(SMgrRelation reln, ForkNumber forknum); +extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); +extern void mdextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer); +extern void mdwrite(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void mdwriteback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum); +extern void mdtruncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum); + +extern void ForgetDatabaseSyncRequests(Oid dbid); +extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo); + +/* md sync callbacks */ +extern int mdsyncfiletag(const FileTag *ftag, char *path); +extern int mdunlinkfiletag(const FileTag *ftag, char *path); +extern bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate); + +#endif /* MD_H */ diff --git a/src/include/storage/off.h b/src/include/storage/off.h new file mode 100644 index 0000000..e6573ac --- /dev/null +++ b/src/include/storage/off.h @@ -0,0 +1,57 @@ +/*------------------------------------------------------------------------- + * + * off.h + * POSTGRES disk "offset" definitions. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/off.h + * + *------------------------------------------------------------------------- + */ +#ifndef OFF_H +#define OFF_H + +#include "storage/itemid.h" +/* + * OffsetNumber: + * + * this is a 1-based index into the linp (ItemIdData) array in the + * header of each disk page. + */ +typedef uint16 OffsetNumber; + +#define InvalidOffsetNumber ((OffsetNumber) 0) +#define FirstOffsetNumber ((OffsetNumber) 1) +#define MaxOffsetNumber ((OffsetNumber) (BLCKSZ / sizeof(ItemIdData))) + +/* ---------------- + * support macros + * ---------------- + */ + +/* + * OffsetNumberIsValid + * True iff the offset number is valid. + */ +#define OffsetNumberIsValid(offsetNumber) \ + ((bool) ((offsetNumber != InvalidOffsetNumber) && \ + (offsetNumber <= MaxOffsetNumber))) + +/* + * OffsetNumberNext + * OffsetNumberPrev + * Increments/decrements the argument. These macros look pointless + * but they help us disambiguate the different manipulations on + * OffsetNumbers (e.g., sometimes we subtract one from an + * OffsetNumber to move back, and sometimes we do so to form a + * real C array index). + */ +#define OffsetNumberNext(offsetNumber) \ + ((OffsetNumber) (1 + (offsetNumber))) +#define OffsetNumberPrev(offsetNumber) \ + ((OffsetNumber) (-1 + (offsetNumber))) + +#endif /* OFF_H */ diff --git a/src/include/storage/pg_sema.h b/src/include/storage/pg_sema.h new file mode 100644 index 0000000..5ca941a --- /dev/null +++ b/src/include/storage/pg_sema.h @@ -0,0 +1,61 @@ +/*------------------------------------------------------------------------- + * + * pg_sema.h + * Platform-independent API for semaphores. + * + * PostgreSQL requires counting semaphores (the kind that keep track of + * multiple unlock operations, and will allow an equal number of subsequent + * lock operations before blocking). The underlying implementation is + * not the same on every platform. This file defines the API that must + * be provided by each port. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/pg_sema.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_SEMA_H +#define PG_SEMA_H + +/* + * struct PGSemaphoreData and pointer type PGSemaphore are the data structure + * representing an individual semaphore. The contents of PGSemaphoreData vary + * across implementations and must never be touched by platform-independent + * code; hence, PGSemaphoreData is declared as an opaque struct here. + * + * However, Windows is sufficiently unlike our other ports that it doesn't + * seem worth insisting on ABI compatibility for Windows too. Hence, on + * that platform just define PGSemaphore as HANDLE. + */ +#ifndef USE_WIN32_SEMAPHORES +typedef struct PGSemaphoreData *PGSemaphore; +#else +typedef HANDLE PGSemaphore; +#endif + + +/* Report amount of shared memory needed */ +extern Size PGSemaphoreShmemSize(int maxSemas); + +/* Module initialization (called during postmaster start or shmem reinit) */ +extern void PGReserveSemaphores(int maxSemas); + +/* Allocate a PGSemaphore structure with initial count 1 */ +extern PGSemaphore PGSemaphoreCreate(void); + +/* Reset a previously-initialized PGSemaphore to have count 0 */ +extern void PGSemaphoreReset(PGSemaphore sema); + +/* Lock a semaphore (decrement count), blocking if count would be < 0 */ +extern void PGSemaphoreLock(PGSemaphore sema); + +/* Unlock a semaphore (increment count) */ +extern void PGSemaphoreUnlock(PGSemaphore sema); + +/* Lock a semaphore only if able to do so without blocking */ +extern bool PGSemaphoreTryLock(PGSemaphore sema); + +#endif /* PG_SEMA_H */ diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h new file mode 100644 index 0000000..da5962e --- /dev/null +++ b/src/include/storage/pg_shmem.h @@ -0,0 +1,92 @@ +/*------------------------------------------------------------------------- + * + * pg_shmem.h + * Platform-independent API for shared memory support. + * + * Every port is expected to support shared memory with approximately + * SysV-ish semantics; in particular, a memory block is not anonymous + * but has an ID, and we must be able to tell whether there are any + * remaining processes attached to a block of a specified ID. + * + * To simplify life for the SysV implementation, the ID is assumed to + * consist of two unsigned long values (these are key and ID in SysV + * terms). Other platforms may ignore the second value if they need + * only one ID number. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/pg_shmem.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_SHMEM_H +#define PG_SHMEM_H + +#include "storage/dsm_impl.h" + +typedef struct PGShmemHeader /* standard header for all Postgres shmem */ +{ + int32 magic; /* magic # to identify Postgres segments */ +#define PGShmemMagic 679834894 + pid_t creatorPID; /* PID of creating process (set but unread) */ + Size totalsize; /* total size of segment */ + Size freeoffset; /* offset to first free space */ + dsm_handle dsm_control; /* ID of dynamic shared memory control seg */ + void *index; /* pointer to ShmemIndex table */ +#ifndef WIN32 /* Windows doesn't have useful inode#s */ + dev_t device; /* device data directory is on */ + ino_t inode; /* inode number of data directory */ +#endif +} PGShmemHeader; + +/* GUC variables */ +extern PGDLLIMPORT int shared_memory_type; +extern PGDLLIMPORT int huge_pages; +extern PGDLLIMPORT int huge_page_size; + +/* Possible values for huge_pages */ +typedef enum +{ + HUGE_PAGES_OFF, + HUGE_PAGES_ON, + HUGE_PAGES_TRY +} HugePagesType; + +/* Possible values for shared_memory_type */ +typedef enum +{ + SHMEM_TYPE_WINDOWS, + SHMEM_TYPE_SYSV, + SHMEM_TYPE_MMAP +} PGShmemType; + +#ifndef WIN32 +extern PGDLLIMPORT unsigned long UsedShmemSegID; +#else +extern PGDLLIMPORT HANDLE UsedShmemSegID; +extern PGDLLIMPORT void *ShmemProtectiveRegion; +#endif +extern PGDLLIMPORT void *UsedShmemSegAddr; + +#if !defined(WIN32) && !defined(EXEC_BACKEND) +#define DEFAULT_SHARED_MEMORY_TYPE SHMEM_TYPE_MMAP +#elif !defined(WIN32) +#define DEFAULT_SHARED_MEMORY_TYPE SHMEM_TYPE_SYSV +#else +#define DEFAULT_SHARED_MEMORY_TYPE SHMEM_TYPE_WINDOWS +#endif + +#ifdef EXEC_BACKEND +extern void PGSharedMemoryReAttach(void); +extern void PGSharedMemoryNoReAttach(void); +#endif + +extern PGShmemHeader *PGSharedMemoryCreate(Size size, + PGShmemHeader **shim); +extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2); +extern void PGSharedMemoryDetach(void); +extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags); + +#endif /* PG_SHMEM_H */ diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h new file mode 100644 index 0000000..58f4ddf --- /dev/null +++ b/src/include/storage/pmsignal.h @@ -0,0 +1,105 @@ +/*------------------------------------------------------------------------- + * + * pmsignal.h + * routines for signaling between the postmaster and its child processes + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/pmsignal.h + * + *------------------------------------------------------------------------- + */ +#ifndef PMSIGNAL_H +#define PMSIGNAL_H + +#include <signal.h> + +#ifdef HAVE_SYS_PRCTL_H +#include "sys/prctl.h" +#endif + +#ifdef HAVE_SYS_PROCCTL_H +#include "sys/procctl.h" +#endif + +/* + * Reasons for signaling the postmaster. We can cope with simultaneous + * signals for different reasons. If the same reason is signaled multiple + * times in quick succession, however, the postmaster is likely to observe + * only one notification of it. This is okay for the present uses. + */ +typedef enum +{ + PMSIGNAL_RECOVERY_STARTED, /* recovery has started */ + PMSIGNAL_BEGIN_HOT_STANDBY, /* begin Hot Standby */ + PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */ + PMSIGNAL_START_AUTOVAC_LAUNCHER, /* start an autovacuum launcher */ + PMSIGNAL_START_AUTOVAC_WORKER, /* start an autovacuum worker */ + PMSIGNAL_BACKGROUND_WORKER_CHANGE, /* background worker state change */ + PMSIGNAL_START_WALRECEIVER, /* start a walreceiver */ + PMSIGNAL_ADVANCE_STATE_MACHINE, /* advance postmaster's state machine */ + + NUM_PMSIGNALS /* Must be last value of enum! */ +} PMSignalReason; + +/* + * Reasons why the postmaster would send SIGQUIT to its children. + */ +typedef enum +{ + PMQUIT_NOT_SENT = 0, /* postmaster hasn't sent SIGQUIT */ + PMQUIT_FOR_CRASH, /* some other backend bought the farm */ + PMQUIT_FOR_STOP /* immediate stop was commanded */ +} QuitSignalReason; + +/* PMSignalData is an opaque struct, details known only within pmsignal.c */ +typedef struct PMSignalData PMSignalData; + +/* + * prototypes for functions in pmsignal.c + */ +extern Size PMSignalShmemSize(void); +extern void PMSignalShmemInit(void); +extern void SendPostmasterSignal(PMSignalReason reason); +extern bool CheckPostmasterSignal(PMSignalReason reason); +extern void SetQuitSignalReason(QuitSignalReason reason); +extern QuitSignalReason GetQuitSignalReason(void); +extern int AssignPostmasterChildSlot(void); +extern bool ReleasePostmasterChildSlot(int slot); +extern bool IsPostmasterChildWalSender(int slot); +extern void MarkPostmasterChildActive(void); +extern void MarkPostmasterChildInactive(void); +extern void MarkPostmasterChildWalSender(void); +extern bool PostmasterIsAliveInternal(void); +extern void PostmasterDeathSignalInit(void); + + +/* + * Do we have a way to ask for a signal on parent death? + * + * If we do, pmsignal.c will set up a signal handler, that sets a flag when + * the parent dies. Checking the flag first makes PostmasterIsAlive() a lot + * cheaper in usual case that the postmaster is alive. + */ +#if (defined(HAVE_SYS_PRCTL_H) && defined(PR_SET_PDEATHSIG)) || \ + (defined(HAVE_SYS_PROCCTL_H) && defined(PROC_PDEATHSIG_CTL)) +#define USE_POSTMASTER_DEATH_SIGNAL +#endif + +#ifdef USE_POSTMASTER_DEATH_SIGNAL +extern PGDLLIMPORT volatile sig_atomic_t postmaster_possibly_dead; + +static inline bool +PostmasterIsAlive(void) +{ + if (likely(!postmaster_possibly_dead)) + return true; + return PostmasterIsAliveInternal(); +} +#else +#define PostmasterIsAlive() PostmasterIsAliveInternal() +#endif + +#endif /* PMSIGNAL_H */ diff --git a/src/include/storage/predicate.h b/src/include/storage/predicate.h new file mode 100644 index 0000000..8dfcb39 --- /dev/null +++ b/src/include/storage/predicate.h @@ -0,0 +1,87 @@ +/*------------------------------------------------------------------------- + * + * predicate.h + * POSTGRES public predicate locking definitions. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/predicate.h + * + *------------------------------------------------------------------------- + */ +#ifndef PREDICATE_H +#define PREDICATE_H + +#include "storage/lock.h" +#include "utils/relcache.h" +#include "utils/snapshot.h" + + +/* + * GUC variables + */ +extern PGDLLIMPORT int max_predicate_locks_per_xact; +extern PGDLLIMPORT int max_predicate_locks_per_relation; +extern PGDLLIMPORT int max_predicate_locks_per_page; + + +/* Number of SLRU buffers to use for Serial SLRU */ +#define NUM_SERIAL_BUFFERS 16 + +/* + * A handle used for sharing SERIALIZABLEXACT objects between the participants + * in a parallel query. + */ +typedef void *SerializableXactHandle; + +/* + * function prototypes + */ + +/* housekeeping for shared memory predicate lock structures */ +extern void InitPredicateLocks(void); +extern Size PredicateLockShmemSize(void); + +extern void CheckPointPredicate(void); + +/* predicate lock reporting */ +extern bool PageIsPredicateLocked(Relation relation, BlockNumber blkno); + +/* predicate lock maintenance */ +extern Snapshot GetSerializableTransactionSnapshot(Snapshot snapshot); +extern void SetSerializableTransactionSnapshot(Snapshot snapshot, + VirtualTransactionId *sourcevxid, + int sourcepid); +extern void RegisterPredicateLockingXid(TransactionId xid); +extern void PredicateLockRelation(Relation relation, Snapshot snapshot); +extern void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot); +extern void PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot, + TransactionId insert_xid); +extern void PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, BlockNumber newblkno); +extern void PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, BlockNumber newblkno); +extern void TransferPredicateLocksToHeapRelation(Relation relation); +extern void ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe); + +/* conflict detection (may also trigger rollback) */ +extern bool CheckForSerializableConflictOutNeeded(Relation relation, Snapshot snapshot); +extern void CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot); +extern void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno); +extern void CheckTableForSerializableConflictIn(Relation relation); + +/* final rollback checking */ +extern void PreCommit_CheckForSerializationFailure(void); + +/* two-phase commit support */ +extern void AtPrepare_PredicateLocks(void); +extern void PostPrepare_PredicateLocks(TransactionId xid); +extern void PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit); +extern void predicatelock_twophase_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len); + +/* parallel query support */ +extern SerializableXactHandle ShareSerializableXact(void); +extern void AttachSerializableXact(SerializableXactHandle handle); + +#endif /* PREDICATE_H */ diff --git a/src/include/storage/predicate_internals.h b/src/include/storage/predicate_internals.h new file mode 100644 index 0000000..2416d3c --- /dev/null +++ b/src/include/storage/predicate_internals.h @@ -0,0 +1,494 @@ +/*------------------------------------------------------------------------- + * + * predicate_internals.h + * POSTGRES internal predicate locking definitions. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/predicate_internals.h + * + *------------------------------------------------------------------------- + */ +#ifndef PREDICATE_INTERNALS_H +#define PREDICATE_INTERNALS_H + +#include "storage/lock.h" +#include "storage/lwlock.h" + +/* + * Commit number. + */ +typedef uint64 SerCommitSeqNo; + +/* + * Reserved commit sequence numbers: + * - 0 is reserved to indicate a non-existent SLRU entry; it cannot be + * used as a SerCommitSeqNo, even an invalid one + * - InvalidSerCommitSeqNo is used to indicate a transaction that + * hasn't committed yet, so use a number greater than all valid + * ones to make comparison do the expected thing + * - RecoverySerCommitSeqNo is used to refer to transactions that + * happened before a crash/recovery, since we restart the sequence + * at that point. It's earlier than all normal sequence numbers, + * and is only used by recovered prepared transactions + */ +#define InvalidSerCommitSeqNo ((SerCommitSeqNo) PG_UINT64_MAX) +#define RecoverySerCommitSeqNo ((SerCommitSeqNo) 1) +#define FirstNormalSerCommitSeqNo ((SerCommitSeqNo) 2) + +/* + * The SERIALIZABLEXACT struct contains information needed for each + * serializable database transaction to support SSI techniques. + * + * A home-grown list is maintained in shared memory to manage these. + * An entry is used when the serializable transaction acquires a snapshot. + * Unless the transaction is rolled back, this entry must generally remain + * until all concurrent transactions have completed. (There are special + * optimizations for READ ONLY transactions which often allow them to be + * cleaned up earlier.) A transaction which is rolled back is cleaned up + * as soon as possible. + * + * Eligibility for cleanup of committed transactions is generally determined + * by comparing the transaction's finishedBefore field to + * SxactGlobalXmin. + */ +typedef struct SERIALIZABLEXACT +{ + VirtualTransactionId vxid; /* The executing process always has one of + * these. */ + + /* + * We use two numbers to track the order that transactions commit. Before + * commit, a transaction is marked as prepared, and prepareSeqNo is set. + * Shortly after commit, it's marked as committed, and commitSeqNo is set. + * This doesn't give a strict commit order, but these two values together + * are good enough for us, as we can always err on the safe side and + * assume that there's a conflict, if we can't be sure of the exact + * ordering of two commits. + * + * Note that a transaction is marked as prepared for a short period during + * commit processing, even if two-phase commit is not used. But with + * two-phase commit, a transaction can stay in prepared state for some + * time. + */ + SerCommitSeqNo prepareSeqNo; + SerCommitSeqNo commitSeqNo; + + /* these values are not both interesting at the same time */ + union + { + SerCommitSeqNo earliestOutConflictCommit; /* when committed with + * conflict out */ + SerCommitSeqNo lastCommitBeforeSnapshot; /* when not committed or + * no conflict out */ + } SeqNo; + SHM_QUEUE outConflicts; /* list of write transactions whose data we + * couldn't read. */ + SHM_QUEUE inConflicts; /* list of read transactions which couldn't + * see our write. */ + SHM_QUEUE predicateLocks; /* list of associated PREDICATELOCK objects */ + SHM_QUEUE finishedLink; /* list link in + * FinishedSerializableTransactions */ + + /* + * perXactPredicateListLock is only used in parallel queries: it protects + * this SERIALIZABLEXACT's predicate lock list against other workers of + * the same session. + */ + LWLock perXactPredicateListLock; + + /* + * for r/o transactions: list of concurrent r/w transactions that we could + * potentially have conflicts with, and vice versa for r/w transactions + */ + SHM_QUEUE possibleUnsafeConflicts; + + TransactionId topXid; /* top level xid for the transaction, if one + * exists; else invalid */ + TransactionId finishedBefore; /* invalid means still running; else the + * struct expires when no serializable + * xids are before this. */ + TransactionId xmin; /* the transaction's snapshot xmin */ + uint32 flags; /* OR'd combination of values defined below */ + int pid; /* pid of associated process */ + int pgprocno; /* pgprocno of associated process */ +} SERIALIZABLEXACT; + +#define SXACT_FLAG_COMMITTED 0x00000001 /* already committed */ +#define SXACT_FLAG_PREPARED 0x00000002 /* about to commit */ +#define SXACT_FLAG_ROLLED_BACK 0x00000004 /* already rolled back */ +#define SXACT_FLAG_DOOMED 0x00000008 /* will roll back */ +/* + * The following flag actually means that the flagged transaction has a + * conflict out *to a transaction which committed ahead of it*. It's hard + * to get that into a name of a reasonable length. + */ +#define SXACT_FLAG_CONFLICT_OUT 0x00000010 +#define SXACT_FLAG_READ_ONLY 0x00000020 +#define SXACT_FLAG_DEFERRABLE_WAITING 0x00000040 +#define SXACT_FLAG_RO_SAFE 0x00000080 +#define SXACT_FLAG_RO_UNSAFE 0x00000100 +#define SXACT_FLAG_SUMMARY_CONFLICT_IN 0x00000200 +#define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400 +/* + * The following flag means the transaction has been partially released + * already, but is being preserved because parallel workers might have a + * reference to it. It'll be recycled by the leader at end-of-transaction. + */ +#define SXACT_FLAG_PARTIALLY_RELEASED 0x00000800 + +/* + * The following types are used to provide an ad hoc list for holding + * SERIALIZABLEXACT objects. An HTAB is overkill, since there is no need to + * access these by key -- there are direct pointers to these objects where + * needed. If a shared memory list is created, these types can probably be + * eliminated in favor of using the general solution. + */ +typedef struct PredXactListElementData +{ + SHM_QUEUE link; + SERIALIZABLEXACT sxact; +} PredXactListElementData; + +typedef struct PredXactListElementData *PredXactListElement; + +#define PredXactListElementDataSize \ + ((Size)MAXALIGN(sizeof(PredXactListElementData))) + +typedef struct PredXactListData +{ + SHM_QUEUE availableList; + SHM_QUEUE activeList; + + /* + * These global variables are maintained when registering and cleaning up + * serializable transactions. They must be global across all backends, + * but are not needed outside the predicate.c source file. Protected by + * SerializableXactHashLock. + */ + TransactionId SxactGlobalXmin; /* global xmin for active serializable + * transactions */ + int SxactGlobalXminCount; /* how many active serializable + * transactions have this xmin */ + int WritableSxactCount; /* how many non-read-only serializable + * transactions are active */ + SerCommitSeqNo LastSxactCommitSeqNo; /* a strictly monotonically + * increasing number for commits + * of serializable transactions */ + /* Protected by SerializableXactHashLock. */ + SerCommitSeqNo CanPartialClearThrough; /* can clear predicate locks and + * inConflicts for committed + * transactions through this seq + * no */ + /* Protected by SerializableFinishedListLock. */ + SerCommitSeqNo HavePartialClearedThrough; /* have cleared through this + * seq no */ + SERIALIZABLEXACT *OldCommittedSxact; /* shared copy of dummy sxact */ + + PredXactListElement element; +} PredXactListData; + +typedef struct PredXactListData *PredXactList; + +#define PredXactListDataSize \ + ((Size)MAXALIGN(sizeof(PredXactListData))) + + +/* + * The following types are used to provide lists of rw-conflicts between + * pairs of transactions. Since exactly the same information is needed, + * they are also used to record possible unsafe transaction relationships + * for purposes of identifying safe snapshots for read-only transactions. + * + * When a RWConflictData is not in use to record either type of relationship + * between a pair of transactions, it is kept on an "available" list. The + * outLink field is used for maintaining that list. + */ +typedef struct RWConflictData +{ + SHM_QUEUE outLink; /* link for list of conflicts out from a sxact */ + SHM_QUEUE inLink; /* link for list of conflicts in to a sxact */ + SERIALIZABLEXACT *sxactOut; + SERIALIZABLEXACT *sxactIn; +} RWConflictData; + +typedef struct RWConflictData *RWConflict; + +#define RWConflictDataSize \ + ((Size)MAXALIGN(sizeof(RWConflictData))) + +typedef struct RWConflictPoolHeaderData +{ + SHM_QUEUE availableList; + RWConflict element; +} RWConflictPoolHeaderData; + +typedef struct RWConflictPoolHeaderData *RWConflictPoolHeader; + +#define RWConflictPoolHeaderDataSize \ + ((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData))) + + +/* + * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable + * transaction or any of its subtransactions. + */ +typedef struct SERIALIZABLEXIDTAG +{ + TransactionId xid; +} SERIALIZABLEXIDTAG; + +/* + * The SERIALIZABLEXID struct provides a link from a TransactionId for a + * serializable transaction to the related SERIALIZABLEXACT record, even if + * the transaction has completed and its connection has been closed. + * + * These are created as new top level transaction IDs are first assigned to + * transactions which are participating in predicate locking. This may + * never happen for a particular transaction if it doesn't write anything. + * They are removed with their related serializable transaction objects. + * + * The SubTransGetTopmostTransaction method is used where necessary to get + * from an XID which might be from a subtransaction to the top level XID. + */ +typedef struct SERIALIZABLEXID +{ + /* hash key */ + SERIALIZABLEXIDTAG tag; + + /* data */ + SERIALIZABLEXACT *myXact; /* pointer to the top level transaction data */ +} SERIALIZABLEXID; + + +/* + * The PREDICATELOCKTARGETTAG struct identifies a database object which can + * be the target of predicate locks. + * + * Note that the hash function being used doesn't properly respect tag + * length -- if the length of the structure isn't a multiple of four bytes it + * will go to a four byte boundary past the end of the tag. If you change + * this struct, make sure any slack space is initialized, so that any random + * bytes in the middle or at the end are not included in the hash. + * + * TODO SSI: If we always use the same fields for the same type of value, we + * should rename these. Holding off until it's clear there are no exceptions. + * Since indexes are relations with blocks and tuples, it's looking likely that + * the rename will be possible. If not, we may need to divide the last field + * and use part of it for a target type, so that we know how to interpret the + * data.. + */ +typedef struct PREDICATELOCKTARGETTAG +{ + uint32 locktag_field1; /* a 32-bit ID field */ + uint32 locktag_field2; /* a 32-bit ID field */ + uint32 locktag_field3; /* a 32-bit ID field */ + uint32 locktag_field4; /* a 32-bit ID field */ +} PREDICATELOCKTARGETTAG; + +/* + * The PREDICATELOCKTARGET struct represents a database object on which there + * are predicate locks. + * + * A hash list of these objects is maintained in shared memory. An entry is + * added when a predicate lock is requested on an object which doesn't + * already have one. An entry is removed when the last lock is removed from + * its list. + */ +typedef struct PREDICATELOCKTARGET +{ + /* hash key */ + PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */ + + /* data */ + SHM_QUEUE predicateLocks; /* list of PREDICATELOCK objects assoc. with + * predicate lock target */ +} PREDICATELOCKTARGET; + + +/* + * The PREDICATELOCKTAG struct identifies an individual predicate lock. + * + * It is the combination of predicate lock target (which is a lockable + * object) and a serializable transaction which has acquired a lock on that + * target. + */ +typedef struct PREDICATELOCKTAG +{ + PREDICATELOCKTARGET *myTarget; + SERIALIZABLEXACT *myXact; +} PREDICATELOCKTAG; + +/* + * The PREDICATELOCK struct represents an individual lock. + * + * An entry can be created here when the related database object is read, or + * by promotion of multiple finer-grained targets. All entries related to a + * serializable transaction are removed when that serializable transaction is + * cleaned up. Entries can also be removed when they are combined into a + * single coarser-grained lock entry. + */ +typedef struct PREDICATELOCK +{ + /* hash key */ + PREDICATELOCKTAG tag; /* unique identifier of lock */ + + /* data */ + SHM_QUEUE targetLink; /* list link in PREDICATELOCKTARGET's list of + * predicate locks */ + SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of + * predicate locks */ + SerCommitSeqNo commitSeqNo; /* only used for summarized predicate locks */ +} PREDICATELOCK; + + +/* + * The LOCALPREDICATELOCK struct represents a local copy of data which is + * also present in the PREDICATELOCK table, organized for fast access without + * needing to acquire a LWLock. It is strictly for optimization. + * + * Each serializable transaction creates its own local hash table to hold a + * collection of these. This information is used to determine when a number + * of fine-grained locks should be promoted to a single coarser-grained lock. + * The information is maintained more-or-less in parallel to the + * PREDICATELOCK data, but because this data is not protected by locks and is + * only used in an optimization heuristic, it is allowed to drift in a few + * corner cases where maintaining exact data would be expensive. + * + * The hash table is created when the serializable transaction acquires its + * snapshot, and its memory is released upon completion of the transaction. + */ +typedef struct LOCALPREDICATELOCK +{ + /* hash key */ + PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */ + + /* data */ + bool held; /* is lock held, or just its children? */ + int childLocks; /* number of child locks currently held */ +} LOCALPREDICATELOCK; + + +/* + * The types of predicate locks which can be acquired. + */ +typedef enum PredicateLockTargetType +{ + PREDLOCKTAG_RELATION, + PREDLOCKTAG_PAGE, + PREDLOCKTAG_TUPLE + /* TODO SSI: Other types may be needed for index locking */ +} PredicateLockTargetType; + + +/* + * This structure is used to quickly capture a copy of all predicate + * locks. This is currently used only by the pg_lock_status function, + * which in turn is used by the pg_locks view. + */ +typedef struct PredicateLockData +{ + int nelements; + PREDICATELOCKTARGETTAG *locktags; + SERIALIZABLEXACT *xacts; +} PredicateLockData; + + +/* + * These macros define how we map logical IDs of lockable objects into the + * physical fields of PREDICATELOCKTARGETTAG. Use these to set up values, + * rather than accessing the fields directly. Note multiple eval of target! + */ +#define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = InvalidBlockNumber, \ + (locktag).locktag_field4 = InvalidOffsetNumber) + +#define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = (blocknum), \ + (locktag).locktag_field4 = InvalidOffsetNumber) + +#define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = (blocknum), \ + (locktag).locktag_field4 = (offnum)) + +#define GET_PREDICATELOCKTARGETTAG_DB(locktag) \ + ((Oid) (locktag).locktag_field1) +#define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \ + ((Oid) (locktag).locktag_field2) +#define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \ + ((BlockNumber) (locktag).locktag_field3) +#define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \ + ((OffsetNumber) (locktag).locktag_field4) +#define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \ + (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \ + (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE : \ + PREDLOCKTAG_RELATION)) + +/* + * Two-phase commit statefile records. There are two types: for each + * transaction, we generate one per-transaction record and a variable + * number of per-predicate-lock records. + */ +typedef enum TwoPhasePredicateRecordType +{ + TWOPHASEPREDICATERECORD_XACT, + TWOPHASEPREDICATERECORD_LOCK +} TwoPhasePredicateRecordType; + +/* + * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not + * much is needed because most of it not meaningful for a recovered + * prepared transaction. + * + * In particular, we do not record the in and out conflict lists for a + * prepared transaction because the associated SERIALIZABLEXACTs will + * not be available after recovery. Instead, we simply record the + * existence of each type of conflict by setting the transaction's + * summary conflict in/out flag. + */ +typedef struct TwoPhasePredicateXactRecord +{ + TransactionId xmin; + uint32 flags; +} TwoPhasePredicateXactRecord; + +/* Per-lock state */ +typedef struct TwoPhasePredicateLockRecord +{ + PREDICATELOCKTARGETTAG target; + uint32 filler; /* to avoid length change in back-patched fix */ +} TwoPhasePredicateLockRecord; + +typedef struct TwoPhasePredicateRecord +{ + TwoPhasePredicateRecordType type; + union + { + TwoPhasePredicateXactRecord xactRecord; + TwoPhasePredicateLockRecord lockRecord; + } data; +} TwoPhasePredicateRecord; + +/* + * Define a macro to use for an "empty" SERIALIZABLEXACT reference. + */ +#define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL) + + +/* + * Function definitions for functions needing awareness of predicate + * locking internals. + */ +extern PredicateLockData *GetPredicateLockStatusData(void); +extern int GetSafeSnapshotBlockingPids(int blocked_pid, + int *output, int output_size); + +#endif /* PREDICATE_INTERNALS_H */ diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h new file mode 100644 index 0000000..2579e61 --- /dev/null +++ b/src/include/storage/proc.h @@ -0,0 +1,461 @@ +/*------------------------------------------------------------------------- + * + * proc.h + * per-process shared memory data structures + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/proc.h + * + *------------------------------------------------------------------------- + */ +#ifndef _PROC_H_ +#define _PROC_H_ + +#include "access/clog.h" +#include "access/xlogdefs.h" +#include "lib/ilist.h" +#include "storage/latch.h" +#include "storage/lock.h" +#include "storage/pg_sema.h" +#include "storage/proclist_types.h" + +/* + * Each backend advertises up to PGPROC_MAX_CACHED_SUBXIDS TransactionIds + * for non-aborted subtransactions of its current top transaction. These + * have to be treated as running XIDs by other backends. + * + * We also keep track of whether the cache overflowed (ie, the transaction has + * generated at least one subtransaction that didn't fit in the cache). + * If none of the caches have overflowed, we can assume that an XID that's not + * listed anywhere in the PGPROC array is not a running transaction. Else we + * have to look at pg_subtrans. + */ +#define PGPROC_MAX_CACHED_SUBXIDS 64 /* XXX guessed-at value */ + +typedef struct XidCacheStatus +{ + /* number of cached subxids, never more than PGPROC_MAX_CACHED_SUBXIDS */ + uint8 count; + /* has PGPROC->subxids overflowed */ + bool overflowed; +} XidCacheStatus; + +struct XidCache +{ + TransactionId xids[PGPROC_MAX_CACHED_SUBXIDS]; +}; + +/* + * Flags for PGPROC->statusFlags and PROC_HDR->statusFlags[] + */ +#define PROC_IS_AUTOVACUUM 0x01 /* is it an autovac worker? */ +#define PROC_IN_VACUUM 0x02 /* currently running lazy vacuum */ +#define PROC_IN_SAFE_IC 0x04 /* currently running CREATE INDEX + * CONCURRENTLY or REINDEX + * CONCURRENTLY on non-expressional, + * non-partial index */ +#define PROC_VACUUM_FOR_WRAPAROUND 0x08 /* set by autovac only */ +#define PROC_IN_LOGICAL_DECODING 0x10 /* currently doing logical + * decoding outside xact */ +#define PROC_AFFECTS_ALL_HORIZONS 0x20 /* this proc's xmin must be + * included in vacuum horizons + * in all databases */ + +/* flags reset at EOXact */ +#define PROC_VACUUM_STATE_MASK \ + (PROC_IN_VACUUM | PROC_IN_SAFE_IC | PROC_VACUUM_FOR_WRAPAROUND) + +/* + * Xmin-related flags. Make sure any flags that affect how the process' Xmin + * value is interpreted by VACUUM are included here. + */ +#define PROC_XMIN_FLAGS (PROC_IN_VACUUM | PROC_IN_SAFE_IC) + +/* + * We allow a small number of "weak" relation locks (AccessShareLock, + * RowShareLock, RowExclusiveLock) to be recorded in the PGPROC structure + * rather than the main lock table. This eases contention on the lock + * manager LWLocks. See storage/lmgr/README for additional details. + */ +#define FP_LOCK_SLOTS_PER_BACKEND 16 + +/* + * An invalid pgprocno. Must be larger than the maximum number of PGPROC + * structures we could possibly have. See comments for MAX_BACKENDS. + */ +#define INVALID_PGPROCNO PG_INT32_MAX + +/* + * Flags for PGPROC.delayChkpt + * + * These flags can be used to delay the start or completion of a checkpoint + * for short periods. A flag is in effect if the corresponding bit is set in + * the PGPROC of any backend. + * + * For our purposes here, a checkpoint has three phases: (1) determine the + * location to which the redo pointer will be moved, (2) write all the + * data durably to disk, and (3) WAL-log the checkpoint. + * + * Setting DELAY_CHKPT_START prevents the system from moving from phase 1 + * to phase 2. This is useful when we are performing a WAL-logged modification + * of data that will be flushed to disk in phase 2. By setting this flag + * before writing WAL and clearing it after we've both written WAL and + * performed the corresponding modification, we ensure that if the WAL record + * is inserted prior to the new redo point, the corresponding data changes will + * also be flushed to disk before the checkpoint can complete. (In the + * extremely common case where the data being modified is in shared buffers + * and we acquire an exclusive content lock on the relevant buffers before + * writing WAL, this mechanism is not needed, because phase 2 will block + * until we release the content lock and then flush the modified data to + * disk.) + * + * Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2 + * to phase 3. This is useful if we are performing a WAL-logged operation that + * might invalidate buffers, such as relation truncation. In this case, we need + * to ensure that any buffers which were invalidated and thus not flushed by + * the checkpoint are actaully destroyed on disk. Replay can cope with a file + * or block that doesn't exist, but not with a block that has the wrong + * contents. + */ +#define DELAY_CHKPT_START (1<<0) +#define DELAY_CHKPT_COMPLETE (1<<1) + +typedef enum +{ + PROC_WAIT_STATUS_OK, + PROC_WAIT_STATUS_WAITING, + PROC_WAIT_STATUS_ERROR, +} ProcWaitStatus; + +/* + * Each backend has a PGPROC struct in shared memory. There is also a list of + * currently-unused PGPROC structs that will be reallocated to new backends. + * + * links: list link for any list the PGPROC is in. When waiting for a lock, + * the PGPROC is linked into that lock's waitProcs queue. A recycled PGPROC + * is linked into ProcGlobal's freeProcs list. + * + * Note: twophase.c also sets up a dummy PGPROC struct for each currently + * prepared transaction. These PGPROCs appear in the ProcArray data structure + * so that the prepared transactions appear to be still running and are + * correctly shown as holding locks. A prepared transaction PGPROC can be + * distinguished from a real one at need by the fact that it has pid == 0. + * The semaphore and lock-activity fields in a prepared-xact PGPROC are unused, + * but its myProcLocks[] lists are valid. + * + * We allow many fields of this struct to be accessed without locks, such as + * delayChkpt and isBackgroundWorker. However, keep in mind that writing + * mirrored ones (see below) requires holding ProcArrayLock or XidGenLock in + * at least shared mode, so that pgxactoff does not change concurrently. + * + * Mirrored fields: + * + * Some fields in PGPROC (see "mirrored in ..." comment) are mirrored into an + * element of more densely packed ProcGlobal arrays. These arrays are indexed + * by PGPROC->pgxactoff. Both copies need to be maintained coherently. + * + * NB: The pgxactoff indexed value can *never* be accessed without holding + * locks. + * + * See PROC_HDR for details. + */ +struct PGPROC +{ + /* proc->links MUST BE FIRST IN STRUCT (see ProcSleep,ProcWakeup,etc) */ + SHM_QUEUE links; /* list link if process is in a list */ + PGPROC **procgloballist; /* procglobal list that owns this PGPROC */ + + PGSemaphore sem; /* ONE semaphore to sleep on */ + ProcWaitStatus waitStatus; + + Latch procLatch; /* generic latch for process */ + + + TransactionId xid; /* id of top-level transaction currently being + * executed by this proc, if running and XID + * is assigned; else InvalidTransactionId. + * mirrored in ProcGlobal->xids[pgxactoff] */ + + TransactionId xmin; /* minimal running XID as it was when we were + * starting our xact, excluding LAZY VACUUM: + * vacuum must not remove tuples deleted by + * xid >= xmin ! */ + + LocalTransactionId lxid; /* local id of top-level transaction currently + * being executed by this proc, if running; + * else InvalidLocalTransactionId */ + int pid; /* Backend's process ID; 0 if prepared xact */ + + int pgxactoff; /* offset into various ProcGlobal->arrays with + * data mirrored from this PGPROC */ + int pgprocno; + + /* These fields are zero while a backend is still starting up: */ + BackendId backendId; /* This backend's backend ID (if assigned) */ + Oid databaseId; /* OID of database this backend is using */ + Oid roleId; /* OID of role using this backend */ + + Oid tempNamespaceId; /* OID of temp schema this backend is + * using */ + + bool isBackgroundWorker; /* true if background worker. */ + + /* + * While in hot standby mode, shows that a conflict signal has been sent + * for the current transaction. Set/cleared while holding ProcArrayLock, + * though not required. Accessed without lock, if needed. + */ + bool recoveryConflictPending; + + /* Info about LWLock the process is currently waiting for, if any. */ + bool lwWaiting; /* true if waiting for an LW lock */ + uint8 lwWaitMode; /* lwlock mode being waited for */ + proclist_node lwWaitLink; /* position in LW lock wait list */ + + /* Support for condition variables. */ + proclist_node cvWaitLink; /* position in CV wait list */ + + /* Info about lock the process is currently waiting for, if any. */ + /* waitLock and waitProcLock are NULL if not currently waiting. */ + LOCK *waitLock; /* Lock object we're sleeping on ... */ + PROCLOCK *waitProcLock; /* Per-holder info for awaited lock */ + LOCKMODE waitLockMode; /* type of lock we're waiting for */ + LOCKMASK heldLocks; /* bitmask for lock types already held on this + * lock object by this backend */ + pg_atomic_uint64 waitStart; /* time at which wait for lock acquisition + * started */ + + int delayChkptFlags; /* for DELAY_CHKPT_* flags */ + + uint8 statusFlags; /* this backend's status flags, see PROC_* + * above. mirrored in + * ProcGlobal->statusFlags[pgxactoff] */ + + /* + * Info to allow us to wait for synchronous replication, if needed. + * waitLSN is InvalidXLogRecPtr if not waiting; set only by user backend. + * syncRepState must not be touched except by owning process or WALSender. + * syncRepLinks used only while holding SyncRepLock. + */ + XLogRecPtr waitLSN; /* waiting for this LSN or higher */ + int syncRepState; /* wait state for sync rep */ + SHM_QUEUE syncRepLinks; /* list link if process is in syncrep queue */ + + /* + * All PROCLOCK objects for locks held or awaited by this backend are + * linked into one of these lists, according to the partition number of + * their lock. + */ + SHM_QUEUE myProcLocks[NUM_LOCK_PARTITIONS]; + + XidCacheStatus subxidStatus; /* mirrored with + * ProcGlobal->subxidStates[i] */ + struct XidCache subxids; /* cache for subtransaction XIDs */ + + /* Support for group XID clearing. */ + /* true, if member of ProcArray group waiting for XID clear */ + bool procArrayGroupMember; + /* next ProcArray group member waiting for XID clear */ + pg_atomic_uint32 procArrayGroupNext; + + /* + * latest transaction id among the transaction's main XID and + * subtransactions + */ + TransactionId procArrayGroupMemberXid; + + uint32 wait_event_info; /* proc's wait information */ + + /* Support for group transaction status update. */ + bool clogGroupMember; /* true, if member of clog group */ + pg_atomic_uint32 clogGroupNext; /* next clog group member */ + TransactionId clogGroupMemberXid; /* transaction id of clog group member */ + XidStatus clogGroupMemberXidStatus; /* transaction status of clog + * group member */ + int clogGroupMemberPage; /* clog page corresponding to + * transaction id of clog group member */ + XLogRecPtr clogGroupMemberLsn; /* WAL location of commit record for clog + * group member */ + + /* Lock manager data, recording fast-path locks taken by this backend. */ + LWLock fpInfoLock; /* protects per-backend fast-path state */ + uint64 fpLockBits; /* lock modes held for each fast-path slot */ + Oid fpRelId[FP_LOCK_SLOTS_PER_BACKEND]; /* slots for rel oids */ + bool fpVXIDLock; /* are we holding a fast-path VXID lock? */ + LocalTransactionId fpLocalTransactionId; /* lxid for fast-path VXID + * lock */ + + /* + * Support for lock groups. Use LockHashPartitionLockByProc on the group + * leader to get the LWLock protecting these fields. + */ + PGPROC *lockGroupLeader; /* lock group leader, if I'm a member */ + dlist_head lockGroupMembers; /* list of members, if I'm a leader */ + dlist_node lockGroupLink; /* my member link, if I'm a member */ +}; + +/* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */ + + +extern PGDLLIMPORT PGPROC *MyProc; + +/* + * There is one ProcGlobal struct for the whole database cluster. + * + * Adding/Removing an entry into the procarray requires holding *both* + * ProcArrayLock and XidGenLock in exclusive mode (in that order). Both are + * needed because the dense arrays (see below) are accessed from + * GetNewTransactionId() and GetSnapshotData(), and we don't want to add + * further contention by both using the same lock. Adding/Removing a procarray + * entry is much less frequent. + * + * Some fields in PGPROC are mirrored into more densely packed arrays (e.g. + * xids), with one entry for each backend. These arrays only contain entries + * for PGPROCs that have been added to the shared array with ProcArrayAdd() + * (in contrast to PGPROC array which has unused PGPROCs interspersed). + * + * The dense arrays are indexed by PGPROC->pgxactoff. Any concurrent + * ProcArrayAdd() / ProcArrayRemove() can lead to pgxactoff of a procarray + * member to change. Therefore it is only safe to use PGPROC->pgxactoff to + * access the dense array while holding either ProcArrayLock or XidGenLock. + * + * As long as a PGPROC is in the procarray, the mirrored values need to be + * maintained in both places in a coherent manner. + * + * The denser separate arrays are beneficial for three main reasons: First, to + * allow for as tight loops accessing the data as possible. Second, to prevent + * updates of frequently changing data (e.g. xmin) from invalidating + * cachelines also containing less frequently changing data (e.g. xid, + * statusFlags). Third to condense frequently accessed data into as few + * cachelines as possible. + * + * There are two main reasons to have the data mirrored between these dense + * arrays and PGPROC. First, as explained above, a PGPROC's array entries can + * only be accessed with either ProcArrayLock or XidGenLock held, whereas the + * PGPROC entries do not require that (obviously there may still be locking + * requirements around the individual field, separate from the concerns + * here). That is particularly important for a backend to efficiently checks + * it own values, which it often can safely do without locking. Second, the + * PGPROC fields allow to avoid unnecessary accesses and modification to the + * dense arrays. A backend's own PGPROC is more likely to be in a local cache, + * whereas the cachelines for the dense array will be modified by other + * backends (often removing it from the cache for other cores/sockets). At + * commit/abort time a check of the PGPROC value can avoid accessing/dirtying + * the corresponding array value. + * + * Basically it makes sense to access the PGPROC variable when checking a + * single backend's data, especially when already looking at the PGPROC for + * other reasons already. It makes sense to look at the "dense" arrays if we + * need to look at many / most entries, because we then benefit from the + * reduced indirection and better cross-process cache-ability. + * + * When entering a PGPROC for 2PC transactions with ProcArrayAdd(), the data + * in the dense arrays is initialized from the PGPROC while it already holds + * ProcArrayLock. + */ +typedef struct PROC_HDR +{ + /* Array of PGPROC structures (not including dummies for prepared txns) */ + PGPROC *allProcs; + + /* Array mirroring PGPROC.xid for each PGPROC currently in the procarray */ + TransactionId *xids; + + /* + * Array mirroring PGPROC.subxidStatus for each PGPROC currently in the + * procarray. + */ + XidCacheStatus *subxidStates; + + /* + * Array mirroring PGPROC.statusFlags for each PGPROC currently in the + * procarray. + */ + uint8 *statusFlags; + + /* Length of allProcs array */ + uint32 allProcCount; + /* Head of list of free PGPROC structures */ + PGPROC *freeProcs; + /* Head of list of autovacuum's free PGPROC structures */ + PGPROC *autovacFreeProcs; + /* Head of list of bgworker free PGPROC structures */ + PGPROC *bgworkerFreeProcs; + /* Head of list of walsender free PGPROC structures */ + PGPROC *walsenderFreeProcs; + /* First pgproc waiting for group XID clear */ + pg_atomic_uint32 procArrayGroupFirst; + /* First pgproc waiting for group transaction status update */ + pg_atomic_uint32 clogGroupFirst; + /* WALWriter process's latch */ + Latch *walwriterLatch; + /* Checkpointer process's latch */ + Latch *checkpointerLatch; + /* Current shared estimate of appropriate spins_per_delay value */ + int spins_per_delay; + /* Buffer id of the buffer that Startup process waits for pin on, or -1 */ + int startupBufferPinWaitBufId; +} PROC_HDR; + +extern PGDLLIMPORT PROC_HDR *ProcGlobal; + +extern PGDLLIMPORT PGPROC *PreparedXactProcs; + +/* Accessor for PGPROC given a pgprocno. */ +#define GetPGProcByNumber(n) (&ProcGlobal->allProcs[(n)]) + +/* + * We set aside some extra PGPROC structures for auxiliary processes, + * ie things that aren't full-fledged backends but need shmem access. + * + * Background writer, checkpointer, WAL writer and archiver run during normal + * operation. Startup process and WAL receiver also consume 2 slots, but WAL + * writer is launched only after startup has exited, so we only need 5 slots. + */ +#define NUM_AUXILIARY_PROCS 5 + +/* configurable options */ +extern PGDLLIMPORT int DeadlockTimeout; +extern PGDLLIMPORT int StatementTimeout; +extern PGDLLIMPORT int LockTimeout; +extern PGDLLIMPORT int IdleInTransactionSessionTimeout; +extern PGDLLIMPORT int IdleSessionTimeout; +extern PGDLLIMPORT bool log_lock_waits; + + +/* + * Function Prototypes + */ +extern int ProcGlobalSemas(void); +extern Size ProcGlobalShmemSize(void); +extern void InitProcGlobal(void); +extern void InitProcess(void); +extern void InitProcessPhase2(void); +extern void InitAuxiliaryProcess(void); + +extern void SetStartupBufferPinWaitBufId(int bufid); +extern int GetStartupBufferPinWaitBufId(void); + +extern bool HaveNFreeProcs(int n); +extern void ProcReleaseLocks(bool isCommit); + +extern void ProcQueueInit(PROC_QUEUE *queue); +extern ProcWaitStatus ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable); +extern PGPROC *ProcWakeup(PGPROC *proc, ProcWaitStatus waitStatus); +extern void ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock); +extern void CheckDeadLockAlert(void); +extern bool IsWaitingForLock(void); +extern void LockErrorCleanup(void); + +extern void ProcWaitForSignal(uint32 wait_event_info); +extern void ProcSendSignal(int pgprocno); + +extern PGPROC *AuxiliaryPidGetProc(int pid); + +extern void BecomeLockGroupLeader(void); +extern bool BecomeLockGroupMember(PGPROC *leader, int pid); + +#endif /* _PROC_H_ */ diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h new file mode 100644 index 0000000..781e3f6 --- /dev/null +++ b/src/include/storage/procarray.h @@ -0,0 +1,99 @@ +/*------------------------------------------------------------------------- + * + * procarray.h + * POSTGRES process array definitions. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/procarray.h + * + *------------------------------------------------------------------------- + */ +#ifndef PROCARRAY_H +#define PROCARRAY_H + +#include "storage/lock.h" +#include "storage/standby.h" +#include "utils/relcache.h" +#include "utils/snapshot.h" + + +extern Size ProcArrayShmemSize(void); +extern void CreateSharedProcArray(void); +extern void ProcArrayAdd(PGPROC *proc); +extern void ProcArrayRemove(PGPROC *proc, TransactionId latestXid); + +extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid); +extern void ProcArrayClearTransaction(PGPROC *proc); + +extern void ProcArrayInitRecovery(TransactionId initializedUptoXID); +extern void ProcArrayApplyRecoveryInfo(RunningTransactions running); +extern void ProcArrayApplyXidAssignment(TransactionId topxid, + int nsubxids, TransactionId *subxids); + +extern void RecordKnownAssignedTransactionIds(TransactionId xid); +extern void ExpireTreeKnownAssignedTransactionIds(TransactionId xid, + int nsubxids, TransactionId *subxids, + TransactionId max_xid); +extern void ExpireAllKnownAssignedTransactionIds(void); +extern void ExpireOldKnownAssignedTransactionIds(TransactionId xid); +extern void KnownAssignedTransactionIdsIdleMaintenance(void); + +extern int GetMaxSnapshotXidCount(void); +extern int GetMaxSnapshotSubxidCount(void); + +extern Snapshot GetSnapshotData(Snapshot snapshot); + +extern bool ProcArrayInstallImportedXmin(TransactionId xmin, + VirtualTransactionId *sourcevxid); +extern bool ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc); + +extern RunningTransactions GetRunningTransactionData(void); + +extern bool TransactionIdIsInProgress(TransactionId xid); +extern bool TransactionIdIsActive(TransactionId xid); +extern TransactionId GetOldestNonRemovableTransactionId(Relation rel); +extern TransactionId GetOldestTransactionIdConsideredRunning(void); +extern TransactionId GetOldestActiveTransactionId(void); +extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly); +extern void GetReplicationHorizons(TransactionId *slot_xmin, TransactionId *catalog_xmin); + +extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids, int type); +extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, + int nvxids, int type); + +extern PGPROC *BackendPidGetProc(int pid); +extern PGPROC *BackendPidGetProcWithLock(int pid); +extern int BackendXidGetPid(TransactionId xid); +extern bool IsBackendPid(int pid); + +extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin, + bool excludeXmin0, bool allDbs, int excludeVacuum, + int *nvxids); +extern VirtualTransactionId *GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid); +extern pid_t CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode); +extern pid_t SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode, + bool conflictPending); + +extern bool MinimumActiveBackends(int min); +extern int CountDBBackends(Oid databaseid); +extern int CountDBConnections(Oid databaseid); +extern void CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending); +extern int CountUserBackends(Oid roleid); +extern bool CountOtherDBBackends(Oid databaseId, + int *nbackends, int *nprepared); +extern void TerminateOtherDBBackends(Oid databaseId); + +extern void XidCacheRemoveRunningXids(TransactionId xid, + int nxids, const TransactionId *xids, + TransactionId latestXid); + +extern void ProcArraySetReplicationSlotXmin(TransactionId xmin, + TransactionId catalog_xmin, bool already_locked); + +extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin, + TransactionId *catalog_xmin); + +#endif /* PROCARRAY_H */ diff --git a/src/include/storage/proclist.h b/src/include/storage/proclist.h new file mode 100644 index 0000000..509e341 --- /dev/null +++ b/src/include/storage/proclist.h @@ -0,0 +1,219 @@ +/*------------------------------------------------------------------------- + * + * proclist.h + * operations on doubly-linked lists of pgprocnos + * + * The interface is similar to dlist from ilist.h, but uses pgprocno instead + * of pointers. This allows proclist_head to be mapped at different addresses + * in different backends. + * + * See proclist_types.h for the structs that these functions operate on. They + * are separated to break a header dependency cycle with proc.h. + * + * Portions Copyright (c) 2016-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/include/storage/proclist.h + *------------------------------------------------------------------------- + */ +#ifndef PROCLIST_H +#define PROCLIST_H + +#include "storage/proc.h" +#include "storage/proclist_types.h" + +/* + * Initialize a proclist. + */ +static inline void +proclist_init(proclist_head *list) +{ + list->head = list->tail = INVALID_PGPROCNO; +} + +/* + * Is the list empty? + */ +static inline bool +proclist_is_empty(proclist_head *list) +{ + return list->head == INVALID_PGPROCNO; +} + +/* + * Get a pointer to a proclist_node inside a given PGPROC, given a procno and + * the proclist_node field's offset within struct PGPROC. + */ +static inline proclist_node * +proclist_node_get(int procno, size_t node_offset) +{ + char *entry = (char *) GetPGProcByNumber(procno); + + return (proclist_node *) (entry + node_offset); +} + +/* + * Insert a process at the beginning of a list. + */ +static inline void +proclist_push_head_offset(proclist_head *list, int procno, size_t node_offset) +{ + proclist_node *node = proclist_node_get(procno, node_offset); + + Assert(node->next == 0 && node->prev == 0); + + if (list->head == INVALID_PGPROCNO) + { + Assert(list->tail == INVALID_PGPROCNO); + node->next = node->prev = INVALID_PGPROCNO; + list->head = list->tail = procno; + } + else + { + Assert(list->tail != INVALID_PGPROCNO); + Assert(list->head != procno); + Assert(list->tail != procno); + node->next = list->head; + proclist_node_get(node->next, node_offset)->prev = procno; + node->prev = INVALID_PGPROCNO; + list->head = procno; + } +} + +/* + * Insert a process at the end of a list. + */ +static inline void +proclist_push_tail_offset(proclist_head *list, int procno, size_t node_offset) +{ + proclist_node *node = proclist_node_get(procno, node_offset); + + Assert(node->next == 0 && node->prev == 0); + + if (list->tail == INVALID_PGPROCNO) + { + Assert(list->head == INVALID_PGPROCNO); + node->next = node->prev = INVALID_PGPROCNO; + list->head = list->tail = procno; + } + else + { + Assert(list->head != INVALID_PGPROCNO); + Assert(list->head != procno); + Assert(list->tail != procno); + node->prev = list->tail; + proclist_node_get(node->prev, node_offset)->next = procno; + node->next = INVALID_PGPROCNO; + list->tail = procno; + } +} + +/* + * Delete a process from a list --- it must be in the list! + */ +static inline void +proclist_delete_offset(proclist_head *list, int procno, size_t node_offset) +{ + proclist_node *node = proclist_node_get(procno, node_offset); + + Assert(node->next != 0 || node->prev != 0); + + if (node->prev == INVALID_PGPROCNO) + { + Assert(list->head == procno); + list->head = node->next; + } + else + proclist_node_get(node->prev, node_offset)->next = node->next; + + if (node->next == INVALID_PGPROCNO) + { + Assert(list->tail == procno); + list->tail = node->prev; + } + else + proclist_node_get(node->next, node_offset)->prev = node->prev; + + node->next = node->prev = 0; +} + +/* + * Check if a process is currently in a list. It must be known that the + * process is not in any _other_ proclist that uses the same proclist_node, + * so that the only possibilities are that it is in this list or none. + */ +static inline bool +proclist_contains_offset(proclist_head *list, int procno, + size_t node_offset) +{ + proclist_node *node = proclist_node_get(procno, node_offset); + + /* If it's not in any list, it's definitely not in this one. */ + if (node->prev == 0 && node->next == 0) + return false; + + /* + * It must, in fact, be in this list. Ideally, in assert-enabled builds, + * we'd verify that. But since this function is typically used while + * holding a spinlock, crawling the whole list is unacceptable. However, + * we can verify matters in O(1) time when the node is a list head or + * tail, and that seems worth doing, since in practice that should often + * be enough to catch mistakes. + */ + Assert(node->prev != INVALID_PGPROCNO || list->head == procno); + Assert(node->next != INVALID_PGPROCNO || list->tail == procno); + + return true; +} + +/* + * Remove and return the first process from a list (there must be one). + */ +static inline PGPROC * +proclist_pop_head_node_offset(proclist_head *list, size_t node_offset) +{ + PGPROC *proc; + + Assert(!proclist_is_empty(list)); + proc = GetPGProcByNumber(list->head); + proclist_delete_offset(list, list->head, node_offset); + return proc; +} + +/* + * Helper macros to avoid repetition of offsetof(PGPROC, <member>). + * 'link_member' is the name of a proclist_node member in PGPROC. + */ +#define proclist_delete(list, procno, link_member) \ + proclist_delete_offset((list), (procno), offsetof(PGPROC, link_member)) +#define proclist_push_head(list, procno, link_member) \ + proclist_push_head_offset((list), (procno), offsetof(PGPROC, link_member)) +#define proclist_push_tail(list, procno, link_member) \ + proclist_push_tail_offset((list), (procno), offsetof(PGPROC, link_member)) +#define proclist_pop_head_node(list, link_member) \ + proclist_pop_head_node_offset((list), offsetof(PGPROC, link_member)) +#define proclist_contains(list, procno, link_member) \ + proclist_contains_offset((list), (procno), offsetof(PGPROC, link_member)) + +/* + * Iterate through the list pointed at by 'lhead', storing the current + * position in 'iter'. 'link_member' is the name of a proclist_node member in + * PGPROC. Access the current position with iter.cur. + * + * The only list modification allowed while iterating is deleting the current + * node with proclist_delete(list, iter.cur, node_offset). + */ +#define proclist_foreach_modify(iter, lhead, link_member) \ + for (AssertVariableIsOfTypeMacro(iter, proclist_mutable_iter), \ + AssertVariableIsOfTypeMacro(lhead, proclist_head *), \ + (iter).cur = (lhead)->head, \ + (iter).next = (iter).cur == INVALID_PGPROCNO ? INVALID_PGPROCNO : \ + proclist_node_get((iter).cur, \ + offsetof(PGPROC, link_member))->next; \ + (iter).cur != INVALID_PGPROCNO; \ + (iter).cur = (iter).next, \ + (iter).next = (iter).cur == INVALID_PGPROCNO ? INVALID_PGPROCNO : \ + proclist_node_get((iter).cur, \ + offsetof(PGPROC, link_member))->next) + +#endif /* PROCLIST_H */ diff --git a/src/include/storage/proclist_types.h b/src/include/storage/proclist_types.h new file mode 100644 index 0000000..5232679 --- /dev/null +++ b/src/include/storage/proclist_types.h @@ -0,0 +1,51 @@ +/*------------------------------------------------------------------------- + * + * proclist_types.h + * doubly-linked lists of pgprocnos + * + * See proclist.h for functions that operate on these types. + * + * Portions Copyright (c) 2016-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/include/storage/proclist_types.h + *------------------------------------------------------------------------- + */ + +#ifndef PROCLIST_TYPES_H +#define PROCLIST_TYPES_H + +/* + * A node in a doubly-linked list of processes. The link fields contain + * the 0-based PGPROC indexes of the next and previous process, or + * INVALID_PGPROCNO in the next-link of the last node and the prev-link + * of the first node. A node that is currently not in any list + * should have next == prev == 0; this is not a possible state for a node + * that is in a list, because we disallow circularity. + */ +typedef struct proclist_node +{ + int next; /* pgprocno of the next PGPROC */ + int prev; /* pgprocno of the prev PGPROC */ +} proclist_node; + +/* + * Header of a doubly-linked list of PGPROCs, identified by pgprocno. + * An empty list is represented by head == tail == INVALID_PGPROCNO. + */ +typedef struct proclist_head +{ + int head; /* pgprocno of the head PGPROC */ + int tail; /* pgprocno of the tail PGPROC */ +} proclist_head; + +/* + * List iterator allowing some modifications while iterating. + */ +typedef struct proclist_mutable_iter +{ + int cur; /* pgprocno of the current PGPROC */ + int next; /* pgprocno of the next PGPROC */ +} proclist_mutable_iter; + +#endif /* PROCLIST_TYPES_H */ diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h new file mode 100644 index 0000000..ee63690 --- /dev/null +++ b/src/include/storage/procsignal.h @@ -0,0 +1,71 @@ +/*------------------------------------------------------------------------- + * + * procsignal.h + * Routines for interprocess signaling + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/procsignal.h + * + *------------------------------------------------------------------------- + */ +#ifndef PROCSIGNAL_H +#define PROCSIGNAL_H + +#include "storage/backendid.h" + + +/* + * Reasons for signaling a Postgres child process (a backend or an auxiliary + * process, like checkpointer). We can cope with concurrent signals for different + * reasons. However, if the same reason is signaled multiple times in quick + * succession, the process is likely to observe only one notification of it. + * This is okay for the present uses. + * + * Also, because of race conditions, it's important that all the signals be + * defined so that no harm is done if a process mistakenly receives one. + */ +typedef enum +{ + PROCSIG_CATCHUP_INTERRUPT, /* sinval catchup interrupt */ + PROCSIG_NOTIFY_INTERRUPT, /* listen/notify interrupt */ + PROCSIG_PARALLEL_MESSAGE, /* message from cooperating parallel backend */ + PROCSIG_WALSND_INIT_STOPPING, /* ask walsenders to prepare for shutdown */ + PROCSIG_BARRIER, /* global barrier interrupt */ + PROCSIG_LOG_MEMORY_CONTEXT, /* ask backend to log the memory contexts */ + + /* Recovery conflict reasons */ + PROCSIG_RECOVERY_CONFLICT_DATABASE, + PROCSIG_RECOVERY_CONFLICT_TABLESPACE, + PROCSIG_RECOVERY_CONFLICT_LOCK, + PROCSIG_RECOVERY_CONFLICT_SNAPSHOT, + PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, + PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK, + + NUM_PROCSIGNALS /* Must be last! */ +} ProcSignalReason; + +typedef enum +{ + PROCSIGNAL_BARRIER_SMGRRELEASE /* ask smgr to close files */ +} ProcSignalBarrierType; + +/* + * prototypes for functions in procsignal.c + */ +extern Size ProcSignalShmemSize(void); +extern void ProcSignalShmemInit(void); + +extern void ProcSignalInit(int pss_idx); +extern int SendProcSignal(pid_t pid, ProcSignalReason reason, + BackendId backendId); + +extern uint64 EmitProcSignalBarrier(ProcSignalBarrierType type); +extern void WaitForProcSignalBarrier(uint64 generation); +extern void ProcessProcSignalBarrier(void); + +extern void procsignal_sigusr1_handler(SIGNAL_ARGS); + +#endif /* PROCSIGNAL_H */ diff --git a/src/include/storage/reinit.h b/src/include/storage/reinit.h new file mode 100644 index 0000000..bf2c10d --- /dev/null +++ b/src/include/storage/reinit.h @@ -0,0 +1,28 @@ +/*------------------------------------------------------------------------- + * + * reinit.h + * Reinitialization of unlogged relations + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/reinit.h + * + *------------------------------------------------------------------------- + */ + +#ifndef REINIT_H +#define REINIT_H + +#include "common/relpath.h" + + +extern void ResetUnloggedRelations(int op); +extern bool parse_filename_for_nontemp_relation(const char *name, + int *oidchars, ForkNumber *fork); + +#define UNLOGGED_RELATION_CLEANUP 0x0001 +#define UNLOGGED_RELATION_INIT 0x0002 + +#endif /* REINIT_H */ diff --git a/src/include/storage/relfilenode.h b/src/include/storage/relfilenode.h new file mode 100644 index 0000000..4fdc606 --- /dev/null +++ b/src/include/storage/relfilenode.h @@ -0,0 +1,99 @@ +/*------------------------------------------------------------------------- + * + * relfilenode.h + * Physical access information for relations. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/relfilenode.h + * + *------------------------------------------------------------------------- + */ +#ifndef RELFILENODE_H +#define RELFILENODE_H + +#include "common/relpath.h" +#include "storage/backendid.h" + +/* + * RelFileNode must provide all that we need to know to physically access + * a relation, with the exception of the backend ID, which can be provided + * separately. Note, however, that a "physical" relation is comprised of + * multiple files on the filesystem, as each fork is stored as a separate + * file, and each fork can be divided into multiple segments. See md.c. + * + * spcNode identifies the tablespace of the relation. It corresponds to + * pg_tablespace.oid. + * + * dbNode identifies the database of the relation. It is zero for + * "shared" relations (those common to all databases of a cluster). + * Nonzero dbNode values correspond to pg_database.oid. + * + * relNode identifies the specific relation. relNode corresponds to + * pg_class.relfilenode (NOT pg_class.oid, because we need to be able + * to assign new physical files to relations in some situations). + * Notice that relNode is only unique within a database in a particular + * tablespace. + * + * Note: spcNode must be GLOBALTABLESPACE_OID if and only if dbNode is + * zero. We support shared relations only in the "global" tablespace. + * + * Note: in pg_class we allow reltablespace == 0 to denote that the + * relation is stored in its database's "default" tablespace (as + * identified by pg_database.dattablespace). However this shorthand + * is NOT allowed in RelFileNode structs --- the real tablespace ID + * must be supplied when setting spcNode. + * + * Note: in pg_class, relfilenode can be zero to denote that the relation + * is a "mapped" relation, whose current true filenode number is available + * from relmapper.c. Again, this case is NOT allowed in RelFileNodes. + * + * Note: various places use RelFileNode in hashtable keys. Therefore, + * there *must not* be any unused padding bytes in this struct. That + * should be safe as long as all the fields are of type Oid. + */ +typedef struct RelFileNode +{ + Oid spcNode; /* tablespace */ + Oid dbNode; /* database */ + Oid relNode; /* relation */ +} RelFileNode; + +/* + * Augmenting a relfilenode with the backend ID provides all the information + * we need to locate the physical storage. The backend ID is InvalidBackendId + * for regular relations (those accessible to more than one backend), or the + * owning backend's ID for backend-local relations. Backend-local relations + * are always transient and removed in case of a database crash; they are + * never WAL-logged or fsync'd. + */ +typedef struct RelFileNodeBackend +{ + RelFileNode node; + BackendId backend; +} RelFileNodeBackend; + +#define RelFileNodeBackendIsTemp(rnode) \ + ((rnode).backend != InvalidBackendId) + +/* + * Note: RelFileNodeEquals and RelFileNodeBackendEquals compare relNode first + * since that is most likely to be different in two unequal RelFileNodes. It + * is probably redundant to compare spcNode if the other fields are found equal, + * but do it anyway to be sure. Likewise for checking the backend ID in + * RelFileNodeBackendEquals. + */ +#define RelFileNodeEquals(node1, node2) \ + ((node1).relNode == (node2).relNode && \ + (node1).dbNode == (node2).dbNode && \ + (node1).spcNode == (node2).spcNode) + +#define RelFileNodeBackendEquals(node1, node2) \ + ((node1).node.relNode == (node2).node.relNode && \ + (node1).node.dbNode == (node2).node.dbNode && \ + (node1).backend == (node2).backend && \ + (node1).node.spcNode == (node2).node.spcNode) + +#endif /* RELFILENODE_H */ diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h new file mode 100644 index 0000000..4d3ffc7 --- /dev/null +++ b/src/include/storage/s_lock.h @@ -0,0 +1,1110 @@ +/*------------------------------------------------------------------------- + * + * s_lock.h + * Hardware-dependent implementation of spinlocks. + * + * NOTE: none of the macros in this file are intended to be called directly. + * Call them through the hardware-independent macros in spin.h. + * + * The following hardware-dependent macros must be provided for each + * supported platform: + * + * void S_INIT_LOCK(slock_t *lock) + * Initialize a spinlock (to the unlocked state). + * + * int S_LOCK(slock_t *lock) + * Acquire a spinlock, waiting if necessary. + * Time out and abort() if unable to acquire the lock in a + * "reasonable" amount of time --- typically ~ 1 minute. + * Should return number of "delays"; see s_lock.c + * + * void S_UNLOCK(slock_t *lock) + * Unlock a previously acquired lock. + * + * bool S_LOCK_FREE(slock_t *lock) + * Tests if the lock is free. Returns true if free, false if locked. + * This does *not* change the state of the lock. + * + * void SPIN_DELAY(void) + * Delay operation to occur inside spinlock wait loop. + * + * Note to implementors: there are default implementations for all these + * macros at the bottom of the file. Check if your platform can use + * these or needs to override them. + * + * Usually, S_LOCK() is implemented in terms of even lower-level macros + * TAS() and TAS_SPIN(): + * + * int TAS(slock_t *lock) + * Atomic test-and-set instruction. Attempt to acquire the lock, + * but do *not* wait. Returns 0 if successful, nonzero if unable + * to acquire the lock. + * + * int TAS_SPIN(slock_t *lock) + * Like TAS(), but this version is used when waiting for a lock + * previously found to be contended. By default, this is the + * same as TAS(), but on some architectures it's better to poll a + * contended lock using an unlocked instruction and retry the + * atomic test-and-set only when it appears free. + * + * TAS() and TAS_SPIN() are NOT part of the API, and should never be called + * directly. + * + * CAUTION: on some platforms TAS() and/or TAS_SPIN() may sometimes report + * failure to acquire a lock even when the lock is not locked. For example, + * on Alpha TAS() will "fail" if interrupted. Therefore a retry loop must + * always be used, even if you are certain the lock is free. + * + * It is the responsibility of these macros to make sure that the compiler + * does not re-order accesses to shared memory to precede the actual lock + * acquisition, or follow the lock release. Prior to PostgreSQL 9.5, this + * was the caller's responsibility, which meant that callers had to use + * volatile-qualified pointers to refer to both the spinlock itself and the + * shared data being accessed within the spinlocked critical section. This + * was notationally awkward, easy to forget (and thus error-prone), and + * prevented some useful compiler optimizations. For these reasons, we + * now require that the macros themselves prevent compiler re-ordering, + * so that the caller doesn't need to take special precautions. + * + * On platforms with weak memory ordering, the TAS(), TAS_SPIN(), and + * S_UNLOCK() macros must further include hardware-level memory fence + * instructions to prevent similar re-ordering at the hardware level. + * TAS() and TAS_SPIN() must guarantee that loads and stores issued after + * the macro are not executed until the lock has been obtained. Conversely, + * S_UNLOCK() must guarantee that loads and stores issued before the macro + * have been executed before the lock is released. + * + * On most supported platforms, TAS() uses a tas() function written + * in assembly language to execute a hardware atomic-test-and-set + * instruction. Equivalent OS-supplied mutex routines could be used too. + * + * If no system-specific TAS() is available (ie, HAVE_SPINLOCKS is not + * defined), then we fall back on an emulation that uses SysV semaphores + * (see spin.c). This emulation will be MUCH MUCH slower than a proper TAS() + * implementation, because of the cost of a kernel call per lock or unlock. + * An old report is that Postgres spends around 40% of its time in semop(2) + * when using the SysV semaphore code. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/s_lock.h + * + *------------------------------------------------------------------------- + */ +#ifndef S_LOCK_H +#define S_LOCK_H + +#ifdef FRONTEND +#error "s_lock.h may not be included from frontend code" +#endif + +#ifdef HAVE_SPINLOCKS /* skip spinlocks if requested */ + +#if defined(__GNUC__) || defined(__INTEL_COMPILER) +/************************************************************************* + * All the gcc inlines + * Gcc consistently defines the CPU as __cpu__. + * Other compilers use __cpu or __cpu__ so we test for both in those cases. + */ + +/*---------- + * Standard gcc asm format (assuming "volatile slock_t *lock"): + + __asm__ __volatile__( + " instruction \n" + " instruction \n" + " instruction \n" +: "=r"(_res), "+m"(*lock) // return register, in/out lock value +: "r"(lock) // lock pointer, in input register +: "memory", "cc"); // show clobbered registers here + + * The output-operands list (after first colon) should always include + * "+m"(*lock), whether or not the asm code actually refers to this + * operand directly. This ensures that gcc believes the value in the + * lock variable is used and set by the asm code. Also, the clobbers + * list (after third colon) should always include "memory"; this prevents + * gcc from thinking it can cache the values of shared-memory fields + * across the asm code. Add "cc" if your asm code changes the condition + * code register, and also list any temp registers the code uses. + *---------- + */ + + +#ifdef __i386__ /* 32-bit i386 */ +#define HAS_TEST_AND_SET + +typedef unsigned char slock_t; + +#define TAS(lock) tas(lock) + +static __inline__ int +tas(volatile slock_t *lock) +{ + register slock_t _res = 1; + + /* + * Use a non-locking test before asserting the bus lock. Note that the + * extra test appears to be a small loss on some x86 platforms and a small + * win on others; it's by no means clear that we should keep it. + * + * When this was last tested, we didn't have separate TAS() and TAS_SPIN() + * macros. Nowadays it probably would be better to do a non-locking test + * in TAS_SPIN() but not in TAS(), like on x86_64, but no-one's done the + * testing to verify that. Without some empirical evidence, better to + * leave it alone. + */ + __asm__ __volatile__( + " cmpb $0,%1 \n" + " jne 1f \n" + " lock \n" + " xchgb %0,%1 \n" + "1: \n" +: "+q"(_res), "+m"(*lock) +: /* no inputs */ +: "memory", "cc"); + return (int) _res; +} + +#define SPIN_DELAY() spin_delay() + +static __inline__ void +spin_delay(void) +{ + /* + * This sequence is equivalent to the PAUSE instruction ("rep" is + * ignored by old IA32 processors if the following instruction is + * not a string operation); the IA-32 Architecture Software + * Developer's Manual, Vol. 3, Section 7.7.2 describes why using + * PAUSE in the inner loop of a spin lock is necessary for good + * performance: + * + * The PAUSE instruction improves the performance of IA-32 + * processors supporting Hyper-Threading Technology when + * executing spin-wait loops and other routines where one + * thread is accessing a shared lock or semaphore in a tight + * polling loop. When executing a spin-wait loop, the + * processor can suffer a severe performance penalty when + * exiting the loop because it detects a possible memory order + * violation and flushes the core processor's pipeline. The + * PAUSE instruction provides a hint to the processor that the + * code sequence is a spin-wait loop. The processor uses this + * hint to avoid the memory order violation and prevent the + * pipeline flush. In addition, the PAUSE instruction + * de-pipelines the spin-wait loop to prevent it from + * consuming execution resources excessively. + */ + __asm__ __volatile__( + " rep; nop \n"); +} + +#endif /* __i386__ */ + + +#ifdef __x86_64__ /* AMD Opteron, Intel EM64T */ +#define HAS_TEST_AND_SET + +typedef unsigned char slock_t; + +#define TAS(lock) tas(lock) + +/* + * On Intel EM64T, it's a win to use a non-locking test before the xchg proper, + * but only when spinning. + * + * See also Implementing Scalable Atomic Locks for Multi-Core Intel(tm) EM64T + * and IA32, by Michael Chynoweth and Mary R. Lee. As of this writing, it is + * available at: + * http://software.intel.com/en-us/articles/implementing-scalable-atomic-locks-for-multi-core-intel-em64t-and-ia32-architectures + */ +#define TAS_SPIN(lock) (*(lock) ? 1 : TAS(lock)) + +static __inline__ int +tas(volatile slock_t *lock) +{ + register slock_t _res = 1; + + __asm__ __volatile__( + " lock \n" + " xchgb %0,%1 \n" +: "+q"(_res), "+m"(*lock) +: /* no inputs */ +: "memory", "cc"); + return (int) _res; +} + +#define SPIN_DELAY() spin_delay() + +static __inline__ void +spin_delay(void) +{ + /* + * Adding a PAUSE in the spin delay loop is demonstrably a no-op on + * Opteron, but it may be of some use on EM64T, so we keep it. + */ + __asm__ __volatile__( + " rep; nop \n"); +} + +#endif /* __x86_64__ */ + + +#if defined(__ia64__) || defined(__ia64) +/* + * Intel Itanium, gcc or Intel's compiler. + * + * Itanium has weak memory ordering, but we rely on the compiler to enforce + * strict ordering of accesses to volatile data. In particular, while the + * xchg instruction implicitly acts as a memory barrier with 'acquire' + * semantics, we do not have an explicit memory fence instruction in the + * S_UNLOCK macro. We use a regular assignment to clear the spinlock, and + * trust that the compiler marks the generated store instruction with the + * ".rel" opcode. + * + * Testing shows that assumption to hold on gcc, although I could not find + * any explicit statement on that in the gcc manual. In Intel's compiler, + * the -m[no-]serialize-volatile option controls that, and testing shows that + * it is enabled by default. + * + * While icc accepts gcc asm blocks on x86[_64], this is not true on ia64 + * (at least not in icc versions before 12.x). So we have to carry a separate + * compiler-intrinsic-based implementation for it. + */ +#define HAS_TEST_AND_SET + +typedef unsigned int slock_t; + +#define TAS(lock) tas(lock) + +/* On IA64, it's a win to use a non-locking test before the xchg proper */ +#define TAS_SPIN(lock) (*(lock) ? 1 : TAS(lock)) + +#ifndef __INTEL_COMPILER + +static __inline__ int +tas(volatile slock_t *lock) +{ + long int ret; + + __asm__ __volatile__( + " xchg4 %0=%1,%2 \n" +: "=r"(ret), "+m"(*lock) +: "r"(1) +: "memory"); + return (int) ret; +} + +#else /* __INTEL_COMPILER */ + +static __inline__ int +tas(volatile slock_t *lock) +{ + int ret; + + ret = _InterlockedExchange(lock,1); /* this is a xchg asm macro */ + + return ret; +} + +/* icc can't use the regular gcc S_UNLOCK() macro either in this case */ +#define S_UNLOCK(lock) \ + do { __memory_barrier(); *(lock) = 0; } while (0) + +#endif /* __INTEL_COMPILER */ +#endif /* __ia64__ || __ia64 */ + + +/* + * On ARM and ARM64, we use __sync_lock_test_and_set(int *, int) if available. + * + * We use the int-width variant of the builtin because it works on more chips + * than other widths. + */ +#if defined(__arm__) || defined(__arm) || defined(__aarch64__) || defined(__aarch64) +#ifdef HAVE_GCC__SYNC_INT32_TAS +#define HAS_TEST_AND_SET + +#define TAS(lock) tas(lock) + +typedef int slock_t; + +static __inline__ int +tas(volatile slock_t *lock) +{ + return __sync_lock_test_and_set(lock, 1); +} + +#define S_UNLOCK(lock) __sync_lock_release(lock) + +/* + * Using an ISB instruction to delay in spinlock loops appears beneficial on + * high-core-count ARM64 processors. It seems mostly a wash for smaller gear, + * and ISB doesn't exist at all on pre-v7 ARM chips. + */ +#if defined(__aarch64__) || defined(__aarch64) + +#define SPIN_DELAY() spin_delay() + +static __inline__ void +spin_delay(void) +{ + __asm__ __volatile__( + " isb; \n"); +} + +#endif /* __aarch64__ || __aarch64 */ +#endif /* HAVE_GCC__SYNC_INT32_TAS */ +#endif /* __arm__ || __arm || __aarch64__ || __aarch64 */ + + +/* S/390 and S/390x Linux (32- and 64-bit zSeries) */ +#if defined(__s390__) || defined(__s390x__) +#define HAS_TEST_AND_SET + +typedef unsigned int slock_t; + +#define TAS(lock) tas(lock) + +static __inline__ int +tas(volatile slock_t *lock) +{ + int _res = 0; + + __asm__ __volatile__( + " cs %0,%3,0(%2) \n" +: "+d"(_res), "+m"(*lock) +: "a"(lock), "d"(1) +: "memory", "cc"); + return _res; +} + +#endif /* __s390__ || __s390x__ */ + + +#if defined(__sparc__) /* Sparc */ +/* + * Solaris has always run sparc processors in TSO (total store) mode, but + * linux didn't use to and the *BSDs still don't. So, be careful about + * acquire/release semantics. The CPU will treat superfluous membars as + * NOPs, so it's just code space. + */ +#define HAS_TEST_AND_SET + +typedef unsigned char slock_t; + +#define TAS(lock) tas(lock) + +static __inline__ int +tas(volatile slock_t *lock) +{ + register slock_t _res; + + /* + * See comment in src/backend/port/tas/sunstudio_sparc.s for why this + * uses "ldstub", and that file uses "cas". gcc currently generates + * sparcv7-targeted binaries, so "cas" use isn't possible. + */ + __asm__ __volatile__( + " ldstub [%2], %0 \n" +: "=r"(_res), "+m"(*lock) +: "r"(lock) +: "memory"); +#if defined(__sparcv7) || defined(__sparc_v7__) + /* + * No stbar or membar available, luckily no actually produced hardware + * requires a barrier. + */ +#elif defined(__sparcv8) || defined(__sparc_v8__) + /* stbar is available (and required for both PSO, RMO), membar isn't */ + __asm__ __volatile__ ("stbar \n":::"memory"); +#else + /* + * #LoadStore (RMO) | #LoadLoad (RMO) together are the appropriate acquire + * barrier for sparcv8+ upwards. + */ + __asm__ __volatile__ ("membar #LoadStore | #LoadLoad \n":::"memory"); +#endif + return (int) _res; +} + +#if defined(__sparcv7) || defined(__sparc_v7__) +/* + * No stbar or membar available, luckily no actually produced hardware + * requires a barrier. We fall through to the default gcc definition of + * S_UNLOCK in this case. + */ +#elif defined(__sparcv8) || defined(__sparc_v8__) +/* stbar is available (and required for both PSO, RMO), membar isn't */ +#define S_UNLOCK(lock) \ +do \ +{ \ + __asm__ __volatile__ ("stbar \n":::"memory"); \ + *((volatile slock_t *) (lock)) = 0; \ +} while (0) +#else +/* + * #LoadStore (RMO) | #StoreStore (RMO, PSO) together are the appropriate + * release barrier for sparcv8+ upwards. + */ +#define S_UNLOCK(lock) \ +do \ +{ \ + __asm__ __volatile__ ("membar #LoadStore | #StoreStore \n":::"memory"); \ + *((volatile slock_t *) (lock)) = 0; \ +} while (0) +#endif + +#endif /* __sparc__ */ + + +/* PowerPC */ +#if defined(__ppc__) || defined(__powerpc__) || defined(__ppc64__) || defined(__powerpc64__) +#define HAS_TEST_AND_SET + +typedef unsigned int slock_t; + +#define TAS(lock) tas(lock) + +/* On PPC, it's a win to use a non-locking test before the lwarx */ +#define TAS_SPIN(lock) (*(lock) ? 1 : TAS(lock)) + +/* + * The second operand of addi can hold a constant zero or a register number, + * hence constraint "=&b" to avoid allocating r0. "b" stands for "address + * base register"; most operands having this register-or-zero property are + * address bases, e.g. the second operand of lwax. + * + * NOTE: per the Enhanced PowerPC Architecture manual, v1.0 dated 7-May-2002, + * an isync is a sufficient synchronization barrier after a lwarx/stwcx loop. + * On newer machines, we can use lwsync instead for better performance. + * + * Ordinarily, we'd code the branches here using GNU-style local symbols, that + * is "1f" referencing "1:" and so on. But some people run gcc on AIX with + * IBM's assembler as backend, and IBM's assembler doesn't do local symbols. + * So hand-code the branch offsets; fortunately, all PPC instructions are + * exactly 4 bytes each, so it's not too hard to count. + */ +static __inline__ int +tas(volatile slock_t *lock) +{ + slock_t _t; + int _res; + + __asm__ __volatile__( +#ifdef USE_PPC_LWARX_MUTEX_HINT +" lwarx %0,0,%3,1 \n" +#else +" lwarx %0,0,%3 \n" +#endif +" cmpwi %0,0 \n" +" bne $+16 \n" /* branch to li %1,1 */ +" addi %0,%0,1 \n" +" stwcx. %0,0,%3 \n" +" beq $+12 \n" /* branch to lwsync/isync */ +" li %1,1 \n" +" b $+12 \n" /* branch to end of asm sequence */ +#ifdef USE_PPC_LWSYNC +" lwsync \n" +#else +" isync \n" +#endif +" li %1,0 \n" + +: "=&b"(_t), "=r"(_res), "+m"(*lock) +: "r"(lock) +: "memory", "cc"); + return _res; +} + +/* + * PowerPC S_UNLOCK is almost standard but requires a "sync" instruction. + * On newer machines, we can use lwsync instead for better performance. + */ +#ifdef USE_PPC_LWSYNC +#define S_UNLOCK(lock) \ +do \ +{ \ + __asm__ __volatile__ (" lwsync \n" ::: "memory"); \ + *((volatile slock_t *) (lock)) = 0; \ +} while (0) +#else +#define S_UNLOCK(lock) \ +do \ +{ \ + __asm__ __volatile__ (" sync \n" ::: "memory"); \ + *((volatile slock_t *) (lock)) = 0; \ +} while (0) +#endif /* USE_PPC_LWSYNC */ + +#endif /* powerpc */ + + +/* Linux Motorola 68k */ +#if (defined(__mc68000__) || defined(__m68k__)) && defined(__linux__) +#define HAS_TEST_AND_SET + +typedef unsigned char slock_t; + +#define TAS(lock) tas(lock) + +static __inline__ int +tas(volatile slock_t *lock) +{ + register int rv; + + __asm__ __volatile__( + " clrl %0 \n" + " tas %1 \n" + " sne %0 \n" +: "=d"(rv), "+m"(*lock) +: /* no inputs */ +: "memory", "cc"); + return rv; +} + +#endif /* (__mc68000__ || __m68k__) && __linux__ */ + + +/* Motorola 88k */ +#if defined(__m88k__) +#define HAS_TEST_AND_SET + +typedef unsigned int slock_t; + +#define TAS(lock) tas(lock) + +static __inline__ int +tas(volatile slock_t *lock) +{ + register slock_t _res = 1; + + __asm__ __volatile__( + " xmem %0, %2, %%r0 \n" +: "+r"(_res), "+m"(*lock) +: "r"(lock) +: "memory"); + return (int) _res; +} + +#endif /* __m88k__ */ + + +/* + * VAXen -- even multiprocessor ones + * (thanks to Tom Ivar Helbekkmo) + */ +#if defined(__vax__) +#define HAS_TEST_AND_SET + +typedef unsigned char slock_t; + +#define TAS(lock) tas(lock) + +static __inline__ int +tas(volatile slock_t *lock) +{ + register int _res; + + __asm__ __volatile__( + " movl $1, %0 \n" + " bbssi $0, (%2), 1f \n" + " clrl %0 \n" + "1: \n" +: "=&r"(_res), "+m"(*lock) +: "r"(lock) +: "memory"); + return _res; +} + +#endif /* __vax__ */ + + +#if defined(__mips__) && !defined(__sgi) /* non-SGI MIPS */ +#define HAS_TEST_AND_SET + +typedef unsigned int slock_t; + +#define TAS(lock) tas(lock) + +/* + * Original MIPS-I processors lacked the LL/SC instructions, but if we are + * so unfortunate as to be running on one of those, we expect that the kernel + * will handle the illegal-instruction traps and emulate them for us. On + * anything newer (and really, MIPS-I is extinct) LL/SC is the only sane + * choice because any other synchronization method must involve a kernel + * call. Unfortunately, many toolchains still default to MIPS-I as the + * codegen target; if the symbol __mips shows that that's the case, we + * have to force the assembler to accept LL/SC. + * + * R10000 and up processors require a separate SYNC, which has the same + * issues as LL/SC. + */ +#if __mips < 2 +#define MIPS_SET_MIPS2 " .set mips2 \n" +#else +#define MIPS_SET_MIPS2 +#endif + +static __inline__ int +tas(volatile slock_t *lock) +{ + register volatile slock_t *_l = lock; + register int _res; + register int _tmp; + + __asm__ __volatile__( + " .set push \n" + MIPS_SET_MIPS2 + " .set noreorder \n" + " .set nomacro \n" + " ll %0, %2 \n" + " or %1, %0, 1 \n" + " sc %1, %2 \n" + " xori %1, 1 \n" + " or %0, %0, %1 \n" + " sync \n" + " .set pop " +: "=&r" (_res), "=&r" (_tmp), "+R" (*_l) +: /* no inputs */ +: "memory"); + return _res; +} + +/* MIPS S_UNLOCK is almost standard but requires a "sync" instruction */ +#define S_UNLOCK(lock) \ +do \ +{ \ + __asm__ __volatile__( \ + " .set push \n" \ + MIPS_SET_MIPS2 \ + " .set noreorder \n" \ + " .set nomacro \n" \ + " sync \n" \ + " .set pop " \ +: /* no outputs */ \ +: /* no inputs */ \ +: "memory"); \ + *((volatile slock_t *) (lock)) = 0; \ +} while (0) + +#endif /* __mips__ && !__sgi */ + + +#if defined(__m32r__) && defined(HAVE_SYS_TAS_H) /* Renesas' M32R */ +#define HAS_TEST_AND_SET + +#include <sys/tas.h> + +typedef int slock_t; + +#define TAS(lock) tas(lock) + +#endif /* __m32r__ */ + + +#if defined(__sh__) /* Renesas' SuperH */ +#define HAS_TEST_AND_SET + +typedef unsigned char slock_t; + +#define TAS(lock) tas(lock) + +static __inline__ int +tas(volatile slock_t *lock) +{ + register int _res; + + /* + * This asm is coded as if %0 could be any register, but actually SuperH + * restricts the target of xor-immediate to be R0. That's handled by + * the "z" constraint on _res. + */ + __asm__ __volatile__( + " tas.b @%2 \n" + " movt %0 \n" + " xor #1,%0 \n" +: "=z"(_res), "+m"(*lock) +: "r"(lock) +: "memory", "t"); + return _res; +} + +#endif /* __sh__ */ + + +/* These live in s_lock.c, but only for gcc */ + + +#if defined(__m68k__) && !defined(__linux__) /* non-Linux Motorola 68k */ +#define HAS_TEST_AND_SET + +typedef unsigned char slock_t; +#endif + + +/* + * If we have no platform-specific knowledge, but we found that the compiler + * provides __sync_lock_test_and_set(), use that. Prefer the int-width + * version over the char-width version if we have both, on the rather dubious + * grounds that that's known to be more likely to work in the ARM ecosystem. + * (But we dealt with ARM above.) + */ +#if !defined(HAS_TEST_AND_SET) + +#if defined(HAVE_GCC__SYNC_INT32_TAS) +#define HAS_TEST_AND_SET + +#define TAS(lock) tas(lock) + +typedef int slock_t; + +static __inline__ int +tas(volatile slock_t *lock) +{ + return __sync_lock_test_and_set(lock, 1); +} + +#define S_UNLOCK(lock) __sync_lock_release(lock) + +#elif defined(HAVE_GCC__SYNC_CHAR_TAS) +#define HAS_TEST_AND_SET + +#define TAS(lock) tas(lock) + +typedef char slock_t; + +static __inline__ int +tas(volatile slock_t *lock) +{ + return __sync_lock_test_and_set(lock, 1); +} + +#define S_UNLOCK(lock) __sync_lock_release(lock) + +#endif /* HAVE_GCC__SYNC_INT32_TAS */ + +#endif /* !defined(HAS_TEST_AND_SET) */ + + +/* + * Default implementation of S_UNLOCK() for gcc/icc. + * + * Note that this implementation is unsafe for any platform that can reorder + * a memory access (either load or store) after a following store. That + * happens not to be possible on x86 and most legacy architectures (some are + * single-processor!), but many modern systems have weaker memory ordering. + * Those that do must define their own version of S_UNLOCK() rather than + * relying on this one. + */ +#if !defined(S_UNLOCK) +#define S_UNLOCK(lock) \ + do { __asm__ __volatile__("" : : : "memory"); *(lock) = 0; } while (0) +#endif + +#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */ + + + +/* + * --------------------------------------------------------------------- + * Platforms that use non-gcc inline assembly: + * --------------------------------------------------------------------- + */ + +#if !defined(HAS_TEST_AND_SET) /* We didn't trigger above, let's try here */ + + +#if defined(__hppa) || defined(__hppa__) /* HP PA-RISC, GCC and HP compilers */ +/* + * HP's PA-RISC + * + * See src/backend/port/hpux/tas.c.template for details about LDCWX. Because + * LDCWX requires a 16-byte-aligned address, we declare slock_t as a 16-byte + * struct. The active word in the struct is whichever has the aligned address; + * the other three words just sit at -1. + * + * When using gcc, we can inline the required assembly code. + */ +#define HAS_TEST_AND_SET + +typedef struct +{ + int sema[4]; +} slock_t; + +#define TAS_ACTIVE_WORD(lock) ((volatile int *) (((uintptr_t) (lock) + 15) & ~15)) + +#if defined(__GNUC__) + +static __inline__ int +tas(volatile slock_t *lock) +{ + volatile int *lockword = TAS_ACTIVE_WORD(lock); + register int lockval; + + __asm__ __volatile__( + " ldcwx 0(0,%2),%0 \n" +: "=r"(lockval), "+m"(*lockword) +: "r"(lockword) +: "memory"); + return (lockval == 0); +} + +/* + * The hppa implementation doesn't follow the rules of this files and provides + * a gcc specific implementation outside of the above defined(__GNUC__). It + * does so to avoid duplication between the HP compiler and gcc. So undefine + * the generic fallback S_UNLOCK from above. + */ +#ifdef S_UNLOCK +#undef S_UNLOCK +#endif +#define S_UNLOCK(lock) \ + do { \ + __asm__ __volatile__("" : : : "memory"); \ + *TAS_ACTIVE_WORD(lock) = -1; \ + } while (0) + +#endif /* __GNUC__ */ + +#define S_INIT_LOCK(lock) \ + do { \ + volatile slock_t *lock_ = (lock); \ + lock_->sema[0] = -1; \ + lock_->sema[1] = -1; \ + lock_->sema[2] = -1; \ + lock_->sema[3] = -1; \ + } while (0) + +#define S_LOCK_FREE(lock) (*TAS_ACTIVE_WORD(lock) != 0) + +#endif /* __hppa || __hppa__ */ + + +#if defined(__hpux) && defined(__ia64) && !defined(__GNUC__) +/* + * HP-UX on Itanium, non-gcc/icc compiler + * + * We assume that the compiler enforces strict ordering of loads/stores on + * volatile data (see comments on the gcc-version earlier in this file). + * Note that this assumption does *not* hold if you use the + * +Ovolatile=__unordered option on the HP-UX compiler, so don't do that. + * + * See also Implementing Spinlocks on the Intel Itanium Architecture and + * PA-RISC, by Tor Ekqvist and David Graves, for more information. As of + * this writing, version 1.0 of the manual is available at: + * http://h21007.www2.hp.com/portal/download/files/unprot/itanium/spinlocks.pdf + */ +#define HAS_TEST_AND_SET + +typedef unsigned int slock_t; + +#include <ia64/sys/inline.h> +#define TAS(lock) _Asm_xchg(_SZ_W, lock, 1, _LDHINT_NONE) +/* On IA64, it's a win to use a non-locking test before the xchg proper */ +#define TAS_SPIN(lock) (*(lock) ? 1 : TAS(lock)) +#define S_UNLOCK(lock) \ + do { _Asm_mf(); (*(lock)) = 0; } while (0) + +#endif /* HPUX on IA64, non gcc/icc */ + +#if defined(_AIX) /* AIX */ +/* + * AIX (POWER) + */ +#define HAS_TEST_AND_SET + +#include <sys/atomic_op.h> + +typedef int slock_t; + +#define TAS(lock) _check_lock((slock_t *) (lock), 0, 1) +#define S_UNLOCK(lock) _clear_lock((slock_t *) (lock), 0) +#endif /* _AIX */ + + +/* These are in sunstudio_(sparc|x86).s */ + +#if defined(__SUNPRO_C) && (defined(__i386) || defined(__x86_64__) || defined(__sparc__) || defined(__sparc)) +#define HAS_TEST_AND_SET + +#if defined(__i386) || defined(__x86_64__) || defined(__sparcv9) || defined(__sparcv8plus) +typedef unsigned int slock_t; +#else +typedef unsigned char slock_t; +#endif + +extern slock_t pg_atomic_cas(volatile slock_t *lock, slock_t with, + slock_t cmp); + +#define TAS(a) (pg_atomic_cas((a), 1, 0) != 0) +#endif + + +#ifdef _MSC_VER +typedef LONG slock_t; + +#define HAS_TEST_AND_SET +#define TAS(lock) (InterlockedCompareExchange(lock, 1, 0)) + +#define SPIN_DELAY() spin_delay() + +/* If using Visual C++ on Win64, inline assembly is unavailable. + * Use a _mm_pause intrinsic instead of rep nop. + */ +#if defined(_WIN64) +static __forceinline void +spin_delay(void) +{ + _mm_pause(); +} +#else +static __forceinline void +spin_delay(void) +{ + /* See comment for gcc code. Same code, MASM syntax */ + __asm rep nop; +} +#endif + +#include <intrin.h> +#pragma intrinsic(_ReadWriteBarrier) + +#define S_UNLOCK(lock) \ + do { _ReadWriteBarrier(); (*(lock)) = 0; } while (0) + +#endif + + +#endif /* !defined(HAS_TEST_AND_SET) */ + + +/* Blow up if we didn't have any way to do spinlocks */ +#ifndef HAS_TEST_AND_SET +#error PostgreSQL does not have native spinlock support on this platform. To continue the compilation, rerun configure using --disable-spinlocks. However, performance will be poor. Please report this to pgsql-bugs@lists.postgresql.org. +#endif + + +#else /* !HAVE_SPINLOCKS */ + + +/* + * Fake spinlock implementation using semaphores --- slow and prone + * to fall foul of kernel limits on number of semaphores, so don't use this + * unless you must! The subroutines appear in spin.c. + */ +typedef int slock_t; + +extern bool s_lock_free_sema(volatile slock_t *lock); +extern void s_unlock_sema(volatile slock_t *lock); +extern void s_init_lock_sema(volatile slock_t *lock, bool nested); +extern int tas_sema(volatile slock_t *lock); + +#define S_LOCK_FREE(lock) s_lock_free_sema(lock) +#define S_UNLOCK(lock) s_unlock_sema(lock) +#define S_INIT_LOCK(lock) s_init_lock_sema(lock, false) +#define TAS(lock) tas_sema(lock) + + +#endif /* HAVE_SPINLOCKS */ + + +/* + * Default Definitions - override these above as needed. + */ + +#if !defined(S_LOCK) +#define S_LOCK(lock) \ + (TAS(lock) ? s_lock((lock), __FILE__, __LINE__, PG_FUNCNAME_MACRO) : 0) +#endif /* S_LOCK */ + +#if !defined(S_LOCK_FREE) +#define S_LOCK_FREE(lock) (*(lock) == 0) +#endif /* S_LOCK_FREE */ + +#if !defined(S_UNLOCK) +/* + * Our default implementation of S_UNLOCK is essentially *(lock) = 0. This + * is unsafe if the platform can reorder a memory access (either load or + * store) after a following store; platforms where this is possible must + * define their own S_UNLOCK. But CPU reordering is not the only concern: + * if we simply defined S_UNLOCK() as an inline macro, the compiler might + * reorder instructions from inside the critical section to occur after the + * lock release. Since the compiler probably can't know what the external + * function s_unlock is doing, putting the same logic there should be adequate. + * A sufficiently-smart globally optimizing compiler could break that + * assumption, though, and the cost of a function call for every spinlock + * release may hurt performance significantly, so we use this implementation + * only for platforms where we don't know of a suitable intrinsic. For the + * most part, those are relatively obscure platform/compiler combinations to + * which the PostgreSQL project does not have access. + */ +#define USE_DEFAULT_S_UNLOCK +extern void s_unlock(volatile slock_t *lock); +#define S_UNLOCK(lock) s_unlock(lock) +#endif /* S_UNLOCK */ + +#if !defined(S_INIT_LOCK) +#define S_INIT_LOCK(lock) S_UNLOCK(lock) +#endif /* S_INIT_LOCK */ + +#if !defined(SPIN_DELAY) +#define SPIN_DELAY() ((void) 0) +#endif /* SPIN_DELAY */ + +#if !defined(TAS) +extern int tas(volatile slock_t *lock); /* in port/.../tas.s, or + * s_lock.c */ + +#define TAS(lock) tas(lock) +#endif /* TAS */ + +#if !defined(TAS_SPIN) +#define TAS_SPIN(lock) TAS(lock) +#endif /* TAS_SPIN */ + +extern PGDLLIMPORT slock_t dummy_spinlock; + +/* + * Platform-independent out-of-line support routines + */ +extern int s_lock(volatile slock_t *lock, const char *file, int line, const char *func); + +/* Support for dynamic adjustment of spins_per_delay */ +#define DEFAULT_SPINS_PER_DELAY 100 + +extern void set_spins_per_delay(int shared_spins_per_delay); +extern int update_spins_per_delay(int shared_spins_per_delay); + +/* + * Support for spin delay which is useful in various places where + * spinlock-like procedures take place. + */ +typedef struct +{ + int spins; + int delays; + int cur_delay; + const char *file; + int line; + const char *func; +} SpinDelayStatus; + +static inline void +init_spin_delay(SpinDelayStatus *status, + const char *file, int line, const char *func) +{ + status->spins = 0; + status->delays = 0; + status->cur_delay = 0; + status->file = file; + status->line = line; + status->func = func; +} + +#define init_local_spin_delay(status) init_spin_delay(status, __FILE__, __LINE__, PG_FUNCNAME_MACRO) +extern void perform_spin_delay(SpinDelayStatus *status); +extern void finish_spin_delay(SpinDelayStatus *status); + +#endif /* S_LOCK_H */ diff --git a/src/include/storage/sharedfileset.h b/src/include/storage/sharedfileset.h new file mode 100644 index 0000000..b1cde36 --- /dev/null +++ b/src/include/storage/sharedfileset.h @@ -0,0 +1,37 @@ +/*------------------------------------------------------------------------- + * + * sharedfileset.h + * Shared temporary file management. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/sharedfileset.h + * + *------------------------------------------------------------------------- + */ + +#ifndef SHAREDFILESET_H +#define SHAREDFILESET_H + +#include "storage/dsm.h" +#include "storage/fd.h" +#include "storage/fileset.h" +#include "storage/spin.h" + +/* + * A set of temporary files that can be shared by multiple backends. + */ +typedef struct SharedFileSet +{ + FileSet fs; + slock_t mutex; /* mutex protecting the reference count */ + int refcnt; /* number of attached backends */ +} SharedFileSet; + +extern void SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg); +extern void SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg); +extern void SharedFileSetDeleteAll(SharedFileSet *fileset); + +#endif diff --git a/src/include/storage/shm_mq.h b/src/include/storage/shm_mq.h new file mode 100644 index 0000000..b6fe687 --- /dev/null +++ b/src/include/storage/shm_mq.h @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * shm_mq.h + * single-reader, single-writer shared memory message queue + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/shm_mq.h + * + *------------------------------------------------------------------------- + */ +#ifndef SHM_MQ_H +#define SHM_MQ_H + +#include "postmaster/bgworker.h" +#include "storage/dsm.h" +#include "storage/proc.h" + +/* The queue itself, in shared memory. */ +struct shm_mq; +typedef struct shm_mq shm_mq; + +/* Backend-private state. */ +struct shm_mq_handle; +typedef struct shm_mq_handle shm_mq_handle; + +/* Descriptors for a single write spanning multiple locations. */ +typedef struct +{ + const char *data; + Size len; +} shm_mq_iovec; + +/* Possible results of a send or receive operation. */ +typedef enum +{ + SHM_MQ_SUCCESS, /* Sent or received a message. */ + SHM_MQ_WOULD_BLOCK, /* Not completed; retry later. */ + SHM_MQ_DETACHED /* Other process has detached queue. */ +} shm_mq_result; + +/* + * Primitives to create a queue and set the sender and receiver. + * + * Both the sender and the receiver must be set before any messages are read + * or written, but they need not be set by the same process. Each must be + * set exactly once. + */ +extern shm_mq *shm_mq_create(void *address, Size size); +extern void shm_mq_set_receiver(shm_mq *mq, PGPROC *); +extern void shm_mq_set_sender(shm_mq *mq, PGPROC *); + +/* Accessor methods for sender and receiver. */ +extern PGPROC *shm_mq_get_receiver(shm_mq *); +extern PGPROC *shm_mq_get_sender(shm_mq *); + +/* Set up backend-local queue state. */ +extern shm_mq_handle *shm_mq_attach(shm_mq *mq, dsm_segment *seg, + BackgroundWorkerHandle *handle); + +/* Associate worker handle with shm_mq. */ +extern void shm_mq_set_handle(shm_mq_handle *, BackgroundWorkerHandle *); + +/* Break connection, release handle resources. */ +extern void shm_mq_detach(shm_mq_handle *mqh); + +/* Get the shm_mq from handle. */ +extern shm_mq *shm_mq_get_queue(shm_mq_handle *mqh); + +/* Send or receive messages. */ +extern shm_mq_result shm_mq_send(shm_mq_handle *mqh, + Size nbytes, const void *data, bool nowait, + bool force_flush); +extern shm_mq_result shm_mq_sendv(shm_mq_handle *mqh, shm_mq_iovec *iov, + int iovcnt, bool nowait, bool force_flush); +extern shm_mq_result shm_mq_receive(shm_mq_handle *mqh, + Size *nbytesp, void **datap, bool nowait); + +/* Wait for our counterparty to attach to the queue. */ +extern shm_mq_result shm_mq_wait_for_attach(shm_mq_handle *mqh); + +/* Smallest possible queue. */ +extern PGDLLIMPORT const Size shm_mq_minimum_size; + +#endif /* SHM_MQ_H */ diff --git a/src/include/storage/shm_toc.h b/src/include/storage/shm_toc.h new file mode 100644 index 0000000..153a57c --- /dev/null +++ b/src/include/storage/shm_toc.h @@ -0,0 +1,58 @@ +/*------------------------------------------------------------------------- + * + * shm_toc.h + * shared memory segment table of contents + * + * This is intended to provide a simple way to divide a chunk of shared + * memory (probably dynamic shared memory allocated via dsm_create) into + * a number of regions and keep track of the addresses of those regions or + * key data structures within those regions. This is not intended to + * scale to a large number of keys and will perform poorly if used that + * way; if you need a large number of pointers, store them within some + * other data structure within the segment and only put the pointer to + * the data structure itself in the table of contents. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/shm_toc.h + * + *------------------------------------------------------------------------- + */ +#ifndef SHM_TOC_H +#define SHM_TOC_H + +#include "storage/shmem.h" /* for add_size() */ + +/* shm_toc is an opaque type known only within shm_toc.c */ +typedef struct shm_toc shm_toc; + +extern shm_toc *shm_toc_create(uint64 magic, void *address, Size nbytes); +extern shm_toc *shm_toc_attach(uint64 magic, void *address); +extern void *shm_toc_allocate(shm_toc *toc, Size nbytes); +extern Size shm_toc_freespace(shm_toc *toc); +extern void shm_toc_insert(shm_toc *toc, uint64 key, void *address); +extern void *shm_toc_lookup(shm_toc *toc, uint64 key, bool noError); + +/* + * Tools for estimating how large a chunk of shared memory will be needed + * to store a TOC and its dependent objects. Note: we don't really support + * large numbers of keys, but it's convenient to declare number_of_keys + * as a Size anyway. + */ +typedef struct +{ + Size space_for_chunks; + Size number_of_keys; +} shm_toc_estimator; + +#define shm_toc_initialize_estimator(e) \ + ((e)->space_for_chunks = 0, (e)->number_of_keys = 0) +#define shm_toc_estimate_chunk(e, sz) \ + ((e)->space_for_chunks = add_size((e)->space_for_chunks, BUFFERALIGN(sz))) +#define shm_toc_estimate_keys(e, cnt) \ + ((e)->number_of_keys = add_size((e)->number_of_keys, cnt)) + +extern Size shm_toc_estimate(shm_toc_estimator *e); + +#endif /* SHM_TOC_H */ diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h new file mode 100644 index 0000000..de9e7c6 --- /dev/null +++ b/src/include/storage/shmem.h @@ -0,0 +1,81 @@ +/*------------------------------------------------------------------------- + * + * shmem.h + * shared memory management structures + * + * Historical note: + * A long time ago, Postgres' shared memory region was allowed to be mapped + * at a different address in each process, and shared memory "pointers" were + * passed around as offsets relative to the start of the shared memory region. + * That is no longer the case: each process must map the shared memory region + * at the same address. This means shared memory pointers can be passed + * around directly between different processes. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/shmem.h + * + *------------------------------------------------------------------------- + */ +#ifndef SHMEM_H +#define SHMEM_H + +#include "utils/hsearch.h" + + +/* shmqueue.c */ +typedef struct SHM_QUEUE +{ + struct SHM_QUEUE *prev; + struct SHM_QUEUE *next; +} SHM_QUEUE; + +/* shmem.c */ +extern void InitShmemAccess(void *seghdr); +extern void InitShmemAllocation(void); +extern void *ShmemAlloc(Size size); +extern void *ShmemAllocNoError(Size size); +extern void *ShmemAllocUnlocked(Size size); +extern bool ShmemAddrIsValid(const void *addr); +extern void InitShmemIndex(void); +extern HTAB *ShmemInitHash(const char *name, long init_size, long max_size, + HASHCTL *infoP, int hash_flags); +extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr); +extern Size add_size(Size s1, Size s2); +extern Size mul_size(Size s1, Size s2); + +/* ipci.c */ +extern void RequestAddinShmemSpace(Size size); + +/* size constants for the shmem index table */ + /* max size of data structure string name */ +#define SHMEM_INDEX_KEYSIZE (48) + /* estimated size of the shmem index table (not a hard limit) */ +#define SHMEM_INDEX_SIZE (64) + +/* this is a hash bucket in the shmem index table */ +typedef struct +{ + char key[SHMEM_INDEX_KEYSIZE]; /* string name */ + void *location; /* location in shared mem */ + Size size; /* # bytes requested for the structure */ + Size allocated_size; /* # bytes actually allocated */ +} ShmemIndexEnt; + +/* + * prototypes for functions in shmqueue.c + */ +extern void SHMQueueInit(SHM_QUEUE *queue); +extern void SHMQueueElemInit(SHM_QUEUE *queue); +extern void SHMQueueDelete(SHM_QUEUE *queue); +extern void SHMQueueInsertBefore(SHM_QUEUE *queue, SHM_QUEUE *elem); +extern void SHMQueueInsertAfter(SHM_QUEUE *queue, SHM_QUEUE *elem); +extern Pointer SHMQueueNext(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, + Size linkOffset); +extern Pointer SHMQueuePrev(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, + Size linkOffset); +extern bool SHMQueueEmpty(const SHM_QUEUE *queue); +extern bool SHMQueueIsDetached(const SHM_QUEUE *queue); + +#endif /* SHMEM_H */ diff --git a/src/include/storage/sinval.h b/src/include/storage/sinval.h new file mode 100644 index 0000000..e7cd456 --- /dev/null +++ b/src/include/storage/sinval.h @@ -0,0 +1,153 @@ +/*------------------------------------------------------------------------- + * + * sinval.h + * POSTGRES shared cache invalidation communication definitions. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/sinval.h + * + *------------------------------------------------------------------------- + */ +#ifndef SINVAL_H +#define SINVAL_H + +#include <signal.h> + +#include "storage/relfilenode.h" + +/* + * We support several types of shared-invalidation messages: + * * invalidate a specific tuple in a specific catcache + * * invalidate all catcache entries from a given system catalog + * * invalidate a relcache entry for a specific logical relation + * * invalidate all relcache entries + * * invalidate an smgr cache entry for a specific physical relation + * * invalidate the mapped-relation mapping for a given database + * * invalidate any saved snapshot that might be used to scan a given relation + * More types could be added if needed. The message type is identified by + * the first "int8" field of the message struct. Zero or positive means a + * specific-catcache inval message (and also serves as the catcache ID field). + * Negative values identify the other message types, as per codes below. + * + * Catcache inval events are initially driven by detecting tuple inserts, + * updates and deletions in system catalogs (see CacheInvalidateHeapTuple). + * An update can generate two inval events, one for the old tuple and one for + * the new, but this is reduced to one event if the tuple's hash key doesn't + * change. Note that the inval events themselves don't actually say whether + * the tuple is being inserted or deleted. Also, since we transmit only a + * hash key, there is a small risk of unnecessary invalidations due to chance + * matches of hash keys. + * + * Note that some system catalogs have multiple caches on them (with different + * indexes). On detecting a tuple invalidation in such a catalog, separate + * catcache inval messages must be generated for each of its caches, since + * the hash keys will generally be different. + * + * Catcache, relcache, and snapshot invalidations are transactional, and so + * are sent to other backends upon commit. Internally to the generating + * backend, they are also processed at CommandCounterIncrement so that later + * commands in the same transaction see the new state. The generating backend + * also has to process them at abort, to flush out any cache state it's loaded + * from no-longer-valid entries. + * + * smgr and relation mapping invalidations are non-transactional: they are + * sent immediately when the underlying file change is made. + */ + +typedef struct +{ + int8 id; /* cache ID --- must be first */ + Oid dbId; /* database ID, or 0 if a shared relation */ + uint32 hashValue; /* hash value of key for this catcache */ +} SharedInvalCatcacheMsg; + +#define SHAREDINVALCATALOG_ID (-1) + +typedef struct +{ + int8 id; /* type field --- must be first */ + Oid dbId; /* database ID, or 0 if a shared catalog */ + Oid catId; /* ID of catalog whose contents are invalid */ +} SharedInvalCatalogMsg; + +#define SHAREDINVALRELCACHE_ID (-2) + +typedef struct +{ + int8 id; /* type field --- must be first */ + Oid dbId; /* database ID, or 0 if a shared relation */ + Oid relId; /* relation ID, or 0 if whole relcache */ +} SharedInvalRelcacheMsg; + +#define SHAREDINVALSMGR_ID (-3) + +typedef struct +{ + /* note: field layout chosen to pack into 16 bytes */ + int8 id; /* type field --- must be first */ + int8 backend_hi; /* high bits of backend ID, if temprel */ + uint16 backend_lo; /* low bits of backend ID, if temprel */ + RelFileNode rnode; /* spcNode, dbNode, relNode */ +} SharedInvalSmgrMsg; + +#define SHAREDINVALRELMAP_ID (-4) + +typedef struct +{ + int8 id; /* type field --- must be first */ + Oid dbId; /* database ID, or 0 for shared catalogs */ +} SharedInvalRelmapMsg; + +#define SHAREDINVALSNAPSHOT_ID (-5) + +typedef struct +{ + int8 id; /* type field --- must be first */ + Oid dbId; /* database ID, or 0 if a shared relation */ + Oid relId; /* relation ID */ +} SharedInvalSnapshotMsg; + +typedef union +{ + int8 id; /* type field --- must be first */ + SharedInvalCatcacheMsg cc; + SharedInvalCatalogMsg cat; + SharedInvalRelcacheMsg rc; + SharedInvalSmgrMsg sm; + SharedInvalRelmapMsg rm; + SharedInvalSnapshotMsg sn; +} SharedInvalidationMessage; + + +/* Counter of messages processed; don't worry about overflow. */ +extern PGDLLIMPORT uint64 SharedInvalidMessageCounter; + +extern PGDLLIMPORT volatile sig_atomic_t catchupInterruptPending; + +extern void SendSharedInvalidMessages(const SharedInvalidationMessage *msgs, + int n); +extern void ReceiveSharedInvalidMessages(void (*invalFunction) (SharedInvalidationMessage *msg), + void (*resetFunction) (void)); + +/* signal handler for catchup events (PROCSIG_CATCHUP_INTERRUPT) */ +extern void HandleCatchupInterrupt(void); + +/* + * enable/disable processing of catchup events directly from signal handler. + * The enable routine first performs processing of any catchup events that + * have occurred since the last disable. + */ +extern void ProcessCatchupInterrupt(void); + +extern int xactGetCommittedInvalidationMessages(SharedInvalidationMessage **msgs, + bool *RelcacheInitFileInval); +extern void ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs, + int nmsgs, bool RelcacheInitFileInval, + Oid dbid, Oid tsid); + +extern void LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg); + +#endif /* SINVAL_H */ diff --git a/src/include/storage/sinvaladt.h b/src/include/storage/sinvaladt.h new file mode 100644 index 0000000..91e2418 --- /dev/null +++ b/src/include/storage/sinvaladt.h @@ -0,0 +1,43 @@ +/*------------------------------------------------------------------------- + * + * sinvaladt.h + * POSTGRES shared cache invalidation data manager. + * + * The shared cache invalidation manager is responsible for transmitting + * invalidation messages between backends. Any message sent by any backend + * must be delivered to all already-running backends before it can be + * forgotten. (If we run out of space, we instead deliver a "RESET" + * message to backends that have fallen too far behind.) + * + * The struct type SharedInvalidationMessage, defining the contents of + * a single message, is defined in sinval.h. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/sinvaladt.h + * + *------------------------------------------------------------------------- + */ +#ifndef SINVALADT_H +#define SINVALADT_H + +#include "storage/lock.h" +#include "storage/sinval.h" + +/* + * prototypes for functions in sinvaladt.c + */ +extern Size SInvalShmemSize(void); +extern void CreateSharedInvalidationState(void); +extern void SharedInvalBackendInit(bool sendOnly); +extern PGPROC *BackendIdGetProc(int backendID); +extern void BackendIdGetTransactionIds(int backendID, TransactionId *xid, TransactionId *xmin); + +extern void SIInsertDataEntries(const SharedInvalidationMessage *data, int n); +extern int SIGetDataEntries(SharedInvalidationMessage *data, int datasize); +extern void SICleanupQueue(bool callerHasWriteLock, int minFree); + +extern LocalTransactionId GetNextLocalTransactionId(void); + +#endif /* SINVALADT_H */ diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h new file mode 100644 index 0000000..6b63c60 --- /dev/null +++ b/src/include/storage/smgr.h @@ -0,0 +1,111 @@ +/*------------------------------------------------------------------------- + * + * smgr.h + * storage manager switch public interface declarations. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/smgr.h + * + *------------------------------------------------------------------------- + */ +#ifndef SMGR_H +#define SMGR_H + +#include "lib/ilist.h" +#include "storage/block.h" +#include "storage/relfilenode.h" + +/* + * smgr.c maintains a table of SMgrRelation objects, which are essentially + * cached file handles. An SMgrRelation is created (if not already present) + * by smgropen(), and destroyed by smgrclose(). Note that neither of these + * operations imply I/O, they just create or destroy a hashtable entry. + * (But smgrclose() may release associated resources, such as OS-level file + * descriptors.) + * + * An SMgrRelation may have an "owner", which is just a pointer to it from + * somewhere else; smgr.c will clear this pointer if the SMgrRelation is + * closed. We use this to avoid dangling pointers from relcache to smgr + * without having to make the smgr explicitly aware of relcache. There + * can't be more than one "owner" pointer per SMgrRelation, but that's + * all we need. + * + * SMgrRelations that do not have an "owner" are considered to be transient, + * and are deleted at end of transaction. + */ +typedef struct SMgrRelationData +{ + /* rnode is the hashtable lookup key, so it must be first! */ + RelFileNodeBackend smgr_rnode; /* relation physical identifier */ + + /* pointer to owning pointer, or NULL if none */ + struct SMgrRelationData **smgr_owner; + + /* + * The following fields are reset to InvalidBlockNumber upon a cache flush + * event, and hold the last known size for each fork. This information is + * currently only reliable during recovery, since there is no cache + * invalidation for fork extension. + */ + BlockNumber smgr_targblock; /* current insertion target block */ + BlockNumber smgr_cached_nblocks[MAX_FORKNUM + 1]; /* last known size */ + + /* additional public fields may someday exist here */ + + /* + * Fields below here are intended to be private to smgr.c and its + * submodules. Do not touch them from elsewhere. + */ + int smgr_which; /* storage manager selector */ + + /* + * for md.c; per-fork arrays of the number of open segments + * (md_num_open_segs) and the segments themselves (md_seg_fds). + */ + int md_num_open_segs[MAX_FORKNUM + 1]; + struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1]; + + /* if unowned, list link in list of all unowned SMgrRelations */ + dlist_node node; +} SMgrRelationData; + +typedef SMgrRelationData *SMgrRelation; + +#define SmgrIsTemp(smgr) \ + RelFileNodeBackendIsTemp((smgr)->smgr_rnode) + +extern void smgrinit(void); +extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend); +extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); +extern void smgrsetowner(SMgrRelation *owner, SMgrRelation reln); +extern void smgrclearowner(SMgrRelation *owner, SMgrRelation reln); +extern void smgrclose(SMgrRelation reln); +extern void smgrcloseall(void); +extern void smgrclosenode(RelFileNodeBackend rnode); +extern void smgrrelease(SMgrRelation reln); +extern void smgrreleaseall(void); +extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern void smgrdosyncall(SMgrRelation *rels, int nrels); +extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo); +extern void smgrextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void smgrread(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer); +extern void smgrwrite(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum); +extern BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum); +extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, + int nforks, BlockNumber *nblocks); +extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum); +extern void AtEOXact_SMgr(void); +extern bool ProcessBarrierSmgrRelease(void); + +#endif /* SMGR_H */ diff --git a/src/include/storage/spin.h b/src/include/storage/spin.h new file mode 100644 index 0000000..7031f1d --- /dev/null +++ b/src/include/storage/spin.h @@ -0,0 +1,77 @@ +/*------------------------------------------------------------------------- + * + * spin.h + * Hardware-independent implementation of spinlocks. + * + * + * The hardware-independent interface to spinlocks is defined by the + * typedef "slock_t" and these macros: + * + * void SpinLockInit(volatile slock_t *lock) + * Initialize a spinlock (to the unlocked state). + * + * void SpinLockAcquire(volatile slock_t *lock) + * Acquire a spinlock, waiting if necessary. + * Time out and abort() if unable to acquire the lock in a + * "reasonable" amount of time --- typically ~ 1 minute. + * + * void SpinLockRelease(volatile slock_t *lock) + * Unlock a previously acquired lock. + * + * bool SpinLockFree(slock_t *lock) + * Tests if the lock is free. Returns true if free, false if locked. + * This does *not* change the state of the lock. + * + * Callers must beware that the macro argument may be evaluated multiple + * times! + * + * Load and store operations in calling code are guaranteed not to be + * reordered with respect to these operations, because they include a + * compiler barrier. (Before PostgreSQL 9.5, callers needed to use a + * volatile qualifier to access data protected by spinlocks.) + * + * Keep in mind the coding rule that spinlocks must not be held for more + * than a few instructions. In particular, we assume it is not possible + * for a CHECK_FOR_INTERRUPTS() to occur while holding a spinlock, and so + * it is not necessary to do HOLD/RESUME_INTERRUPTS() in these macros. + * + * These macros are implemented in terms of hardware-dependent macros + * supplied by s_lock.h. There is not currently any extra functionality + * added by this header, but there has been in the past and may someday + * be again. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/spin.h + * + *------------------------------------------------------------------------- + */ +#ifndef SPIN_H +#define SPIN_H + +#include "storage/s_lock.h" +#ifndef HAVE_SPINLOCKS +#include "storage/pg_sema.h" +#endif + + +#define SpinLockInit(lock) S_INIT_LOCK(lock) + +#define SpinLockAcquire(lock) S_LOCK(lock) + +#define SpinLockRelease(lock) S_UNLOCK(lock) + +#define SpinLockFree(lock) S_LOCK_FREE(lock) + + +extern int SpinlockSemas(void); +extern Size SpinlockSemaSize(void); + +#ifndef HAVE_SPINLOCKS +extern void SpinlockSemaInit(void); +extern PGDLLIMPORT PGSemaphore *SpinlockSemaArray; +#endif + +#endif /* SPIN_H */ diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h new file mode 100644 index 0000000..6a77632 --- /dev/null +++ b/src/include/storage/standby.h @@ -0,0 +1,98 @@ +/*------------------------------------------------------------------------- + * + * standby.h + * Definitions for hot standby mode. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/standby.h + * + *------------------------------------------------------------------------- + */ +#ifndef STANDBY_H +#define STANDBY_H + +#include "datatype/timestamp.h" +#include "storage/lock.h" +#include "storage/procsignal.h" +#include "storage/relfilenode.h" +#include "storage/standbydefs.h" + +/* User-settable GUC parameters */ +extern PGDLLIMPORT int vacuum_defer_cleanup_age; +extern PGDLLIMPORT int max_standby_archive_delay; +extern PGDLLIMPORT int max_standby_streaming_delay; +extern PGDLLIMPORT bool log_recovery_conflict_waits; + +extern void InitRecoveryTransactionEnvironment(void); +extern void ShutdownRecoveryTransactionEnvironment(void); + +extern void ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, + RelFileNode node); +extern void ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid, + RelFileNode node); +extern void ResolveRecoveryConflictWithTablespace(Oid tsid); +extern void ResolveRecoveryConflictWithDatabase(Oid dbid); + +extern void ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict); +extern void ResolveRecoveryConflictWithBufferPin(void); +extern void CheckRecoveryConflictDeadlock(void); +extern void StandbyDeadLockHandler(void); +extern void StandbyTimeoutHandler(void); +extern void StandbyLockTimeoutHandler(void); +extern void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, + TimestampTz cur_ts, VirtualTransactionId *wait_list, + bool still_waiting); + +/* + * Standby Rmgr (RM_STANDBY_ID) + * + * Standby recovery manager exists to perform actions that are required + * to make hot standby work. That includes logging AccessExclusiveLocks taken + * by transactions and running-xacts snapshots. + */ +extern void StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid); +extern void StandbyReleaseLockTree(TransactionId xid, + int nsubxids, TransactionId *subxids); +extern void StandbyReleaseAllLocks(void); +extern void StandbyReleaseOldLocks(TransactionId oldxid); + +#define MinSizeOfXactRunningXacts offsetof(xl_running_xacts, xids) + + +/* + * Declarations for GetRunningTransactionData(). Similar to Snapshots, but + * not quite. This has nothing at all to do with visibility on this server, + * so this is completely separate from snapmgr.c and snapmgr.h. + * This data is important for creating the initial snapshot state on a + * standby server. We need lots more information than a normal snapshot, + * hence we use a specific data structure for our needs. This data + * is written to WAL as a separate record immediately after each + * checkpoint. That means that wherever we start a standby from we will + * almost immediately see the data we need to begin executing queries. + */ + +typedef struct RunningTransactionsData +{ + int xcnt; /* # of xact ids in xids[] */ + int subxcnt; /* # of subxact ids in xids[] */ + bool subxid_overflow; /* snapshot overflowed, subxids missing */ + TransactionId nextXid; /* xid from ShmemVariableCache->nextXid */ + TransactionId oldestRunningXid; /* *not* oldestXmin */ + TransactionId latestCompletedXid; /* so we can set xmax */ + + TransactionId *xids; /* array of (sub)xids still running */ +} RunningTransactionsData; + +typedef RunningTransactionsData *RunningTransactions; + +extern void LogAccessExclusiveLock(Oid dbOid, Oid relOid); +extern void LogAccessExclusiveLockPrepare(void); + +extern XLogRecPtr LogStandbySnapshot(void); +extern void LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs, + bool relcacheInitFileInval); + +#endif /* STANDBY_H */ diff --git a/src/include/storage/standbydefs.h b/src/include/storage/standbydefs.h new file mode 100644 index 0000000..c0234b6 --- /dev/null +++ b/src/include/storage/standbydefs.h @@ -0,0 +1,74 @@ +/*------------------------------------------------------------------------- + * + * standbydefs.h + * Frontend exposed definitions for hot standby mode. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/standbydefs.h + * + *------------------------------------------------------------------------- + */ +#ifndef STANDBYDEFS_H +#define STANDBYDEFS_H + +#include "access/xlogreader.h" +#include "lib/stringinfo.h" +#include "storage/lockdefs.h" +#include "storage/sinval.h" + +/* Recovery handlers for the Standby Rmgr (RM_STANDBY_ID) */ +extern void standby_redo(XLogReaderState *record); +extern void standby_desc(StringInfo buf, XLogReaderState *record); +extern const char *standby_identify(uint8 info); +extern void standby_desc_invalidations(StringInfo buf, + int nmsgs, SharedInvalidationMessage *msgs, + Oid dbId, Oid tsId, + bool relcacheInitFileInval); + +/* + * XLOG message types + */ +#define XLOG_STANDBY_LOCK 0x00 +#define XLOG_RUNNING_XACTS 0x10 +#define XLOG_INVALIDATIONS 0x20 + +typedef struct xl_standby_locks +{ + int nlocks; /* number of entries in locks array */ + xl_standby_lock locks[FLEXIBLE_ARRAY_MEMBER]; +} xl_standby_locks; + +/* + * When we write running xact data to WAL, we use this structure. + */ +typedef struct xl_running_xacts +{ + int xcnt; /* # of xact ids in xids[] */ + int subxcnt; /* # of subxact ids in xids[] */ + bool subxid_overflow; /* snapshot overflowed, subxids missing */ + TransactionId nextXid; /* xid from ShmemVariableCache->nextXid */ + TransactionId oldestRunningXid; /* *not* oldestXmin */ + TransactionId latestCompletedXid; /* so we can set xmax */ + + TransactionId xids[FLEXIBLE_ARRAY_MEMBER]; +} xl_running_xacts; + +/* + * Invalidations for standby, currently only when transactions without an + * assigned xid commit. + */ +typedef struct xl_invalidations +{ + Oid dbId; /* MyDatabaseId */ + Oid tsId; /* MyDatabaseTableSpace */ + bool relcacheInitFileInval; /* invalidate relcache init files */ + int nmsgs; /* number of shared inval msgs */ + SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER]; +} xl_invalidations; + +#define MinSizeOfInvalidations offsetof(xl_invalidations, msgs) + +#endif /* STANDBYDEFS_H */ diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h new file mode 100644 index 0000000..9737e1e --- /dev/null +++ b/src/include/storage/sync.h @@ -0,0 +1,66 @@ +/*------------------------------------------------------------------------- + * + * sync.h + * File synchronization management code. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/sync.h + * + *------------------------------------------------------------------------- + */ +#ifndef SYNC_H +#define SYNC_H + +#include "storage/relfilenode.h" + +/* + * Type of sync request. These are used to manage the set of pending + * requests to call a sync handler's sync or unlink functions at the next + * checkpoint. + */ +typedef enum SyncRequestType +{ + SYNC_REQUEST, /* schedule a call of sync function */ + SYNC_UNLINK_REQUEST, /* schedule a call of unlink function */ + SYNC_FORGET_REQUEST, /* forget all calls for a tag */ + SYNC_FILTER_REQUEST /* forget all calls satisfying match fn */ +} SyncRequestType; + +/* + * Which set of functions to use to handle a given request. The values of + * the enumerators must match the indexes of the function table in sync.c. + */ +typedef enum SyncRequestHandler +{ + SYNC_HANDLER_MD = 0, + SYNC_HANDLER_CLOG, + SYNC_HANDLER_COMMIT_TS, + SYNC_HANDLER_MULTIXACT_OFFSET, + SYNC_HANDLER_MULTIXACT_MEMBER, + SYNC_HANDLER_NONE +} SyncRequestHandler; + +/* + * A tag identifying a file. Currently it has the members required for md.c's + * usage, but sync.c has no knowledge of the internal structure, and it is + * liable to change as required by future handlers. + */ +typedef struct FileTag +{ + int16 handler; /* SyncRequestHandler value, saving space */ + int16 forknum; /* ForkNumber, saving space */ + RelFileNode rnode; + uint32 segno; +} FileTag; + +extern void InitSync(void); +extern void SyncPreCheckpoint(void); +extern void SyncPostCheckpoint(void); +extern void ProcessSyncRequests(void); +extern void RememberSyncRequest(const FileTag *ftag, SyncRequestType type); +extern bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, + bool retryOnError); + +#endif /* SYNC_H */ |