path: root/src/include/storage
author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:17:33 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:17:33 +0000
commit     5e45211a64149b3c659b90ff2de6fa982a5a93ed (patch)
tree       739caf8c461053357daa9f162bef34516c7bf452 /src/include/storage
parent     Initial commit. (diff)
download   postgresql-15-5e45211a64149b3c659b90ff2de6fa982a5a93ed.tar.xz
           postgresql-15-5e45211a64149b3c659b90ff2de6fa982a5a93ed.zip
Adding upstream version 15.5. (upstream/15.5)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/include/storage')
-rw-r--r--  src/include/storage/.gitignore  1
-rw-r--r--  src/include/storage/backendid.h  37
-rw-r--r--  src/include/storage/barrier.h  46
-rw-r--r--  src/include/storage/block.h  115
-rw-r--r--  src/include/storage/buf.h  46
-rw-r--r--  src/include/storage/buf_internals.h  345
-rw-r--r--  src/include/storage/buffile.h  57
-rw-r--r--  src/include/storage/bufmgr.h  297
-rw-r--r--  src/include/storage/bufpage.h  457
-rw-r--r--  src/include/storage/checksum.h  24
-rw-r--r--  src/include/storage/checksum_impl.h  215
-rw-r--r--  src/include/storage/condition_variable.h  73
-rw-r--r--  src/include/storage/copydir.h  19
-rw-r--r--  src/include/storage/dsm.h  64
-rw-r--r--  src/include/storage/dsm_impl.h  76
-rw-r--r--  src/include/storage/fd.h  198
-rw-r--r--  src/include/storage/fileset.h  40
-rw-r--r--  src/include/storage/freespace.h  39
-rw-r--r--  src/include/storage/fsm_internals.h  72
-rw-r--r--  src/include/storage/indexfsm.h  26
-rw-r--r--  src/include/storage/ipc.h  84
-rw-r--r--  src/include/storage/item.h  19
-rw-r--r--  src/include/storage/itemid.h  184
-rw-r--r--  src/include/storage/itemptr.h  208
-rw-r--r--  src/include/storage/large_object.h  100
-rw-r--r--  src/include/storage/latch.h  186
-rw-r--r--  src/include/storage/lmgr.h  115
-rw-r--r--  src/include/storage/lock.h  616
-rw-r--r--  src/include/storage/lockdefs.h  59
-rw-r--r--  src/include/storage/lwlock.h  206
-rw-r--r--  src/include/storage/md.h  52
-rw-r--r--  src/include/storage/off.h  57
-rw-r--r--  src/include/storage/pg_sema.h  61
-rw-r--r--  src/include/storage/pg_shmem.h  92
-rw-r--r--  src/include/storage/pmsignal.h  105
-rw-r--r--  src/include/storage/predicate.h  87
-rw-r--r--  src/include/storage/predicate_internals.h  494
-rw-r--r--  src/include/storage/proc.h  461
-rw-r--r--  src/include/storage/procarray.h  99
-rw-r--r--  src/include/storage/proclist.h  219
-rw-r--r--  src/include/storage/proclist_types.h  51
-rw-r--r--  src/include/storage/procsignal.h  71
-rw-r--r--  src/include/storage/reinit.h  28
-rw-r--r--  src/include/storage/relfilenode.h  99
-rw-r--r--  src/include/storage/s_lock.h  1110
-rw-r--r--  src/include/storage/sharedfileset.h  37
-rw-r--r--  src/include/storage/shm_mq.h  86
-rw-r--r--  src/include/storage/shm_toc.h  58
-rw-r--r--  src/include/storage/shmem.h  81
-rw-r--r--  src/include/storage/sinval.h  153
-rw-r--r--  src/include/storage/sinvaladt.h  43
-rw-r--r--  src/include/storage/smgr.h  111
-rw-r--r--  src/include/storage/spin.h  77
-rw-r--r--  src/include/storage/standby.h  98
-rw-r--r--  src/include/storage/standbydefs.h  74
-rw-r--r--  src/include/storage/sync.h  66
56 files changed, 7994 insertions, 0 deletions
diff --git a/src/include/storage/.gitignore b/src/include/storage/.gitignore
new file mode 100644
index 0000000..209c8be
--- /dev/null
+++ b/src/include/storage/.gitignore
@@ -0,0 +1 @@
+/lwlocknames.h
diff --git a/src/include/storage/backendid.h b/src/include/storage/backendid.h
new file mode 100644
index 0000000..93d5b50
--- /dev/null
+++ b/src/include/storage/backendid.h
@@ -0,0 +1,37 @@
+/*-------------------------------------------------------------------------
+ *
+ * backendid.h
+ * POSTGRES backend id communication definitions
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/backendid.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BACKENDID_H
+#define BACKENDID_H
+
+/* ----------------
+ * -cim 8/17/90
+ * ----------------
+ */
+typedef int BackendId; /* unique currently active backend identifier */
+
+#define InvalidBackendId (-1)
+
+extern PGDLLIMPORT BackendId MyBackendId; /* backend id of this backend */
+
+/* backend id of our parallel session leader, or InvalidBackendId if none */
+extern PGDLLIMPORT BackendId ParallelLeaderBackendId;
+
+/*
+ * The BackendId to use for our session's temp relations is normally our own,
+ * but parallel workers should use their leader's ID.
+ */
+#define BackendIdForTempRelations() \
+ (ParallelLeaderBackendId == InvalidBackendId ? MyBackendId : ParallelLeaderBackendId)
+
+#endif /* BACKENDID_H */
diff --git a/src/include/storage/barrier.h b/src/include/storage/barrier.h
new file mode 100644
index 0000000..57d2c52
--- /dev/null
+++ b/src/include/storage/barrier.h
@@ -0,0 +1,46 @@
+/*-------------------------------------------------------------------------
+ *
+ * barrier.h
+ * Barriers for synchronizing cooperating processes.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/barrier.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BARRIER_H
+#define BARRIER_H
+
+/*
+ * For the header previously known as "barrier.h", please include
+ * "port/atomics.h", which deals with atomics, compiler barriers and memory
+ * barriers.
+ */
+
+#include "storage/condition_variable.h"
+#include "storage/spin.h"
+
+typedef struct Barrier
+{
+ slock_t mutex;
+ int phase; /* phase counter */
+ int participants; /* the number of participants attached */
+ int arrived; /* the number of participants that have
+ * arrived */
+ int elected; /* highest phase elected */
+ bool static_party; /* used only for assertions */
+ ConditionVariable condition_variable;
+} Barrier;
+
+extern void BarrierInit(Barrier *barrier, int num_workers);
+extern bool BarrierArriveAndWait(Barrier *barrier, uint32 wait_event_info);
+extern bool BarrierArriveAndDetach(Barrier *barrier);
+extern bool BarrierArriveAndDetachExceptLast(Barrier *barrier);
+extern int BarrierAttach(Barrier *barrier);
+extern bool BarrierDetach(Barrier *barrier);
+extern int BarrierPhase(Barrier *barrier);
+extern int BarrierParticipants(Barrier *barrier);
+
+#endif /* BARRIER_H */
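
For orientation, here is a minimal usage sketch (not taken from the patch itself) of the Barrier API declared above, as a dynamically attached worker might use it; worker_main is a hypothetical name, and 0 merely stands in for a real wait-event identifier:

#include "postgres.h"
#include "storage/barrier.h"

static void
worker_main(Barrier *barrier)
{
	int			phase = BarrierAttach(barrier);		/* join the party */

	/* ... perform the work belonging to "phase" here ... */
	(void) phase;

	/*
	 * Wait until every attached participant has arrived; one caller is
	 * expected to see "true" and can run any serial follow-up work.
	 */
	if (BarrierArriveAndWait(barrier, 0))
	{
		/* ... one-time work for this phase ... */
	}

	BarrierDetach(barrier);							/* leave the party */
}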
diff --git a/src/include/storage/block.h b/src/include/storage/block.h
new file mode 100644
index 0000000..d756e3f
--- /dev/null
+++ b/src/include/storage/block.h
@@ -0,0 +1,115 @@
+/*-------------------------------------------------------------------------
+ *
+ * block.h
+ * POSTGRES disk block definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/block.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BLOCK_H
+#define BLOCK_H
+
+/*
+ * BlockNumber:
+ *
+ * each data file (heap or index) is divided into postgres disk blocks
+ * (which may be thought of as the unit of i/o -- a postgres buffer
+ * contains exactly one disk block). the blocks are numbered
+ * sequentially, 0 to 0xFFFFFFFE.
+ *
+ * InvalidBlockNumber is the same thing as P_NEW in bufmgr.h.
+ *
+ * the access methods, the buffer manager and the storage manager are
+ * more or less the only pieces of code that should be accessing disk
+ * blocks directly.
+ */
+typedef uint32 BlockNumber;
+
+#define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF)
+
+#define MaxBlockNumber ((BlockNumber) 0xFFFFFFFE)
+
+/*
+ * BlockId:
+ *
+ * this is a storage type for BlockNumber. in other words, this type
+ * is used for on-disk structures (e.g., in HeapTupleData) whereas
+ * BlockNumber is the type on which calculations are performed (e.g.,
+ * in access method code).
+ *
+ * there doesn't appear to be any reason to have separate types except
+ * for the fact that BlockIds can be SHORTALIGN'd (and therefore any
+ * structures that contains them, such as ItemPointerData, can also be
+ * SHORTALIGN'd). this is an important consideration for reducing the
+ * space requirements of the line pointer (ItemIdData) array on each
+ * page and the header of each heap or index tuple, so it doesn't seem
+ * wise to change this without good reason.
+ */
+typedef struct BlockIdData
+{
+ uint16 bi_hi;
+ uint16 bi_lo;
+} BlockIdData;
+
+typedef BlockIdData *BlockId; /* block identifier */
+
+/* ----------------
+ * support macros
+ * ----------------
+ */
+
+/*
+ * BlockNumberIsValid
+ * True iff blockNumber is valid.
+ */
+#define BlockNumberIsValid(blockNumber) \
+ ((BlockNumber) (blockNumber) != InvalidBlockNumber)
+
+/*
+ * BlockIdIsValid
+ * True iff the block identifier is valid.
+ */
+#define BlockIdIsValid(blockId) \
+ PointerIsValid(blockId)
+
+/*
+ * BlockIdSet
+ * Sets a block identifier to the specified value.
+ */
+#define BlockIdSet(blockId, blockNumber) \
+( \
+ (blockId)->bi_hi = (blockNumber) >> 16, \
+ (blockId)->bi_lo = (blockNumber) & 0xffff \
+)
+
+/*
+ * BlockIdCopy
+ * Copy a block identifier.
+ */
+#define BlockIdCopy(toBlockId, fromBlockId) \
+( \
+ (toBlockId)->bi_hi = (fromBlockId)->bi_hi, \
+ (toBlockId)->bi_lo = (fromBlockId)->bi_lo \
+)
+
+/*
+ * BlockIdEquals
+ * Check for block number equality.
+ */
+#define BlockIdEquals(blockId1, blockId2) \
+ ((blockId1)->bi_hi == (blockId2)->bi_hi && \
+ (blockId1)->bi_lo == (blockId2)->bi_lo)
+
+/*
+ * BlockIdGetBlockNumber
+ * Retrieve the block number from a block identifier.
+ */
+#define BlockIdGetBlockNumber(blockId) \
+ ((((BlockNumber) (blockId)->bi_hi) << 16) | ((BlockNumber) (blockId)->bi_lo))
+
+#endif /* BLOCK_H */
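
As a worked illustration of the macros above (not part of the header), the following sketch packs an arbitrary block number into the two 16-bit halves and reads it back; blockid_roundtrip is a hypothetical name:

#include "postgres.h"
#include "storage/block.h"

static void
blockid_roundtrip(void)
{
	BlockIdData id;
	BlockNumber blkno = 0x0001F2E3;

	BlockIdSet(&id, blkno);			/* bi_hi = 0x0001, bi_lo = 0xF2E3 */
	Assert(BlockIdIsValid(&id));
	Assert(BlockIdGetBlockNumber(&id) == blkno);
}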
diff --git a/src/include/storage/buf.h b/src/include/storage/buf.h
new file mode 100644
index 0000000..aec01ca
--- /dev/null
+++ b/src/include/storage/buf.h
@@ -0,0 +1,46 @@
+/*-------------------------------------------------------------------------
+ *
+ * buf.h
+ * Basic buffer manager data types.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/buf.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BUF_H
+#define BUF_H
+
+/*
+ * Buffer identifiers.
+ *
+ * Zero is invalid, positive is the index of a shared buffer (1..NBuffers),
+ * negative is the index of a local buffer (-1 .. -NLocBuffer).
+ */
+typedef int Buffer;
+
+#define InvalidBuffer 0
+
+/*
+ * BufferIsInvalid
+ * True iff the buffer is invalid.
+ */
+#define BufferIsInvalid(buffer) ((buffer) == InvalidBuffer)
+
+/*
+ * BufferIsLocal
+ * True iff the buffer is local (not visible to other backends).
+ */
+#define BufferIsLocal(buffer) ((buffer) < 0)
+
+/*
+ * Buffer access strategy objects.
+ *
+ * BufferAccessStrategyData is private to freelist.c
+ */
+typedef struct BufferAccessStrategyData *BufferAccessStrategy;
+
+#endif /* BUF_H */
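
A small illustrative helper (hypothetical, not part of the header) that classifies a Buffer according to the sign convention described above:

#include "postgres.h"
#include "storage/buf.h"

static const char *
buffer_kind(Buffer buf)
{
	if (BufferIsInvalid(buf))
		return "invalid";
	if (BufferIsLocal(buf))
		return "local";			/* local descriptor index is -buf - 1 */
	return "shared";			/* shared descriptor index is buf - 1 */
}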
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
new file mode 100644
index 0000000..a17e7b2
--- /dev/null
+++ b/src/include/storage/buf_internals.h
@@ -0,0 +1,345 @@
+/*-------------------------------------------------------------------------
+ *
+ * buf_internals.h
+ * Internal definitions for buffer manager and the buffer replacement
+ * strategy.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/buf_internals.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BUFMGR_INTERNALS_H
+#define BUFMGR_INTERNALS_H
+
+#include "port/atomics.h"
+#include "storage/buf.h"
+#include "storage/bufmgr.h"
+#include "storage/condition_variable.h"
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "storage/smgr.h"
+#include "storage/spin.h"
+#include "utils/relcache.h"
+
+/*
+ * Buffer state is a single 32-bit variable where following data is combined.
+ *
+ * - 18 bits refcount
+ * - 4 bits usage count
+ * - 10 bits of flags
+ *
+ * Combining these values allows us to perform some operations without locking
+ * the buffer header, by modifying them together with a CAS loop.
+ *
+ * The definition of buffer state components is below.
+ */
+#define BUF_REFCOUNT_ONE 1
+#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
+#define BUF_USAGECOUNT_MASK 0x003C0000U
+#define BUF_USAGECOUNT_ONE (1U << 18)
+#define BUF_USAGECOUNT_SHIFT 18
+#define BUF_FLAG_MASK 0xFFC00000U
+
+/* Get refcount and usagecount from buffer state */
+#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
+#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
+
+/*
+ * Flags for buffer descriptors
+ *
+ * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
+ * entry associated with the buffer's tag.
+ */
+#define BM_LOCKED (1U << 22) /* buffer header is locked */
+#define BM_DIRTY (1U << 23) /* data needs writing */
+#define BM_VALID (1U << 24) /* data is valid */
+#define BM_TAG_VALID (1U << 25) /* tag is assigned */
+#define BM_IO_IN_PROGRESS (1U << 26) /* read or write in progress */
+#define BM_IO_ERROR (1U << 27) /* previous I/O failed */
+#define BM_JUST_DIRTIED (1U << 28) /* dirtied since write started */
+#define BM_PIN_COUNT_WAITER (1U << 29) /* have waiter for sole pin */
+#define BM_CHECKPOINT_NEEDED (1U << 30) /* must write for checkpoint */
+#define BM_PERMANENT (1U << 31) /* permanent buffer (not unlogged,
+ * or init fork) */
+/*
+ * The maximum allowed value of usage_count represents a tradeoff between
+ * accuracy and speed of the clock-sweep buffer management algorithm. A
+ * large value (comparable to NBuffers) would approximate LRU semantics.
+ * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
+ * clock sweeps to find a free buffer, so in practice we don't want the
+ * value to be very large.
+ */
+#define BM_MAX_USAGE_COUNT 5
+
+/*
+ * Buffer tag identifies which disk block the buffer contains.
+ *
+ * Note: the BufferTag data must be sufficient to determine where to write the
+ * block, without reference to pg_class or pg_tablespace entries. It's
+ * possible that the backend flushing the buffer doesn't even believe the
+ * relation is visible yet (its xact may have started before the xact that
+ * created the rel). The storage manager must be able to cope anyway.
+ *
+ * Note: if there are any pad bytes in the struct, INIT_BUFFERTAG will have
+ * to be fixed to zero them, since this struct is used as a hash key.
+ */
+typedef struct buftag
+{
+ RelFileNode rnode; /* physical relation identifier */
+ ForkNumber forkNum;
+ BlockNumber blockNum; /* blknum relative to begin of reln */
+} BufferTag;
+
+#define CLEAR_BUFFERTAG(a) \
+( \
+ (a).rnode.spcNode = InvalidOid, \
+ (a).rnode.dbNode = InvalidOid, \
+ (a).rnode.relNode = InvalidOid, \
+ (a).forkNum = InvalidForkNumber, \
+ (a).blockNum = InvalidBlockNumber \
+)
+
+#define INIT_BUFFERTAG(a,xx_rnode,xx_forkNum,xx_blockNum) \
+( \
+ (a).rnode = (xx_rnode), \
+ (a).forkNum = (xx_forkNum), \
+ (a).blockNum = (xx_blockNum) \
+)
+
+#define BUFFERTAGS_EQUAL(a,b) \
+( \
+ RelFileNodeEquals((a).rnode, (b).rnode) && \
+ (a).blockNum == (b).blockNum && \
+ (a).forkNum == (b).forkNum \
+)
+
+/*
+ * The shared buffer mapping table is partitioned to reduce contention.
+ * To determine which partition lock a given tag requires, compute the tag's
+ * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
+ * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
+ */
+#define BufTableHashPartition(hashcode) \
+ ((hashcode) % NUM_BUFFER_PARTITIONS)
+#define BufMappingPartitionLock(hashcode) \
+ (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \
+ BufTableHashPartition(hashcode)].lock)
+#define BufMappingPartitionLockByIndex(i) \
+ (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock)
+
+/*
+ * BufferDesc -- shared descriptor/state data for a single shared buffer.
+ *
+ * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
+ * tag, state or wait_backend_pgprocno fields. In general, buffer header lock
+ * is a spinlock which is combined with flags, refcount and usagecount into
+ * single atomic variable. This layout allow us to do some operations in a
+ * single atomic operation, without actually acquiring and releasing spinlock;
+ * for instance, increase or decrease refcount. buf_id field never changes
+ * after initialization, so does not need locking. freeNext is protected by
+ * the buffer_strategy_lock not buffer header lock. The LWLock can take care
+ * of itself. The buffer header lock is *not* used to control access to the
+ * data in the buffer!
+ *
+ * It's assumed that nobody changes the state field while buffer header lock
+ * is held. Thus buffer header lock holder can do complex updates of the
+ * state variable in single write, simultaneously with lock release (cleaning
+ * BM_LOCKED flag). On the other hand, updating of state without holding
+ * buffer header lock is restricted to CAS, which insure that BM_LOCKED flag
+ * is not set. Atomic increment/decrement, OR/AND etc. are not allowed.
+ *
+ * An exception is that if we have the buffer pinned, its tag can't change
+ * underneath us, so we can examine the tag without locking the buffer header.
+ * Also, in places we do one-time reads of the flags without bothering to
+ * lock the buffer header; this is generally for situations where we don't
+ * expect the flag bit being tested to be changing.
+ *
+ * We can't physically remove items from a disk page if another backend has
+ * the buffer pinned. Hence, a backend may need to wait for all other pins
+ * to go away. This is signaled by storing its own pgprocno into
+ * wait_backend_pgprocno and setting flag bit BM_PIN_COUNT_WAITER. At present,
+ * there can be only one such waiter per buffer.
+ *
+ * We use this same struct for local buffer headers, but the locks are not
+ * used and not all of the flag bits are useful either. To avoid unnecessary
+ * overhead, manipulations of the state field should be done without actual
+ * atomic operations (i.e. only pg_atomic_read_u32() and
+ * pg_atomic_unlocked_write_u32()).
+ *
+ * Be careful to avoid increasing the size of the struct when adding or
+ * reordering members. Keeping it below 64 bytes (the most common CPU
+ * cache line size) is fairly important for performance.
+ *
+ * Per-buffer I/O condition variables are currently kept outside this struct in
+ * a separate array. They could be moved in here and still fit within that
+ * limit on common systems, but for now that is not done.
+ */
+typedef struct BufferDesc
+{
+ BufferTag tag; /* ID of page contained in buffer */
+ int buf_id; /* buffer's index number (from 0) */
+
+ /* state of the tag, containing flags, refcount and usagecount */
+ pg_atomic_uint32 state;
+
+ int wait_backend_pgprocno; /* backend of pin-count waiter */
+ int freeNext; /* link in freelist chain */
+ LWLock content_lock; /* to lock access to buffer contents */
+} BufferDesc;
+
+/*
+ * Concurrent access to buffer headers has proven to be more efficient if
+ * they're cache line aligned. So we force the start of the BufferDescriptors
+ * array to be on a cache line boundary and force the elements to be cache
+ * line sized.
+ *
+ * XXX: As this primarily matters in highly concurrent workloads, which
+ * probably all are 64bit these days, and the space wastage would be a bit
+ * more noticeable on 32bit systems, we don't force the stride to be cache
+ * line sized on those. If somebody does actual performance testing, we can
+ * reevaluate.
+ *
+ * Note that local buffer descriptors aren't forced to be aligned - as there's
+ * no concurrent access to those it's unlikely to be beneficial.
+ *
+ * We use a 64-byte cache line size here, because that's the most common
+ * size. Making it bigger would be a waste of memory. Even if running on a
+ * platform with either 32 or 128 byte line sizes, it's good to align to
+ * boundaries and avoid false sharing.
+ */
+#define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 64 : 1)
+
+typedef union BufferDescPadded
+{
+ BufferDesc bufferdesc;
+ char pad[BUFFERDESC_PAD_TO_SIZE];
+} BufferDescPadded;
+
+#define GetBufferDescriptor(id) (&BufferDescriptors[(id)].bufferdesc)
+#define GetLocalBufferDescriptor(id) (&LocalBufferDescriptors[(id)])
+
+#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
+
+#define BufferDescriptorGetIOCV(bdesc) \
+ (&(BufferIOCVArray[(bdesc)->buf_id]).cv)
+#define BufferDescriptorGetContentLock(bdesc) \
+ ((LWLock*) (&(bdesc)->content_lock))
+
+extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray;
+
+/*
+ * The freeNext field is either the index of the next freelist entry,
+ * or one of these special values:
+ */
+#define FREENEXT_END_OF_LIST (-1)
+#define FREENEXT_NOT_IN_LIST (-2)
+
+/*
+ * Functions for acquiring/releasing a shared buffer header's spinlock. Do
+ * not apply these to local buffers!
+ */
+extern uint32 LockBufHdr(BufferDesc *desc);
+#define UnlockBufHdr(desc, s) \
+ do { \
+ pg_write_barrier(); \
+ pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \
+ } while (0)
+
+
+/*
+ * The PendingWriteback & WritebackContext structure are used to keep
+ * information about pending flush requests to be issued to the OS.
+ */
+typedef struct PendingWriteback
+{
+ /* could store different types of pending flushes here */
+ BufferTag tag;
+} PendingWriteback;
+
+/* struct forward declared in bufmgr.h */
+typedef struct WritebackContext
+{
+ /* pointer to the max number of writeback requests to coalesce */
+ int *max_pending;
+
+ /* current number of pending writeback requests */
+ int nr_pending;
+
+ /* pending requests */
+ PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
+} WritebackContext;
+
+/* in buf_init.c */
+extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
+extern PGDLLIMPORT WritebackContext BackendWritebackContext;
+
+/* in localbuf.c */
+extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors;
+
+/* in bufmgr.c */
+
+/*
+ * Structure to sort buffers per file on checkpoints.
+ *
+ * This structure is allocated per buffer in shared memory, so it should be
+ * kept as small as possible.
+ */
+typedef struct CkptSortItem
+{
+ Oid tsId;
+ Oid relNode;
+ ForkNumber forkNum;
+ BlockNumber blockNum;
+ int buf_id;
+} CkptSortItem;
+
+extern PGDLLIMPORT CkptSortItem *CkptBufferIds;
+
+/*
+ * Internal buffer management routines
+ */
+/* bufmgr.c */
+extern void WritebackContextInit(WritebackContext *context, int *max_pending);
+extern void IssuePendingWritebacks(WritebackContext *context);
+extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);
+
+/* freelist.c */
+extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
+ uint32 *buf_state);
+extern void StrategyFreeBuffer(BufferDesc *buf);
+extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
+ BufferDesc *buf);
+
+extern int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
+extern void StrategyNotifyBgWriter(int bgwprocno);
+
+extern Size StrategyShmemSize(void);
+extern void StrategyInitialize(bool init);
+extern bool have_free_buffer(void);
+
+/* buf_table.c */
+extern Size BufTableShmemSize(int size);
+extern void InitBufTable(int size);
+extern uint32 BufTableHashCode(BufferTag *tagPtr);
+extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
+extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
+extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
+
+/* localbuf.c */
+extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr,
+ ForkNumber forkNum,
+ BlockNumber blockNum);
+extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
+ BlockNumber blockNum, bool *foundPtr);
+extern void MarkLocalBufferDirty(Buffer buffer);
+extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
+ BlockNumber firstDelBlock);
+extern void DropRelFileNodeAllLocalBuffers(RelFileNode rnode);
+extern void AtEOXact_LocalBuffers(bool isCommit);
+
+#endif /* BUFMGR_INTERNALS_H */
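
To make the packed state word concrete, here is a hedged sketch (describe_buffer_state is a hypothetical helper, not part of the header) that takes an unlocked snapshot of a shared buffer's state and pulls out the components defined above:

#include "postgres.h"
#include "storage/buf_internals.h"

static void
describe_buffer_state(BufferDesc *buf)
{
	uint32		state = pg_atomic_read_u32(&buf->state);

	elog(DEBUG1, "buf %d: refcount=%u usagecount=%u dirty=%d valid=%d",
		 buf->buf_id,
		 BUF_STATE_GET_REFCOUNT(state),		/* low 18 bits */
		 BUF_STATE_GET_USAGECOUNT(state),	/* next 4 bits */
		 (state & BM_DIRTY) != 0,			/* flags live in the top 10 bits */
		 (state & BM_VALID) != 0);
}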
diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h
new file mode 100644
index 0000000..a4922d1
--- /dev/null
+++ b/src/include/storage/buffile.h
@@ -0,0 +1,57 @@
+/*-------------------------------------------------------------------------
+ *
+ * buffile.h
+ * Management of large buffered temporary files.
+ *
+ * The BufFile routines provide a partial replacement for stdio atop
+ * virtual file descriptors managed by fd.c. Currently they only support
+ * buffered access to a virtual file, without any of stdio's formatting
+ * features. That's enough for immediate needs, but the set of facilities
+ * could be expanded if necessary.
+ *
+ * BufFile also supports working with temporary files that exceed the OS
+ * file size limit and/or the largest offset representable in an int.
+ * It might be better to split that out as a separately accessible module,
+ * but currently we have no need for oversize temp files without buffered
+ * access.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/buffile.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef BUFFILE_H
+#define BUFFILE_H
+
+#include "storage/fileset.h"
+
+/* BufFile is an opaque type whose details are not known outside buffile.c. */
+
+typedef struct BufFile BufFile;
+
+/*
+ * prototypes for functions in buffile.c
+ */
+
+extern BufFile *BufFileCreateTemp(bool interXact);
+extern void BufFileClose(BufFile *file);
+extern size_t BufFileRead(BufFile *file, void *ptr, size_t size);
+extern void BufFileWrite(BufFile *file, void *ptr, size_t size);
+extern int BufFileSeek(BufFile *file, int fileno, off_t offset, int whence);
+extern void BufFileTell(BufFile *file, int *fileno, off_t *offset);
+extern int BufFileSeekBlock(BufFile *file, long blknum);
+extern int64 BufFileSize(BufFile *file);
+extern long BufFileAppend(BufFile *target, BufFile *source);
+
+extern BufFile *BufFileCreateFileSet(FileSet *fileset, const char *name);
+extern void BufFileExportFileSet(BufFile *file);
+extern BufFile *BufFileOpenFileSet(FileSet *fileset, const char *name,
+ int mode, bool missing_ok);
+extern void BufFileDeleteFileSet(FileSet *fileset, const char *name,
+ bool missing_ok);
+extern void BufFileTruncateFileSet(BufFile *file, int fileno, off_t offset);
+
+#endif /* BUFFILE_H */
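
An illustrative create/write/rewind/read/close cycle with this API (a sketch for backend code, assuming BufFileSeek returns 0 on success; spill_and_reread is a hypothetical name):

#include "postgres.h"
#include "storage/buffile.h"

static void
spill_and_reread(void)
{
	char		out[] = "spill me";
	char		in[sizeof(out)];
	BufFile    *file = BufFileCreateTemp(false);	/* false: not inter-transaction */

	BufFileWrite(file, out, sizeof(out));

	if (BufFileSeek(file, 0, 0L, SEEK_SET) != 0)	/* rewind logical file 0 */
		elog(ERROR, "could not rewind temporary file");
	if (BufFileRead(file, in, sizeof(in)) != sizeof(in))
		elog(ERROR, "unexpected short read from temporary file");

	BufFileClose(file);
}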
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
new file mode 100644
index 0000000..5839140
--- /dev/null
+++ b/src/include/storage/bufmgr.h
@@ -0,0 +1,297 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufmgr.h
+ * POSTGRES buffer manager definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/bufmgr.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BUFMGR_H
+#define BUFMGR_H
+
+#include "storage/block.h"
+#include "storage/buf.h"
+#include "storage/bufpage.h"
+#include "storage/relfilenode.h"
+#include "utils/relcache.h"
+#include "utils/snapmgr.h"
+
+typedef void *Block;
+
+/* Possible arguments for GetAccessStrategy() */
+typedef enum BufferAccessStrategyType
+{
+ BAS_NORMAL, /* Normal random access */
+ BAS_BULKREAD, /* Large read-only scan (hint bit updates are
+ * ok) */
+ BAS_BULKWRITE, /* Large multi-block write (e.g. COPY IN) */
+ BAS_VACUUM /* VACUUM */
+} BufferAccessStrategyType;
+
+/* Possible modes for ReadBufferExtended() */
+typedef enum
+{
+ RBM_NORMAL, /* Normal read */
+ RBM_ZERO_AND_LOCK, /* Don't read from disk, caller will
+ * initialize. Also locks the page. */
+ RBM_ZERO_AND_CLEANUP_LOCK, /* Like RBM_ZERO_AND_LOCK, but locks the page
+ * in "cleanup" mode */
+ RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */
+ RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL
+ * replay; otherwise same as RBM_NORMAL */
+} ReadBufferMode;
+
+/*
+ * Type returned by PrefetchBuffer().
+ */
+typedef struct PrefetchBufferResult
+{
+ Buffer recent_buffer; /* If valid, a hit (recheck needed!) */
+ bool initiated_io; /* If true, a miss resulting in async I/O */
+} PrefetchBufferResult;
+
+/* forward declared, to avoid having to expose buf_internals.h here */
+struct WritebackContext;
+
+/* forward declared, to avoid including smgr.h here */
+struct SMgrRelationData;
+
+/* in globals.c ... this duplicates miscadmin.h */
+extern PGDLLIMPORT int NBuffers;
+
+/* in bufmgr.c */
+extern PGDLLIMPORT bool zero_damaged_pages;
+extern PGDLLIMPORT int bgwriter_lru_maxpages;
+extern PGDLLIMPORT double bgwriter_lru_multiplier;
+extern PGDLLIMPORT bool track_io_timing;
+extern PGDLLIMPORT int effective_io_concurrency;
+extern PGDLLIMPORT int maintenance_io_concurrency;
+
+extern PGDLLIMPORT int checkpoint_flush_after;
+extern PGDLLIMPORT int backend_flush_after;
+extern PGDLLIMPORT int bgwriter_flush_after;
+
+/* in buf_init.c */
+extern PGDLLIMPORT char *BufferBlocks;
+
+/* in localbuf.c */
+extern PGDLLIMPORT int NLocBuffer;
+extern PGDLLIMPORT Block *LocalBufferBlockPointers;
+extern PGDLLIMPORT int32 *LocalRefCount;
+
+/* upper limit for effective_io_concurrency */
+#define MAX_IO_CONCURRENCY 1000
+
+/* special block number for ReadBuffer() */
+#define P_NEW InvalidBlockNumber /* grow the file to get a new page */
+
+/*
+ * Buffer content lock modes (mode argument for LockBuffer())
+ */
+#define BUFFER_LOCK_UNLOCK 0
+#define BUFFER_LOCK_SHARE 1
+#define BUFFER_LOCK_EXCLUSIVE 2
+
+/*
+ * These routines are beaten on quite heavily, hence the macroization.
+ */
+
+/*
+ * BufferIsValid
+ * True iff the given buffer number is valid (either as a shared
+ * or local buffer).
+ *
+ * Note: For a long time this was defined the same as BufferIsPinned,
+ * that is it would say False if you didn't hold a pin on the buffer.
+ * I believe this was bogus and served only to mask logic errors.
+ * Code should always know whether it has a buffer reference,
+ * independently of the pin state.
+ *
+ * Note: For a further long time this was not quite the inverse of the
+ * BufferIsInvalid() macro, in that it also did sanity checks to verify
+ * that the buffer number was in range. Most likely, this macro was
+ * originally intended only to be used in assertions, but its use has
+ * since expanded quite a bit, and the overhead of making those checks
+ * even in non-assert-enabled builds can be significant. Thus, we've
+ * now demoted the range checks to assertions within the macro itself.
+ */
+#define BufferIsValid(bufnum) \
+( \
+ AssertMacro((bufnum) <= NBuffers && (bufnum) >= -NLocBuffer), \
+ (bufnum) != InvalidBuffer \
+)
+
+/*
+ * BufferGetBlock
+ * Returns a reference to a disk page image associated with a buffer.
+ *
+ * Note:
+ * Assumes buffer is valid.
+ */
+#define BufferGetBlock(buffer) \
+( \
+ AssertMacro(BufferIsValid(buffer)), \
+ BufferIsLocal(buffer) ? \
+ LocalBufferBlockPointers[-(buffer) - 1] \
+ : \
+ (Block) (BufferBlocks + ((Size) ((buffer) - 1)) * BLCKSZ) \
+)
+
+/*
+ * BufferGetPageSize
+ * Returns the page size within a buffer.
+ *
+ * Notes:
+ * Assumes buffer is valid.
+ *
+ * The buffer can be a raw disk block and need not contain a valid
+ * (formatted) disk page.
+ */
+/* XXX should dig out of buffer descriptor */
+#define BufferGetPageSize(buffer) \
+( \
+ AssertMacro(BufferIsValid(buffer)), \
+ (Size)BLCKSZ \
+)
+
+/*
+ * BufferGetPage
+ * Returns the page associated with a buffer.
+ *
+ * When this is called as part of a scan, there may be a need for a nearby
+ * call to TestForOldSnapshot(). See the definition of that for details.
+ */
+#define BufferGetPage(buffer) ((Page)BufferGetBlock(buffer))
+
+/*
+ * prototypes for functions in bufmgr.c
+ */
+extern PrefetchBufferResult PrefetchSharedBuffer(struct SMgrRelationData *smgr_reln,
+ ForkNumber forkNum,
+ BlockNumber blockNum);
+extern PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum,
+ BlockNumber blockNum);
+extern bool ReadRecentBuffer(RelFileNode rnode, ForkNumber forkNum,
+ BlockNumber blockNum, Buffer recent_buffer);
+extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
+extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
+ BlockNumber blockNum, ReadBufferMode mode,
+ BufferAccessStrategy strategy);
+extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
+ ForkNumber forkNum, BlockNumber blockNum,
+ ReadBufferMode mode, BufferAccessStrategy strategy,
+ bool permanent);
+extern void ReleaseBuffer(Buffer buffer);
+extern void UnlockReleaseBuffer(Buffer buffer);
+extern void MarkBufferDirty(Buffer buffer);
+extern void IncrBufferRefCount(Buffer buffer);
+extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
+ BlockNumber blockNum);
+
+extern void InitBufferPool(void);
+extern void InitBufferPoolAccess(void);
+extern void AtEOXact_Buffers(bool isCommit);
+extern void PrintBufferLeakWarning(Buffer buffer);
+extern void CheckPointBuffers(int flags);
+extern BlockNumber BufferGetBlockNumber(Buffer buffer);
+extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation,
+ ForkNumber forkNum);
+extern void FlushOneBuffer(Buffer buffer);
+extern void FlushRelationBuffers(Relation rel);
+extern void FlushRelationsAllBuffers(struct SMgrRelationData **smgrs, int nrels);
+extern void CreateAndCopyRelationData(RelFileNode src_rnode,
+ RelFileNode dst_rnode,
+ bool permanent);
+extern void FlushDatabaseBuffers(Oid dbid);
+extern void DropRelFileNodeBuffers(struct SMgrRelationData *smgr_reln, ForkNumber *forkNum,
+ int nforks, BlockNumber *firstDelBlock);
+extern void DropRelFileNodesAllBuffers(struct SMgrRelationData **smgr_reln, int nnodes);
+extern void DropDatabaseBuffers(Oid dbid);
+
+#define RelationGetNumberOfBlocks(reln) \
+ RelationGetNumberOfBlocksInFork(reln, MAIN_FORKNUM)
+
+extern bool BufferIsPermanent(Buffer buffer);
+extern XLogRecPtr BufferGetLSNAtomic(Buffer buffer);
+
+#ifdef NOT_USED
+extern void PrintPinnedBufs(void);
+#endif
+extern Size BufferShmemSize(void);
+extern void BufferGetTag(Buffer buffer, RelFileNode *rnode,
+ ForkNumber *forknum, BlockNumber *blknum);
+
+extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std);
+
+extern void UnlockBuffers(void);
+extern void LockBuffer(Buffer buffer, int mode);
+extern bool ConditionalLockBuffer(Buffer buffer);
+extern void LockBufferForCleanup(Buffer buffer);
+extern bool ConditionalLockBufferForCleanup(Buffer buffer);
+extern bool IsBufferCleanupOK(Buffer buffer);
+extern bool HoldingBufferPinThatDelaysRecovery(void);
+
+extern void AbortBufferIO(void);
+
+extern void BufmgrCommit(void);
+extern bool BgBufferSync(struct WritebackContext *wb_context);
+
+extern void AtProcExit_LocalBuffers(void);
+
+extern void TestForOldSnapshot_impl(Snapshot snapshot, Relation relation);
+
+/* in freelist.c */
+extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype);
+extern void FreeAccessStrategy(BufferAccessStrategy strategy);
+
+
+/* inline functions */
+
+/*
+ * Although this header file is nominally backend-only, certain frontend
+ * programs like pg_waldump include it. For compilers that emit static
+ * inline functions even when they're unused, that leads to unsatisfied
+ * external references; hence hide these with #ifndef FRONTEND.
+ */
+
+#ifndef FRONTEND
+
+/*
+ * Check whether the given snapshot is too old to have safely read the given
+ * page from the given table. If so, throw a "snapshot too old" error.
+ *
+ * This test generally needs to be performed after every BufferGetPage() call
+ * that is executed as part of a scan. It is not needed for calls made for
+ * modifying the page (for example, to position to the right place to insert a
+ * new index tuple or for vacuuming). It may also be omitted where calls to
+ * lower-level functions will have already performed the test.
+ *
+ * Note that a NULL snapshot argument is allowed and causes a fast return
+ * without error; this is to support call sites which can be called from
+ * either scans or index modification areas.
+ *
+ * For best performance, keep the tests that are fastest and/or most likely to
+ * exclude a page from old snapshot testing near the front.
+ */
+static inline void
+TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
+{
+ Assert(relation != NULL);
+
+ if (old_snapshot_threshold >= 0
+ && (snapshot) != NULL
+ && ((snapshot)->snapshot_type == SNAPSHOT_MVCC
+ || (snapshot)->snapshot_type == SNAPSHOT_TOAST)
+ && !XLogRecPtrIsInvalid((snapshot)->lsn)
+ && PageGetLSN(page) > (snapshot)->lsn)
+ TestForOldSnapshot_impl(snapshot, relation);
+}
+
+#endif /* FRONTEND */
+
+#endif /* BUFMGR_H */
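
For orientation, a minimal sketch of the common read, lock, examine, release sequence using the declarations above (inspect_block is a hypothetical helper):

#include "postgres.h"
#include "storage/bufmgr.h"

static void
inspect_block(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);
	Page		page;

	LockBuffer(buf, BUFFER_LOCK_SHARE);
	page = BufferGetPage(buf);

	/* ... examine the page while the content lock and pin are held ... */
	(void) page;

	UnlockReleaseBuffer(buf);		/* releases the content lock, then the pin */
}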
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
new file mode 100644
index 0000000..e9f253f
--- /dev/null
+++ b/src/include/storage/bufpage.h
@@ -0,0 +1,457 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufpage.h
+ * Standard POSTGRES buffer page definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/bufpage.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BUFPAGE_H
+#define BUFPAGE_H
+
+#include "access/xlogdefs.h"
+#include "storage/block.h"
+#include "storage/item.h"
+#include "storage/off.h"
+
+/*
+ * A postgres disk page is an abstraction layered on top of a postgres
+ * disk block (which is simply a unit of i/o, see block.h).
+ *
+ * specifically, while a disk block can be unformatted, a postgres
+ * disk page is always a slotted page of the form:
+ *
+ * +----------------+---------------------------------+
+ * | PageHeaderData | linp1 linp2 linp3 ... |
+ * +-----------+----+---------------------------------+
+ * | ... linpN | |
+ * +-----------+--------------------------------------+
+ * | ^ pd_lower |
+ * | |
+ * | v pd_upper |
+ * +-------------+------------------------------------+
+ * | | tupleN ... |
+ * +-------------+------------------+-----------------+
+ * | ... tuple3 tuple2 tuple1 | "special space" |
+ * +--------------------------------+-----------------+
+ * ^ pd_special
+ *
+ * a page is full when nothing can be added between pd_lower and
+ * pd_upper.
+ *
+ * all blocks written out by an access method must be disk pages.
+ *
+ * EXCEPTIONS:
+ *
+ * obviously, a page is not formatted before it is initialized by
+ * a call to PageInit.
+ *
+ * NOTES:
+ *
+ * linp1..N form an ItemId (line pointer) array. ItemPointers point
+ * to a physical block number and a logical offset (line pointer
+ * number) within that block/page. Note that OffsetNumbers
+ * conventionally start at 1, not 0.
+ *
+ * tuple1..N are added "backwards" on the page. Since an ItemPointer
+ * offset is used to access an ItemId entry rather than an actual
+ * byte-offset position, tuples can be physically shuffled on a page
+ * whenever the need arises. This indirection also keeps crash recovery
+ * relatively simple, because the low-level details of page space
+ * management can be controlled by standard buffer page code during
+ * logging, and during recovery.
+ *
+ * AM-generic per-page information is kept in PageHeaderData.
+ *
+ * AM-specific per-page data (if any) is kept in the area marked "special
+ * space"; each AM has an "opaque" structure defined somewhere that is
+ * stored as the page trailer. an access method should always
+ * initialize its pages with PageInit and then set its own opaque
+ * fields.
+ */
+
+typedef Pointer Page;
+
+
+/*
+ * location (byte offset) within a page.
+ *
+ * note that this is actually limited to 2^15 because we have limited
+ * ItemIdData.lp_off and ItemIdData.lp_len to 15 bits (see itemid.h).
+ */
+typedef uint16 LocationIndex;
+
+
+/*
+ * For historical reasons, the 64-bit LSN value is stored as two 32-bit
+ * values.
+ */
+typedef struct
+{
+ uint32 xlogid; /* high bits */
+ uint32 xrecoff; /* low bits */
+} PageXLogRecPtr;
+
+#define PageXLogRecPtrGet(val) \
+ ((uint64) (val).xlogid << 32 | (val).xrecoff)
+#define PageXLogRecPtrSet(ptr, lsn) \
+ ((ptr).xlogid = (uint32) ((lsn) >> 32), (ptr).xrecoff = (uint32) (lsn))
+
+/*
+ * disk page organization
+ *
+ * space management information generic to any page
+ *
+ * pd_lsn - identifies xlog record for last change to this page.
+ * pd_checksum - page checksum, if set.
+ * pd_flags - flag bits.
+ * pd_lower - offset to start of free space.
+ * pd_upper - offset to end of free space.
+ * pd_special - offset to start of special space.
+ * pd_pagesize_version - size in bytes and page layout version number.
+ * pd_prune_xid - oldest XID among potentially prunable tuples on page.
+ *
+ * The LSN is used by the buffer manager to enforce the basic rule of WAL:
+ * "thou shalt write xlog before data". A dirty buffer cannot be dumped
+ * to disk until xlog has been flushed at least as far as the page's LSN.
+ *
+ * pd_checksum stores the page checksum, if it has been set for this page;
+ * zero is a valid value for a checksum. If a checksum is not in use then
+ * we leave the field unset. This will typically mean the field is zero
+ * though non-zero values may also be present if databases have been
+ * pg_upgraded from releases prior to 9.3, when the same byte offset was
+ * used to store the current timelineid when the page was last updated.
+ * Note that there is no indication on a page as to whether the checksum
+ * is valid or not, a deliberate design choice which avoids the problem
+ * of relying on the page contents to decide whether to verify it. Hence
+ * there are no flag bits relating to checksums.
+ *
+ * pd_prune_xid is a hint field that helps determine whether pruning will be
+ * useful. It is currently unused in index pages.
+ *
+ * The page version number and page size are packed together into a single
+ * uint16 field. This is for historical reasons: before PostgreSQL 7.3,
+ * there was no concept of a page version number, and doing it this way
+ * lets us pretend that pre-7.3 databases have page version number zero.
+ * We constrain page sizes to be multiples of 256, leaving the low eight
+ * bits available for a version number.
+ *
+ * Minimum possible page size is perhaps 64B to fit page header, opaque space
+ * and a minimal tuple; of course, in reality you want it much bigger, so
+ * the constraint on pagesize mod 256 is not an important restriction.
+ * On the high end, we can only support pages up to 32KB because lp_off/lp_len
+ * are 15 bits.
+ */
+
+typedef struct PageHeaderData
+{
+ /* XXX LSN is member of *any* block, not only page-organized ones */
+ PageXLogRecPtr pd_lsn; /* LSN: next byte after last byte of xlog
+ * record for last change to this page */
+ uint16 pd_checksum; /* checksum */
+ uint16 pd_flags; /* flag bits, see below */
+ LocationIndex pd_lower; /* offset to start of free space */
+ LocationIndex pd_upper; /* offset to end of free space */
+ LocationIndex pd_special; /* offset to start of special space */
+ uint16 pd_pagesize_version;
+ TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
+ ItemIdData pd_linp[FLEXIBLE_ARRAY_MEMBER]; /* line pointer array */
+} PageHeaderData;
+
+typedef PageHeaderData *PageHeader;
+
+/*
+ * pd_flags contains the following flag bits. Undefined bits are initialized
+ * to zero and may be used in the future.
+ *
+ * PD_HAS_FREE_LINES is set if there are any LP_UNUSED line pointers before
+ * pd_lower. This should be considered a hint rather than the truth, since
+ * changes to it are not WAL-logged.
+ *
+ * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the
+ * page for its new tuple version; this suggests that a prune is needed.
+ * Again, this is just a hint.
+ */
+#define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */
+#define PD_PAGE_FULL 0x0002 /* not enough free space for new tuple? */
+#define PD_ALL_VISIBLE 0x0004 /* all tuples on page are visible to
+ * everyone */
+
+#define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */
+
+/*
+ * Page layout version number 0 is for pre-7.3 Postgres releases.
+ * Releases 7.3 and 7.4 use 1, denoting a new HeapTupleHeader layout.
+ * Release 8.0 uses 2; it changed the HeapTupleHeader layout again.
+ * Release 8.1 uses 3; it redefined HeapTupleHeader infomask bits.
+ * Release 8.3 uses 4; it changed the HeapTupleHeader layout again, and
+ * added the pd_flags field (by stealing some bits from pd_tli),
+ * as well as adding the pd_prune_xid field (which enlarges the header).
+ *
+ * As of Release 9.3, the checksum version must also be considered when
+ * handling pages.
+ */
+#define PG_PAGE_LAYOUT_VERSION 4
+#define PG_DATA_CHECKSUM_VERSION 1
+
+/* ----------------------------------------------------------------
+ * page support macros
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * PageIsValid
+ * True iff page is valid.
+ */
+#define PageIsValid(page) PointerIsValid(page)
+
+/*
+ * line pointer(s) do not count as part of header
+ */
+#define SizeOfPageHeaderData (offsetof(PageHeaderData, pd_linp))
+
+/*
+ * PageIsEmpty
+ * returns true iff no itemid has been allocated on the page
+ */
+#define PageIsEmpty(page) \
+ (((PageHeader) (page))->pd_lower <= SizeOfPageHeaderData)
+
+/*
+ * PageIsNew
+ * returns true iff page has not been initialized (by PageInit)
+ */
+#define PageIsNew(page) (((PageHeader) (page))->pd_upper == 0)
+
+/*
+ * PageGetItemId
+ * Returns an item identifier of a page.
+ */
+#define PageGetItemId(page, offsetNumber) \
+ ((ItemId) (&((PageHeader) (page))->pd_linp[(offsetNumber) - 1]))
+
+/*
+ * PageGetContents
+ * To be used in cases where the page does not contain line pointers.
+ *
+ * Note: prior to 8.3 this was not guaranteed to yield a MAXALIGN'd result.
+ * Now it is. Beware of old code that might think the offset to the contents
+ * is just SizeOfPageHeaderData rather than MAXALIGN(SizeOfPageHeaderData).
+ */
+#define PageGetContents(page) \
+ ((char *) (page) + MAXALIGN(SizeOfPageHeaderData))
+
+/* ----------------
+ * macros to access page size info
+ * ----------------
+ */
+
+/*
+ * PageSizeIsValid
+ * True iff the page size is valid.
+ */
+#define PageSizeIsValid(pageSize) ((pageSize) == BLCKSZ)
+
+/*
+ * PageGetPageSize
+ * Returns the page size of a page.
+ *
+ * this can only be called on a formatted page (unlike
+ * BufferGetPageSize, which can be called on an unformatted page).
+ * however, it can be called on a page that is not stored in a buffer.
+ */
+#define PageGetPageSize(page) \
+ ((Size) (((PageHeader) (page))->pd_pagesize_version & (uint16) 0xFF00))
+
+/*
+ * PageGetPageLayoutVersion
+ * Returns the page layout version of a page.
+ */
+#define PageGetPageLayoutVersion(page) \
+ (((PageHeader) (page))->pd_pagesize_version & 0x00FF)
+
+/*
+ * PageSetPageSizeAndVersion
+ * Sets the page size and page layout version number of a page.
+ *
+ * We could support setting these two values separately, but there's
+ * no real need for it at the moment.
+ */
+#define PageSetPageSizeAndVersion(page, size, version) \
+( \
+ AssertMacro(((size) & 0xFF00) == (size)), \
+ AssertMacro(((version) & 0x00FF) == (version)), \
+ ((PageHeader) (page))->pd_pagesize_version = (size) | (version) \
+)
+
+/* ----------------
+ * page special data macros
+ * ----------------
+ */
+/*
+ * PageGetSpecialSize
+ * Returns size of special space on a page.
+ */
+#define PageGetSpecialSize(page) \
+ ((uint16) (PageGetPageSize(page) - ((PageHeader)(page))->pd_special))
+
+/*
+ * Using assertions, validate that the page special pointer is OK.
+ *
+ * This is intended to catch use of the pointer before page initialization.
+ * It is implemented as a function due to the limitations of the MSVC
+ * compiler, which choked on doing all these tests within another macro. We
+ * return true so that AssertMacro() can be used while still getting the
+ * specifics from the macro failure within this function.
+ */
+static inline bool
+PageValidateSpecialPointer(Page page)
+{
+ Assert(PageIsValid(page));
+ Assert(((PageHeader) (page))->pd_special <= BLCKSZ);
+ Assert(((PageHeader) (page))->pd_special >= SizeOfPageHeaderData);
+
+ return true;
+}
+
+/*
+ * PageGetSpecialPointer
+ * Returns pointer to special space on a page.
+ */
+#define PageGetSpecialPointer(page) \
+( \
+ AssertMacro(PageValidateSpecialPointer(page)), \
+ (char *) ((char *) (page) + ((PageHeader) (page))->pd_special) \
+)
+
+/*
+ * PageGetItem
+ * Retrieves an item on the given page.
+ *
+ * Note:
+ * This does not change the status of any of the resources passed.
+ * The semantics may change in the future.
+ */
+#define PageGetItem(page, itemId) \
+( \
+ AssertMacro(PageIsValid(page)), \
+ AssertMacro(ItemIdHasStorage(itemId)), \
+ (Item)(((char *)(page)) + ItemIdGetOffset(itemId)) \
+)
+
+/*
+ * PageGetMaxOffsetNumber
+ * Returns the maximum offset number used by the given page.
+ * Since offset numbers are 1-based, this is also the number
+ * of items on the page.
+ *
+ * NOTE: if the page is not initialized (pd_lower == 0), we must
+ * return zero to ensure sane behavior. Accept double evaluation
+ * of the argument so that we can ensure this.
+ */
+#define PageGetMaxOffsetNumber(page) \
+ (((PageHeader) (page))->pd_lower <= SizeOfPageHeaderData ? 0 : \
+ ((((PageHeader) (page))->pd_lower - SizeOfPageHeaderData) \
+ / sizeof(ItemIdData)))
+
+/*
+ * Additional macros for access to page headers. (Beware multiple evaluation
+ * of the arguments!)
+ */
+#define PageGetLSN(page) \
+ PageXLogRecPtrGet(((PageHeader) (page))->pd_lsn)
+#define PageSetLSN(page, lsn) \
+ PageXLogRecPtrSet(((PageHeader) (page))->pd_lsn, lsn)
+
+#define PageHasFreeLinePointers(page) \
+ (((PageHeader) (page))->pd_flags & PD_HAS_FREE_LINES)
+#define PageSetHasFreeLinePointers(page) \
+ (((PageHeader) (page))->pd_flags |= PD_HAS_FREE_LINES)
+#define PageClearHasFreeLinePointers(page) \
+ (((PageHeader) (page))->pd_flags &= ~PD_HAS_FREE_LINES)
+
+#define PageIsFull(page) \
+ (((PageHeader) (page))->pd_flags & PD_PAGE_FULL)
+#define PageSetFull(page) \
+ (((PageHeader) (page))->pd_flags |= PD_PAGE_FULL)
+#define PageClearFull(page) \
+ (((PageHeader) (page))->pd_flags &= ~PD_PAGE_FULL)
+
+#define PageIsAllVisible(page) \
+ (((PageHeader) (page))->pd_flags & PD_ALL_VISIBLE)
+#define PageSetAllVisible(page) \
+ (((PageHeader) (page))->pd_flags |= PD_ALL_VISIBLE)
+#define PageClearAllVisible(page) \
+ (((PageHeader) (page))->pd_flags &= ~PD_ALL_VISIBLE)
+
+#define PageSetPrunable(page, xid) \
+do { \
+ Assert(TransactionIdIsNormal(xid)); \
+ if (!TransactionIdIsValid(((PageHeader) (page))->pd_prune_xid) || \
+ TransactionIdPrecedes(xid, ((PageHeader) (page))->pd_prune_xid)) \
+ ((PageHeader) (page))->pd_prune_xid = (xid); \
+} while (0)
+#define PageClearPrunable(page) \
+ (((PageHeader) (page))->pd_prune_xid = InvalidTransactionId)
+
+
+/* ----------------------------------------------------------------
+ * extern declarations
+ * ----------------------------------------------------------------
+ */
+
+/* flags for PageAddItemExtended() */
+#define PAI_OVERWRITE (1 << 0)
+#define PAI_IS_HEAP (1 << 1)
+
+/* flags for PageIsVerifiedExtended() */
+#define PIV_LOG_WARNING (1 << 0)
+#define PIV_REPORT_STAT (1 << 1)
+
+#define PageAddItem(page, item, size, offsetNumber, overwrite, is_heap) \
+ PageAddItemExtended(page, item, size, offsetNumber, \
+ ((overwrite) ? PAI_OVERWRITE : 0) | \
+ ((is_heap) ? PAI_IS_HEAP : 0))
+
+#define PageIsVerified(page, blkno) \
+ PageIsVerifiedExtended(page, blkno, \
+ PIV_LOG_WARNING | PIV_REPORT_STAT)
+
+/*
+ * Check that BLCKSZ is a multiple of sizeof(size_t). In
+ * PageIsVerifiedExtended(), it is much faster to check if a page is
+ * full of zeroes using the native word size. Note that this assertion
+ * is kept within a header to make sure that StaticAssertDecl() works
+ * across various combinations of platforms and compilers.
+ */
+StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)),
+ "BLCKSZ has to be a multiple of sizeof(size_t)");
+
+extern void PageInit(Page page, Size pageSize, Size specialSize);
+extern bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags);
+extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size,
+ OffsetNumber offsetNumber, int flags);
+extern Page PageGetTempPage(Page page);
+extern Page PageGetTempPageCopy(Page page);
+extern Page PageGetTempPageCopySpecial(Page page);
+extern void PageRestoreTempPage(Page tempPage, Page oldPage);
+extern void PageRepairFragmentation(Page page);
+extern void PageTruncateLinePointerArray(Page page);
+extern Size PageGetFreeSpace(Page page);
+extern Size PageGetFreeSpaceForMultipleTuples(Page page, int ntups);
+extern Size PageGetExactFreeSpace(Page page);
+extern Size PageGetHeapFreeSpace(Page page);
+extern void PageIndexTupleDelete(Page page, OffsetNumber offset);
+extern void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems);
+extern void PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offset);
+extern bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
+ Item newtup, Size newsize);
+extern char *PageSetChecksumCopy(Page page, BlockNumber blkno);
+extern void PageSetChecksumInplace(Page page, BlockNumber blkno);
+
+#endif /* BUFPAGE_H */
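
As an illustration of the line pointer macros above, a hedged sketch that walks a page's items (scan_page_items is a hypothetical helper; FirstOffsetNumber, ItemIdHasStorage and ItemIdGetLength come from off.h and itemid.h elsewhere in this commit):

#include "postgres.h"
#include "storage/bufpage.h"
#include "storage/itemid.h"
#include "storage/off.h"

static void
scan_page_items(Page page)
{
	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
	OffsetNumber off;

	for (off = FirstOffsetNumber; off <= maxoff; off++)
	{
		ItemId		lp = PageGetItemId(page, off);
		Item		item;

		if (!ItemIdHasStorage(lp))	/* skip slots that carry no tuple data */
			continue;

		item = PageGetItem(page, lp);
		/* ... process "item"; ItemIdGetLength(lp) gives its length ... */
		(void) item;
	}
}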
diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h
new file mode 100644
index 0000000..1904fab
--- /dev/null
+++ b/src/include/storage/checksum.h
@@ -0,0 +1,24 @@
+/*-------------------------------------------------------------------------
+ *
+ * checksum.h
+ * Checksum implementation for data pages.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/checksum.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CHECKSUM_H
+#define CHECKSUM_H
+
+#include "storage/block.h"
+
+/*
+ * Compute the checksum for a Postgres page. The page must be aligned on a
+ * 4-byte boundary.
+ */
+extern uint16 pg_checksum_page(char *page, BlockNumber blkno);
+
+#endif /* CHECKSUM_H */
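
A hedged sketch of verifying a page against its stored checksum, assuming data checksums are enabled and the page has been initialized (page_checksum_matches is a hypothetical name; PageHeader comes from bufpage.h):

#include "postgres.h"
#include "storage/bufpage.h"
#include "storage/checksum.h"

static bool
page_checksum_matches(char *page, BlockNumber blkno)
{
	uint16		stored = ((PageHeader) page)->pd_checksum;

	return pg_checksum_page(page, blkno) == stored;
}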
diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h
new file mode 100644
index 0000000..015f0f1
--- /dev/null
+++ b/src/include/storage/checksum_impl.h
@@ -0,0 +1,215 @@
+/*-------------------------------------------------------------------------
+ *
+ * checksum_impl.h
+ * Checksum implementation for data pages.
+ *
+ * This file exists for the benefit of external programs that may wish to
+ * check Postgres page checksums. They can #include this to get the code
+ * referenced by storage/checksum.h. (Note: you may need to redefine
+ * Assert() as empty to compile this successfully externally.)
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/checksum_impl.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * The algorithm used to checksum pages is chosen for very fast calculation.
+ * Workloads where the database working set fits into OS file cache but not
+ * into shared buffers can read in pages at a very fast pace and the checksum
+ * algorithm itself can become the largest bottleneck.
+ *
+ * The checksum algorithm itself is based on the FNV-1a hash (FNV is shorthand
+ * for Fowler/Noll/Vo). The primitive of a plain FNV-1a hash folds in data 1
+ * byte at a time according to the formula:
+ *
+ * hash = (hash ^ value) * FNV_PRIME
+ *
+ * FNV-1a algorithm is described at http://www.isthe.com/chongo/tech/comp/fnv/
+ *
+ * PostgreSQL doesn't use FNV-1a hash directly because it has bad mixing of
+ * high bits - high order bits in input data only affect high order bits in
+ * output data. To resolve this we xor in the value prior to multiplication
+ * shifted right by 17 bits. The number 17 was chosen because it doesn't
+ * have a common denominator with the set bit positions in FNV_PRIME, and it
+ * empirically provides the fastest mixing: high order bits of the final
+ * iterations quickly avalanche into lower positions. For performance reasons
+ * we choose to combine
+ * 4 bytes at a time. The actual hash formula used as the basis is:
+ *
+ * hash = (hash ^ value) * FNV_PRIME ^ ((hash ^ value) >> 17)
+ *
+ * The main bottleneck in this calculation is the multiplication latency. To
+ * hide the latency and to make use of SIMD parallelism multiple hash values
+ * are calculated in parallel. The page is treated as a 32 column two
+ * dimensional array of 32 bit values. Each column is aggregated separately
+ * into a partial checksum. Each partial checksum uses a different initial
+ * value (offset basis in FNV terminology). The initial values actually used
+ * were chosen randomly, as the values themselves don't matter as much as that
+ * they are different and don't match anything in real data. After initializing
+ * partial checksums each value in the column is aggregated according to the
+ * above formula. Finally two more iterations of the formula are performed with
+ * value 0 to mix the bits of the last value added.
+ *
+ * The partial checksums are then folded together using xor to form a single
+ * 32-bit checksum. The caller can safely reduce the value to 16 bits
+ * using modulo 2^16-1. That will cause a very slight bias towards lower
+ * values but this is not significant for the performance of the
+ * checksum.
+ *
+ * The algorithm choice was based on what instructions are available in SIMD
+ * instruction sets. This meant that a fast and good algorithm needed to use
+ * multiplication as the main mixing operator. The simplest multiplication
+ * based checksum primitive is the one used by FNV. The prime used is chosen
+ * for good dispersion of values. It has no known simple patterns that result
+ * in collisions. Test of 5-bit differentials of the primitive over 64bit keys
+ * reveals no differentials with 3 or more values out of 100000 random keys
+ * colliding. Avalanche test shows that only high order bits of the last word
+ * have a bias. Tests of 1-4 uncorrelated bit errors, stray 0 and 0xFF bytes,
+ * overwriting page from random position to end with 0 bytes, and overwriting
+ * random segments of page with 0x00, 0xFF and random data all show optimal
+ * 2e-16 false positive rate within margin of error.
+ *
+ * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer
+ * multiplication instruction. As of 2013 the corresponding instruction is
+ * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32).
+ * Vectorization requires a compiler to do the vectorization for us. For recent
+ * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough
+ * to achieve vectorization.
+ *
+ * The optimal amount of parallelism to use depends on CPU specific instruction
+ * latency, SIMD instruction width, throughput and the amount of registers
+ * available to hold intermediate state. Generally, more parallelism is better
+ * up to the point that state doesn't fit in registers and extra load-store
+ * instructions are needed to swap values in/out. The number chosen is a fixed
+ * part of the algorithm because changing the parallelism changes the checksum
+ * result.
+ *
+ * The parallelism number 32 was chosen based on the fact that it is the
+ * largest state that fits into architecturally visible x86 SSE registers while
+ * leaving some free registers for intermediate values. For future processors
+ * with 256bit vector registers this will leave some performance on the table.
+ * When vectorization is not available it might be beneficial to restructure
+ * the computation to calculate a subset of the columns at a time and perform
+ * multiple passes to avoid register spilling. This optimization opportunity
+ * is not used. Current coding also assumes that the compiler has the ability
+ * to unroll the inner loop to avoid loop overhead and minimize register
+ * spilling. For less sophisticated compilers it might be beneficial to
+ * manually unroll the inner loop.
+ */
+
+#include "storage/bufpage.h"
+
+/* number of checksums to calculate in parallel */
+#define N_SUMS 32
+/* prime multiplier of FNV-1a hash */
+#define FNV_PRIME 16777619
+
+/* Use a union so that this code is valid under strict aliasing */
+typedef union
+{
+ PageHeaderData phdr;
+ uint32 data[BLCKSZ / (sizeof(uint32) * N_SUMS)][N_SUMS];
+} PGChecksummablePage;
+
+/*
+ * Base offsets to initialize each of the parallel FNV hashes into a
+ * different initial state.
+ */
+static const uint32 checksumBaseOffsets[N_SUMS] = {
+ 0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A,
+ 0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C,
+ 0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA,
+ 0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB,
+ 0xE58F764B, 0x187636BC, 0x5D7B3BB1, 0xE73DE7DE,
+ 0x92BEC979, 0xCCA6C0B2, 0x304A0979, 0x85AA43D4,
+ 0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E,
+ 0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756
+};
+
+/*
+ * Calculate one round of the checksum.
+ */
+#define CHECKSUM_COMP(checksum, value) \
+do { \
+ uint32 __tmp = (checksum) ^ (value); \
+ (checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17); \
+} while (0)
+
+/*
+ * Block checksum algorithm. The page must be adequately aligned
+ * (at least on 4-byte boundary).
+ */
+static uint32
+pg_checksum_block(const PGChecksummablePage *page)
+{
+ uint32 sums[N_SUMS];
+ uint32 result = 0;
+ uint32 i,
+ j;
+
+ /* ensure that the size is compatible with the algorithm */
+ Assert(sizeof(PGChecksummablePage) == BLCKSZ);
+
+ /* initialize partial checksums to their corresponding offsets */
+ memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
+
+ /* main checksum calculation */
+ for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
+ for (j = 0; j < N_SUMS; j++)
+ CHECKSUM_COMP(sums[j], page->data[i][j]);
+
+ /* finally add in two rounds of zeroes for additional mixing */
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < N_SUMS; j++)
+ CHECKSUM_COMP(sums[j], 0);
+
+ /* xor fold partial checksums together */
+ for (i = 0; i < N_SUMS; i++)
+ result ^= sums[i];
+
+ return result;
+}
+
+/*
+ * Compute the checksum for a Postgres page.
+ *
+ * The page must be adequately aligned (at least on a 4-byte boundary).
+ * Beware also that the checksum field of the page is transiently zeroed.
+ *
+ * The checksum includes the block number (to detect the case where a page is
+ * somehow moved to a different location), the page header (excluding the
+ * checksum itself), and the page data.
+ */
+uint16
+pg_checksum_page(char *page, BlockNumber blkno)
+{
+ PGChecksummablePage *cpage = (PGChecksummablePage *) page;
+ uint16 save_checksum;
+ uint32 checksum;
+
+ /* We only calculate the checksum for properly-initialized pages */
+ Assert(!PageIsNew(&cpage->phdr));
+
+ /*
+ * Save pd_checksum and temporarily set it to zero, so that the checksum
+ * calculation isn't affected by the old checksum stored on the page.
+ * Restore it after, because actually updating the checksum is NOT part of
+ * the API of this function.
+ */
+ save_checksum = cpage->phdr.pd_checksum;
+ cpage->phdr.pd_checksum = 0;
+ checksum = pg_checksum_block(cpage);
+ cpage->phdr.pd_checksum = save_checksum;
+
+ /* Mix in the block number to detect transposed pages */
+ checksum ^= blkno;
+
+ /*
+ * Reduce to a uint16 (to fit in the pd_checksum field) with an offset of
+ * one. That avoids checksums of zero, which seems like a good idea.
+ */
+ return (uint16) ((checksum % 65535) + 1);
+}
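As the header comment notes, checksum_impl.h exists so that external programs can verify page checksums. Below is a minimal sketch of such a check, broadly along the lines of what pg_checksums does. It assumes a frontend build where postgres_fe.h and the bufpage.h definitions are available and that the page has been read from disk into a suitably aligned BLCKSZ buffer; the function name and error reporting are illustrative only.

    /* Hypothetical external checksum verifier (sketch, not part of this patch). */
    #include "postgres_fe.h"

    #include <stdio.h>

    #include "storage/bufpage.h"
    #include "storage/checksum_impl.h"

    static bool
    verify_block(char *page, BlockNumber blkno)
    {
        PageHeader  phdr = (PageHeader) page;
        uint16      expected;

        if (PageIsNew(page))
            return true;        /* all-zero pages carry no checksum */

        expected = pg_checksum_page(page, blkno);
        if (expected != phdr->pd_checksum)
        {
            fprintf(stderr, "block %u: checksum mismatch (found %u, expected %u)\n",
                    blkno, phdr->pd_checksum, expected);
            return false;
        }
        return true;
    }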
diff --git a/src/include/storage/condition_variable.h b/src/include/storage/condition_variable.h
new file mode 100644
index 0000000..e89175e
--- /dev/null
+++ b/src/include/storage/condition_variable.h
@@ -0,0 +1,73 @@
+/*-------------------------------------------------------------------------
+ *
+ * condition_variable.h
+ * Condition variables
+ *
+ * A condition variable is a method of waiting until a certain condition
+ * becomes true. Conventionally, a condition variable supports three
+ * operations: (1) sleep; (2) signal, which wakes up one process sleeping
+ * on the condition variable; and (3) broadcast, which wakes up every
+ * process sleeping on the condition variable. In our implementation,
+ * condition variables put a process into an interruptible sleep (so it
+ * can be canceled prior to the fulfillment of the condition) and do not
+ * use pointers internally (so that they are safe to use within DSMs).
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/condition_variable.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CONDITION_VARIABLE_H
+#define CONDITION_VARIABLE_H
+
+#include "storage/proclist_types.h"
+#include "storage/spin.h"
+
+typedef struct
+{
+ slock_t mutex; /* spinlock protecting the wakeup list */
+ proclist_head wakeup; /* list of wake-able processes */
+} ConditionVariable;
+
+/*
+ * Pad a condition variable to a power-of-two size so that an array of
+ * condition variables does not cross a cache line boundary.
+ */
+#define CV_MINIMAL_SIZE (sizeof(ConditionVariable) <= 16 ? 16 : 32)
+typedef union ConditionVariableMinimallyPadded
+{
+ ConditionVariable cv;
+ char pad[CV_MINIMAL_SIZE];
+} ConditionVariableMinimallyPadded;
+
+/* Initialize a condition variable. */
+extern void ConditionVariableInit(ConditionVariable *cv);
+
+/*
+ * To sleep on a condition variable, a process should use a loop which first
+ * checks the condition, exiting the loop if it is met, and then calls
+ * ConditionVariableSleep. Spurious wakeups are possible, but should be
+ * infrequent. After exiting the loop, ConditionVariableCancelSleep must
+ * be called to ensure that the process is no longer in the wait list for
+ * the condition variable.
+ */
+extern void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info);
+extern bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout,
+ uint32 wait_event_info);
+extern void ConditionVariableCancelSleep(void);
+
+/*
+ * Optionally, ConditionVariablePrepareToSleep can be called before entering
+ * the test-and-sleep loop described above. Doing so is more efficient if
+ * at least one sleep is needed, whereas not doing so is more efficient when
+ * no sleep is needed because the test condition is true the first time.
+ */
+extern void ConditionVariablePrepareToSleep(ConditionVariable *cv);
+
+/* Wake up a single waiter (via signal) or all waiters (via broadcast). */
+extern void ConditionVariableSignal(ConditionVariable *cv);
+extern void ConditionVariableBroadcast(ConditionVariable *cv);
+
+#endif /* CONDITION_VARIABLE_H */
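The test-and-sleep protocol described in the comments above is easiest to see in code. The sketch below is illustrative only: MySharedState, its ready flag, and the cv field are placeholders for whatever shared-memory state a caller actually waits on (initialized earlier with ConditionVariableInit), wait_event_info is expected to be a value from utils/wait_event.h, and any locking or atomics needed to make the flag itself safe to read are elided.

    /* Waiter side: canonical test-and-sleep loop. */
    static void
    wait_until_ready(MySharedState *shared, uint32 wait_event_info)
    {
        ConditionVariablePrepareToSleep(&shared->cv);   /* optional, see above */
        while (!shared->ready)
            ConditionVariableSleep(&shared->cv, wait_event_info);
        ConditionVariableCancelSleep();
    }

    /* Setter side: update the condition first, then wake the waiters. */
    static void
    mark_ready(MySharedState *shared)
    {
        shared->ready = true;
        ConditionVariableBroadcast(&shared->cv);
    }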
diff --git a/src/include/storage/copydir.h b/src/include/storage/copydir.h
new file mode 100644
index 0000000..50a26ed
--- /dev/null
+++ b/src/include/storage/copydir.h
@@ -0,0 +1,19 @@
+/*-------------------------------------------------------------------------
+ *
+ * copydir.h
+ * Copy a directory.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/copydir.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef COPYDIR_H
+#define COPYDIR_H
+
+extern void copydir(char *fromdir, char *todir, bool recurse);
+extern void copy_file(char *fromfile, char *tofile);
+
+#endif /* COPYDIR_H */
diff --git a/src/include/storage/dsm.h b/src/include/storage/dsm.h
new file mode 100644
index 0000000..4dd6af2
--- /dev/null
+++ b/src/include/storage/dsm.h
@@ -0,0 +1,64 @@
+/*-------------------------------------------------------------------------
+ *
+ * dsm.h
+ * manage dynamic shared memory segments
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/dsm.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef DSM_H
+#define DSM_H
+
+#include "storage/dsm_impl.h"
+
+typedef struct dsm_segment dsm_segment;
+
+#define DSM_CREATE_NULL_IF_MAXSEGMENTS 0x0001
+
+/* A sentinel value for an invalid DSM handle. */
+#define DSM_HANDLE_INVALID 0
+
+/* Startup and shutdown functions. */
+struct PGShmemHeader; /* avoid including pg_shmem.h */
+extern void dsm_cleanup_using_control_segment(dsm_handle old_control_handle);
+extern void dsm_postmaster_startup(struct PGShmemHeader *);
+extern void dsm_backend_shutdown(void);
+extern void dsm_detach_all(void);
+
+extern size_t dsm_estimate_size(void);
+extern void dsm_shmem_init(void);
+
+#ifdef EXEC_BACKEND
+extern void dsm_set_control_handle(dsm_handle h);
+#endif
+
+/* Functions that create or remove mappings. */
+extern dsm_segment *dsm_create(Size size, int flags);
+extern dsm_segment *dsm_attach(dsm_handle h);
+extern void dsm_detach(dsm_segment *seg);
+
+/* Resource management functions. */
+extern void dsm_pin_mapping(dsm_segment *seg);
+extern void dsm_unpin_mapping(dsm_segment *seg);
+extern void dsm_pin_segment(dsm_segment *seg);
+extern void dsm_unpin_segment(dsm_handle h);
+extern dsm_segment *dsm_find_mapping(dsm_handle h);
+
+/* Informational functions. */
+extern void *dsm_segment_address(dsm_segment *seg);
+extern Size dsm_segment_map_length(dsm_segment *seg);
+extern dsm_handle dsm_segment_handle(dsm_segment *seg);
+
+/* Cleanup hooks. */
+typedef void (*on_dsm_detach_callback) (dsm_segment *, Datum arg);
+extern void on_dsm_detach(dsm_segment *seg,
+ on_dsm_detach_callback function, Datum arg);
+extern void cancel_on_dsm_detach(dsm_segment *seg,
+ on_dsm_detach_callback function, Datum arg);
+extern void reset_on_dsm_detach(void);
+
+#endif /* DSM_H */
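A typical segment lifecycle, sketched below under the assumption of backend code that includes postgres.h and storage/dsm.h: one backend creates a segment and publishes its handle, another attaches using that handle. How the handle reaches the other backend (a shm_toc, the parallel infrastructure, etc.) is outside this header, and the function names and size are illustrative only.

    /* Creating side (sketch): returns the handle to publish to other backends. */
    static dsm_handle
    create_scratch_segment(Size size, void **addr_out)
    {
        dsm_segment *seg = dsm_create(size, 0);

        dsm_pin_mapping(seg);           /* keep it mapped beyond the current resource owner */
        *addr_out = dsm_segment_address(seg);
        return dsm_segment_handle(seg);
    }

    /* Attaching side (sketch): maps the published handle into this backend. */
    static void *
    attach_scratch_segment(dsm_handle handle, dsm_segment **seg_out)
    {
        dsm_segment *seg = dsm_attach(handle);

        if (seg == NULL)
            elog(ERROR, "could not attach to dynamic shared memory segment");
        *seg_out = seg;                 /* caller should dsm_detach() when done */
        return dsm_segment_address(seg);
    }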
diff --git a/src/include/storage/dsm_impl.h b/src/include/storage/dsm_impl.h
new file mode 100644
index 0000000..c51584d
--- /dev/null
+++ b/src/include/storage/dsm_impl.h
@@ -0,0 +1,76 @@
+/*-------------------------------------------------------------------------
+ *
+ * dsm_impl.h
+ * low-level dynamic shared memory primitives
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/dsm_impl.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef DSM_IMPL_H
+#define DSM_IMPL_H
+
+/* Dynamic shared memory implementations. */
+#define DSM_IMPL_POSIX 1
+#define DSM_IMPL_SYSV 2
+#define DSM_IMPL_WINDOWS 3
+#define DSM_IMPL_MMAP 4
+
+/*
+ * Determine which dynamic shared memory implementations will be supported
+ * on this platform, and which one will be the default.
+ */
+#ifdef WIN32
+#define USE_DSM_WINDOWS
+#define DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE DSM_IMPL_WINDOWS
+#else
+#ifdef HAVE_SHM_OPEN
+#define USE_DSM_POSIX
+#define DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE DSM_IMPL_POSIX
+#endif
+#define USE_DSM_SYSV
+#ifndef DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE
+#define DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE DSM_IMPL_SYSV
+#endif
+#define USE_DSM_MMAP
+#endif
+
+/* GUC. */
+extern PGDLLIMPORT int dynamic_shared_memory_type;
+extern PGDLLIMPORT int min_dynamic_shared_memory;
+
+/*
+ * Directory for on-disk state.
+ *
+ * This is used by all implementations for crash recovery and by the mmap
+ * implementation for storage.
+ */
+#define PG_DYNSHMEM_DIR "pg_dynshmem"
+#define PG_DYNSHMEM_MMAP_FILE_PREFIX "mmap."
+
+/* A "name" for a dynamic shared memory segment. */
+typedef uint32 dsm_handle;
+
+/* All the shared-memory operations we know about. */
+typedef enum
+{
+ DSM_OP_CREATE,
+ DSM_OP_ATTACH,
+ DSM_OP_DETACH,
+ DSM_OP_DESTROY
+} dsm_op;
+
+/* Create, attach to, detach from, resize, or destroy a segment. */
+extern bool dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address, Size *mapped_size,
+ int elevel);
+
+/* Implementation-dependent actions required to keep segment until shutdown. */
+extern void dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
+ void **impl_private_pm_handle);
+extern void dsm_impl_unpin_segment(dsm_handle handle, void **impl_private);
+
+#endif /* DSM_IMPL_H */
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
new file mode 100644
index 0000000..69549b0
--- /dev/null
+++ b/src/include/storage/fd.h
@@ -0,0 +1,198 @@
+/*-------------------------------------------------------------------------
+ *
+ * fd.h
+ * Virtual file descriptor definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/fd.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * calls:
+ *
+ * File {Close, Read, Write, Size, Sync}
+ * {Path Name Open, Allocate, Free} File
+ *
+ * These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
+ * Use them for all file activity...
+ *
+ * File fd;
+ * fd = PathNameOpenFile("foo", O_RDONLY);
+ *
+ * AllocateFile();
+ * FreeFile();
+ *
+ * Use AllocateFile, not fopen, if you need a stdio file (FILE*); then
+ * use FreeFile, not fclose, to close it. AVOID using stdio for files
+ * that you intend to hold open for any length of time, since there is
+ * no way for them to share kernel file descriptors with other files.
+ *
+ * Likewise, use AllocateDir/FreeDir, not opendir/closedir, to allocate
+ * open directories (DIR*), and OpenTransientFile/CloseTransientFile for an
+ * unbuffered file descriptor.
+ *
+ * If you really can't use any of the above, at least call AcquireExternalFD
+ * or ReserveExternalFD to report any file descriptors that are held for any
+ * length of time. Failure to do so risks unnecessary EMFILE errors.
+ */
+#ifndef FD_H
+#define FD_H
+
+#include <dirent.h>
+
+typedef enum RecoveryInitSyncMethod
+{
+ RECOVERY_INIT_SYNC_METHOD_FSYNC,
+ RECOVERY_INIT_SYNC_METHOD_SYNCFS
+} RecoveryInitSyncMethod;
+
+struct iovec; /* avoid including port/pg_iovec.h here */
+
+typedef int File;
+
+
+/* GUC parameter */
+extern PGDLLIMPORT int max_files_per_process;
+extern PGDLLIMPORT bool data_sync_retry;
+extern PGDLLIMPORT int recovery_init_sync_method;
+
+/*
+ * This is private to fd.c, but exported for save/restore_backend_variables()
+ */
+extern PGDLLIMPORT int max_safe_fds;
+
+/*
+ * On Windows, we have to interpret EACCES as possibly meaning the same as
+ * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
+ * that's what you get. Ugh. This code is designed so that we don't
+ * actually believe these cases are okay without further evidence (namely,
+ * a pending fsync request getting canceled ... see ProcessSyncRequests).
+ */
+#ifndef WIN32
+#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT)
+#else
+#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT || (err) == EACCES)
+#endif
+
+/*
+ * O_DIRECT is not standard, but almost every Unix has it. We translate it
+ * to the appropriate Windows flag in src/port/open.c. We simulate it with
+ * fcntl(F_NOCACHE) on macOS inside fd.c's open() wrapper. We use the name
+ * PG_O_DIRECT rather than defining O_DIRECT in that case (probably not a good
+ * idea on a Unix).
+ */
+#if defined(O_DIRECT)
+#define PG_O_DIRECT O_DIRECT
+#elif defined(F_NOCACHE)
+#define PG_O_DIRECT 0x80000000
+#define PG_O_DIRECT_USE_F_NOCACHE
+#else
+#define PG_O_DIRECT 0
+#endif
+
+/*
+ * prototypes for functions in fd.c
+ */
+
+/* Operations on virtual Files --- equivalent to Unix kernel file ops */
+extern File PathNameOpenFile(const char *fileName, int fileFlags);
+extern File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode);
+extern File OpenTemporaryFile(bool interXact);
+extern void FileClose(File file);
+extern int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info);
+extern int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info);
+extern int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info);
+extern int FileSync(File file, uint32 wait_event_info);
+extern off_t FileSize(File file);
+extern int FileTruncate(File file, off_t offset, uint32 wait_event_info);
+extern void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info);
+extern char *FilePathName(File file);
+extern int FileGetRawDesc(File file);
+extern int FileGetRawFlags(File file);
+extern mode_t FileGetRawMode(File file);
+
+/* Operations used for sharing named temporary files */
+extern File PathNameCreateTemporaryFile(const char *name, bool error_on_failure);
+extern File PathNameOpenTemporaryFile(const char *path, int mode);
+extern bool PathNameDeleteTemporaryFile(const char *name, bool error_on_failure);
+extern void PathNameCreateTemporaryDir(const char *base, const char *name);
+extern void PathNameDeleteTemporaryDir(const char *name);
+extern void TempTablespacePath(char *path, Oid tablespace);
+
+/* Operations that allow use of regular stdio --- USE WITH CAUTION */
+extern FILE *AllocateFile(const char *name, const char *mode);
+extern int FreeFile(FILE *file);
+
+/* Operations that allow use of pipe streams (popen/pclose) */
+extern FILE *OpenPipeStream(const char *command, const char *mode);
+extern int ClosePipeStream(FILE *file);
+
+/* Operations to allow use of the <dirent.h> library routines */
+extern DIR *AllocateDir(const char *dirname);
+extern struct dirent *ReadDir(DIR *dir, const char *dirname);
+extern struct dirent *ReadDirExtended(DIR *dir, const char *dirname,
+ int elevel);
+extern int FreeDir(DIR *dir);
+
+/* Operations to allow use of a plain kernel FD, with automatic cleanup */
+extern int OpenTransientFile(const char *fileName, int fileFlags);
+extern int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode);
+extern int CloseTransientFile(int fd);
+
+/* If you've really really gotta have a plain kernel FD, use this */
+extern int BasicOpenFile(const char *fileName, int fileFlags);
+extern int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode);
+
+/* Use these for other cases, and also for long-lived BasicOpenFile FDs */
+extern bool AcquireExternalFD(void);
+extern void ReserveExternalFD(void);
+extern void ReleaseExternalFD(void);
+
+/* Make a directory with default permissions */
+extern int MakePGDirectory(const char *directoryName);
+
+/* Miscellaneous support routines */
+extern void InitFileAccess(void);
+extern void InitTemporaryFileAccess(void);
+extern void set_max_safe_fds(void);
+extern void closeAllVfds(void);
+extern void SetTempTablespaces(Oid *tableSpaces, int numSpaces);
+extern bool TempTablespacesAreSet(void);
+extern int GetTempTablespaces(Oid *tableSpaces, int numSpaces);
+extern Oid GetNextTempTableSpace(void);
+extern void AtEOXact_Files(bool isCommit);
+extern void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
+ SubTransactionId parentSubid);
+extern void RemovePgTempFiles(void);
+extern void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok,
+ bool unlink_all);
+extern bool looks_like_temp_rel_name(const char *name);
+
+extern int pg_fsync(int fd);
+extern int pg_fsync_no_writethrough(int fd);
+extern int pg_fsync_writethrough(int fd);
+extern int pg_fdatasync(int fd);
+extern void pg_flush_data(int fd, off_t offset, off_t amount);
+extern ssize_t pg_pwritev_with_retry(int fd,
+ const struct iovec *iov,
+ int iovcnt,
+ off_t offset);
+extern int pg_truncate(const char *path, off_t length);
+extern void fsync_fname(const char *fname, bool isdir);
+extern int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
+extern int durable_rename(const char *oldfile, const char *newfile, int loglevel);
+extern int durable_unlink(const char *fname, int loglevel);
+extern int durable_rename_excl(const char *oldfile, const char *newfile, int loglevel);
+extern void SyncDataDirectory(void);
+extern int data_sync_elevel(int elevel);
+
+/* Filename components */
+#define PG_TEMP_FILES_DIR "pgsql_tmp"
+#define PG_TEMP_FILE_PREFIX "pgsql_tmp"
+
+#endif /* FD_H */
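The conventions in the header comment (AllocateFile/FreeFile instead of fopen/fclose, so the descriptor is tracked and cleaned up automatically) look like the sketch below in backend code. The path and the processing loop are placeholders; error reporting follows the usual ereport style.

    /* Sketch: short-lived stdio access through fd.c's managed wrappers. */
    static void
    read_text_file(const char *path)
    {
        FILE       *fp;
        char        line[1024];

        fp = AllocateFile(path, "r");
        if (fp == NULL)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not open file \"%s\": %m", path)));

        while (fgets(line, sizeof(line), fp) != NULL)
        {
            /* ... process one line ... */
        }

        FreeFile(fp);
    }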
diff --git a/src/include/storage/fileset.h b/src/include/storage/fileset.h
new file mode 100644
index 0000000..ad37884
--- /dev/null
+++ b/src/include/storage/fileset.h
@@ -0,0 +1,40 @@
+/*-------------------------------------------------------------------------
+ *
+ * fileset.h
+ * Management of named temporary files.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/fileset.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef FILESET_H
+#define FILESET_H
+
+#include "storage/fd.h"
+
+/*
+ * A set of temporary files.
+ */
+typedef struct FileSet
+{
+ pid_t creator_pid; /* PID of the creating process */
+ uint32 number; /* per-PID identifier */
+ int ntablespaces; /* number of tablespaces to use */
+ Oid tablespaces[8]; /* OIDs of tablespaces to use. Assumes that
+ * it's rare that there are more than 8 temp
+ * tablespaces. */
+} FileSet;
+
+extern void FileSetInit(FileSet *fileset);
+extern File FileSetCreate(FileSet *fileset, const char *name);
+extern File FileSetOpen(FileSet *fileset, const char *name,
+ int mode);
+extern bool FileSetDelete(FileSet *fileset, const char *name,
+ bool error_on_failure);
+extern void FileSetDeleteAll(FileSet *fileset);
+
+#endif
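A FileSet ties named temporary files to the fd.h File API. The sketch below assumes backend code where the FileSet lives in (shared) memory visible to the participants; the file name, payload, and error convention are illustrative, and wait_event_info should be a value from utils/wait_event.h.

    /* Sketch: initialize a set, create a named temp file in it, write to it. */
    static void
    fileset_demo(FileSet *fileset, uint32 wait_event_info)
    {
        File        file;
        char        payload[] = "hello";

        FileSetInit(fileset);
        file = FileSetCreate(fileset, "scratch");
        if (file <= 0)
            elog(ERROR, "could not create fileset file");

        (void) FileWrite(file, payload, sizeof(payload) - 1, 0, wait_event_info);
        FileClose(file);
        /* other backends can later FileSetOpen(fileset, "scratch", O_RDONLY) */
    }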
diff --git a/src/include/storage/freespace.h b/src/include/storage/freespace.h
new file mode 100644
index 0000000..dcc40eb
--- /dev/null
+++ b/src/include/storage/freespace.h
@@ -0,0 +1,39 @@
+/*-------------------------------------------------------------------------
+ *
+ * freespace.h
+ * POSTGRES free space map for quickly finding free space in relations
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/freespace.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef FREESPACE_H_
+#define FREESPACE_H_
+
+#include "storage/block.h"
+#include "storage/relfilenode.h"
+#include "utils/relcache.h"
+
+/* prototypes for public functions in freespace.c */
+extern Size GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk);
+extern BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded);
+extern BlockNumber RecordAndGetPageWithFreeSpace(Relation rel,
+ BlockNumber oldPage,
+ Size oldSpaceAvail,
+ Size spaceNeeded);
+extern void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk,
+ Size spaceAvail);
+extern void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
+ Size spaceAvail);
+
+extern BlockNumber FreeSpaceMapPrepareTruncateRel(Relation rel,
+ BlockNumber nblocks);
+extern void FreeSpaceMapVacuum(Relation rel);
+extern void FreeSpaceMapVacuumRange(Relation rel, BlockNumber start,
+ BlockNumber end);
+
+#endif /* FREESPACE_H_ */
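The intended consumer pattern (compare RelationGetBufferForTuple in the heap code) is to ask the FSM for a candidate page, check how much space it really has, and feed the observed value back so the map stays accurate. The sketch below elides buffer pinning and locking; actual_free_space_on() is a hypothetical helper standing in for reading the page and calling PageGetHeapFreeSpace.

    /* Sketch of the FSM retry loop. */
    static BlockNumber
    find_block_with_space(Relation rel, Size needed)
    {
        BlockNumber targetBlock = GetPageWithFreeSpace(rel, needed);

        while (targetBlock != InvalidBlockNumber)
        {
            Size        actual = actual_free_space_on(rel, targetBlock);    /* hypothetical */

            if (actual >= needed)
                return targetBlock;

            /* page was fuller than the FSM thought; record truth and retry */
            targetBlock = RecordAndGetPageWithFreeSpace(rel, targetBlock,
                                                        actual, needed);
        }
        return InvalidBlockNumber;      /* caller must extend the relation */
    }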
diff --git a/src/include/storage/fsm_internals.h b/src/include/storage/fsm_internals.h
new file mode 100644
index 0000000..a6f8372
--- /dev/null
+++ b/src/include/storage/fsm_internals.h
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * fsm_internals.h
+ * internal functions for free space map
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/fsm_internals.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef FSM_INTERNALS_H
+#define FSM_INTERNALS_H
+
+#include "storage/buf.h"
+#include "storage/bufpage.h"
+
+/*
+ * Structure of a FSM page. See src/backend/storage/freespace/README for
+ * details.
+ */
+typedef struct
+{
+ /*
+ * fsm_search_avail() tries to spread the load of multiple backends by
+ * returning different pages to different backends in a round-robin
+ * fashion. fp_next_slot points to the next slot to be returned (assuming
+ * there's enough space on it for the request). It's defined as an int,
+ * because it's updated without an exclusive lock. uint16 would be more
+ * appropriate, but int is more likely to be atomically
+ * fetchable/storable.
+ */
+ int fp_next_slot;
+
+ /*
+ * fp_nodes contains the binary tree, stored in array. The first
+ * NonLeafNodesPerPage elements are upper nodes, and the following
+ * LeafNodesPerPage elements are leaf nodes. Unused nodes are zero.
+ */
+ uint8 fp_nodes[FLEXIBLE_ARRAY_MEMBER];
+} FSMPageData;
+
+typedef FSMPageData *FSMPage;
+
+/*
+ * Number of non-leaf and leaf nodes, and nodes in total, on an FSM page.
+ * These definitions are internal to fsmpage.c.
+ */
+#define NodesPerPage (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \
+ offsetof(FSMPageData, fp_nodes))
+
+#define NonLeafNodesPerPage (BLCKSZ / 2 - 1)
+#define LeafNodesPerPage (NodesPerPage - NonLeafNodesPerPage)
+
+/*
+ * Number of FSM "slots" on a FSM page. This is what should be used
+ * outside fsmpage.c.
+ */
+#define SlotsPerFSMPage LeafNodesPerPage
+
+/* Prototypes for functions in fsmpage.c */
+extern int fsm_search_avail(Buffer buf, uint8 min_cat, bool advancenext,
+ bool exclusive_lock_held);
+extern uint8 fsm_get_avail(Page page, int slot);
+extern uint8 fsm_get_max_avail(Page page);
+extern bool fsm_set_avail(Page page, int slot, uint8 value);
+extern bool fsm_truncate_avail(Page page, int nslots);
+extern bool fsm_rebuild_page(Page page);
+
+#endif /* FSM_INTERNALS_H */
diff --git a/src/include/storage/indexfsm.h b/src/include/storage/indexfsm.h
new file mode 100644
index 0000000..04c1a05
--- /dev/null
+++ b/src/include/storage/indexfsm.h
@@ -0,0 +1,26 @@
+/*-------------------------------------------------------------------------
+ *
+ * indexfsm.h
+ * POSTGRES free space map for quickly finding an unused page in index
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/indexfsm.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef INDEXFSM_H_
+#define INDEXFSM_H_
+
+#include "storage/block.h"
+#include "utils/relcache.h"
+
+extern BlockNumber GetFreeIndexPage(Relation rel);
+extern void RecordFreeIndexPage(Relation rel, BlockNumber page);
+extern void RecordUsedIndexPage(Relation rel, BlockNumber page);
+
+extern void IndexFreeSpaceMapVacuum(Relation rel);
+
+#endif /* INDEXFSM_H_ */
diff --git a/src/include/storage/ipc.h b/src/include/storage/ipc.h
new file mode 100644
index 0000000..fade4db
--- /dev/null
+++ b/src/include/storage/ipc.h
@@ -0,0 +1,84 @@
+/*-------------------------------------------------------------------------
+ *
+ * ipc.h
+ * POSTGRES inter-process communication definitions.
+ *
+ * This file is misnamed, as it no longer has much of anything directly
+ * to do with IPC. The functionality here is concerned with managing
+ * exit-time cleanup for either a postmaster or a backend.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/ipc.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef IPC_H
+#define IPC_H
+
+typedef void (*pg_on_exit_callback) (int code, Datum arg);
+typedef void (*shmem_startup_hook_type) (void);
+
+/*----------
+ * API for handling cleanup that must occur during either ereport(ERROR)
+ * or ereport(FATAL) exits from a block of code. (Typical examples are
+ * undoing transient changes to shared-memory state.)
+ *
+ * PG_ENSURE_ERROR_CLEANUP(cleanup_function, arg);
+ * {
+ * ... code that might throw ereport(ERROR) or ereport(FATAL) ...
+ * }
+ * PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg);
+ *
+ * where the cleanup code is in a function declared per pg_on_exit_callback.
+ * The Datum value "arg" can carry any information the cleanup function
+ * needs.
+ *
+ * This construct ensures that cleanup_function() will be called during
+ * either ERROR or FATAL exits. It will not be called on successful
+ * exit from the controlled code. (If you want it to happen then too,
+ * call the function yourself from just after the construct.)
+ *
+ * Note: the macro arguments are multiply evaluated, so avoid side-effects.
+ *----------
+ */
+#define PG_ENSURE_ERROR_CLEANUP(cleanup_function, arg) \
+ do { \
+ before_shmem_exit(cleanup_function, arg); \
+ PG_TRY()
+
+#define PG_END_ENSURE_ERROR_CLEANUP(cleanup_function, arg) \
+ cancel_before_shmem_exit(cleanup_function, arg); \
+ PG_CATCH(); \
+ { \
+ cancel_before_shmem_exit(cleanup_function, arg); \
+ cleanup_function (0, arg); \
+ PG_RE_THROW(); \
+ } \
+ PG_END_TRY(); \
+ } while (0)
+
+
+/* ipc.c */
+extern PGDLLIMPORT bool proc_exit_inprogress;
+extern PGDLLIMPORT bool shmem_exit_inprogress;
+
+extern void proc_exit(int code) pg_attribute_noreturn();
+extern void shmem_exit(int code);
+extern void on_proc_exit(pg_on_exit_callback function, Datum arg);
+extern void on_shmem_exit(pg_on_exit_callback function, Datum arg);
+extern void before_shmem_exit(pg_on_exit_callback function, Datum arg);
+extern void cancel_before_shmem_exit(pg_on_exit_callback function, Datum arg);
+extern void on_exit_reset(void);
+extern void check_on_shmem_exit_lists_are_empty(void);
+
+/* ipci.c */
+extern PGDLLIMPORT shmem_startup_hook_type shmem_startup_hook;
+
+extern Size CalculateShmemSize(int *num_semaphores);
+extern void CreateSharedMemoryAndSemaphores(void);
+extern void InitializeShmemGUCs(void);
+
+#endif /* IPC_H */
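Concretely, the PG_ENSURE_ERROR_CLEANUP pattern described above looks like the sketch below. MySharedState and its busy flag are placeholders; the important points are that the callback has the pg_on_exit_callback signature and that, on successful exit, the callback is not run, so the caller undoes the transient state itself.

    /* Sketch: undo a transient shared-memory change on ERROR/FATAL exit. */
    static void
    reset_busy_flag(int code, Datum arg)
    {
        MySharedState *state = (MySharedState *) DatumGetPointer(arg);

        state->busy = false;
    }

    static void
    do_guarded_work(MySharedState *state)
    {
        state->busy = true;

        PG_ENSURE_ERROR_CLEANUP(reset_busy_flag, PointerGetDatum(state));
        {
            /* ... code that might ereport(ERROR) or ereport(FATAL) ... */
        }
        PG_END_ENSURE_ERROR_CLEANUP(reset_busy_flag, PointerGetDatum(state));

        /* Not reached on error; on success the callback did not run. */
        state->busy = false;
    }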
diff --git a/src/include/storage/item.h b/src/include/storage/item.h
new file mode 100644
index 0000000..6f3eaeb
--- /dev/null
+++ b/src/include/storage/item.h
@@ -0,0 +1,19 @@
+/*-------------------------------------------------------------------------
+ *
+ * item.h
+ * POSTGRES disk item definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/item.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ITEM_H
+#define ITEM_H
+
+typedef Pointer Item;
+
+#endif /* ITEM_H */
diff --git a/src/include/storage/itemid.h b/src/include/storage/itemid.h
new file mode 100644
index 0000000..e33637f
--- /dev/null
+++ b/src/include/storage/itemid.h
@@ -0,0 +1,184 @@
+/*-------------------------------------------------------------------------
+ *
+ * itemid.h
+ * Standard POSTGRES buffer page item identifier/line pointer definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/itemid.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ITEMID_H
+#define ITEMID_H
+
+/*
+ * A line pointer on a buffer page. See buffer page definitions and comments
+ * for an explanation of how line pointers are used.
+ *
+ * In some cases a line pointer is "in use" but does not have any associated
+ * storage on the page. By convention, lp_len == 0 in every line pointer
+ * that does not have storage, independently of its lp_flags state.
+ */
+typedef struct ItemIdData
+{
+ unsigned lp_off:15, /* offset to tuple (from start of page) */
+ lp_flags:2, /* state of line pointer, see below */
+ lp_len:15; /* byte length of tuple */
+} ItemIdData;
+
+typedef ItemIdData *ItemId;
+
+/*
+ * lp_flags has these possible states. An UNUSED line pointer is available
+ * for immediate re-use, the other states are not.
+ */
+#define LP_UNUSED 0 /* unused (should always have lp_len=0) */
+#define LP_NORMAL 1 /* used (should always have lp_len>0) */
+#define LP_REDIRECT 2 /* HOT redirect (should have lp_len=0) */
+#define LP_DEAD 3 /* dead, may or may not have storage */
+
+/*
+ * Item offsets and lengths are represented by these types when
+ * they're not actually stored in an ItemIdData.
+ */
+typedef uint16 ItemOffset;
+typedef uint16 ItemLength;
+
+
+/* ----------------
+ * support macros
+ * ----------------
+ */
+
+/*
+ * ItemIdGetLength
+ */
+#define ItemIdGetLength(itemId) \
+ ((itemId)->lp_len)
+
+/*
+ * ItemIdGetOffset
+ */
+#define ItemIdGetOffset(itemId) \
+ ((itemId)->lp_off)
+
+/*
+ * ItemIdGetFlags
+ */
+#define ItemIdGetFlags(itemId) \
+ ((itemId)->lp_flags)
+
+/*
+ * ItemIdGetRedirect
+ * In a REDIRECT pointer, lp_off holds offset number for next line pointer
+ */
+#define ItemIdGetRedirect(itemId) \
+ ((itemId)->lp_off)
+
+/*
+ * ItemIdIsValid
+ * True iff item identifier is valid.
+ * This is a pretty weak test, probably useful only in Asserts.
+ */
+#define ItemIdIsValid(itemId) PointerIsValid(itemId)
+
+/*
+ * ItemIdIsUsed
+ * True iff item identifier is in use.
+ */
+#define ItemIdIsUsed(itemId) \
+ ((itemId)->lp_flags != LP_UNUSED)
+
+/*
+ * ItemIdIsNormal
+ * True iff item identifier is in state NORMAL.
+ */
+#define ItemIdIsNormal(itemId) \
+ ((itemId)->lp_flags == LP_NORMAL)
+
+/*
+ * ItemIdIsRedirected
+ * True iff item identifier is in state REDIRECT.
+ */
+#define ItemIdIsRedirected(itemId) \
+ ((itemId)->lp_flags == LP_REDIRECT)
+
+/*
+ * ItemIdIsDead
+ * True iff item identifier is in state DEAD.
+ */
+#define ItemIdIsDead(itemId) \
+ ((itemId)->lp_flags == LP_DEAD)
+
+/*
+ * ItemIdHasStorage
+ * True iff item identifier has associated storage.
+ */
+#define ItemIdHasStorage(itemId) \
+ ((itemId)->lp_len != 0)
+
+/*
+ * ItemIdSetUnused
+ * Set the item identifier to be UNUSED, with no storage.
+ * Beware of multiple evaluations of itemId!
+ */
+#define ItemIdSetUnused(itemId) \
+( \
+ (itemId)->lp_flags = LP_UNUSED, \
+ (itemId)->lp_off = 0, \
+ (itemId)->lp_len = 0 \
+)
+
+/*
+ * ItemIdSetNormal
+ * Set the item identifier to be NORMAL, with the specified storage.
+ * Beware of multiple evaluations of itemId!
+ */
+#define ItemIdSetNormal(itemId, off, len) \
+( \
+ (itemId)->lp_flags = LP_NORMAL, \
+ (itemId)->lp_off = (off), \
+ (itemId)->lp_len = (len) \
+)
+
+/*
+ * ItemIdSetRedirect
+ * Set the item identifier to be REDIRECT, with the specified link.
+ * Beware of multiple evaluations of itemId!
+ */
+#define ItemIdSetRedirect(itemId, link) \
+( \
+ (itemId)->lp_flags = LP_REDIRECT, \
+ (itemId)->lp_off = (link), \
+ (itemId)->lp_len = 0 \
+)
+
+/*
+ * ItemIdSetDead
+ * Set the item identifier to be DEAD, with no storage.
+ * Beware of multiple evaluations of itemId!
+ */
+#define ItemIdSetDead(itemId) \
+( \
+ (itemId)->lp_flags = LP_DEAD, \
+ (itemId)->lp_off = 0, \
+ (itemId)->lp_len = 0 \
+)
+
+/*
+ * ItemIdMarkDead
+ * Set the item identifier to be DEAD, keeping its existing storage.
+ *
+ * Note: in indexes, this is used as if it were a hint-bit mechanism;
+ * we trust that multiple processors can do this in parallel and get
+ * the same result.
+ */
+#define ItemIdMarkDead(itemId) \
+( \
+ (itemId)->lp_flags = LP_DEAD \
+)
+
+#endif /* ITEMID_H */
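In practice these macros are used while walking a page's line pointer array, as sketched below. The helpers PageGetMaxOffsetNumber/PageGetItemId come from bufpage.h and FirstOffsetNumber/OffsetNumberNext from off.h; the function itself is illustrative.

    /* Sketch: count the LP_NORMAL line pointers on a page. */
    static int
    count_normal_items(Page page)
    {
        OffsetNumber offnum,
                    maxoff = PageGetMaxOffsetNumber(page);
        int         nnormal = 0;

        for (offnum = FirstOffsetNumber;
             offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            ItemId      itemid = PageGetItemId(page, offnum);

            if (!ItemIdIsUsed(itemid))
                continue;               /* LP_UNUSED: available for reuse */
            if (ItemIdIsRedirected(itemid) || ItemIdIsDead(itemid))
                continue;               /* no live tuple to count */

            Assert(ItemIdIsNormal(itemid) && ItemIdHasStorage(itemid));
            nnormal++;
        }
        return nnormal;
    }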
diff --git a/src/include/storage/itemptr.h b/src/include/storage/itemptr.h
new file mode 100644
index 0000000..81947bc
--- /dev/null
+++ b/src/include/storage/itemptr.h
@@ -0,0 +1,208 @@
+/*-------------------------------------------------------------------------
+ *
+ * itemptr.h
+ * POSTGRES disk item pointer definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/itemptr.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef ITEMPTR_H
+#define ITEMPTR_H
+
+#include "storage/block.h"
+#include "storage/off.h"
+
+/*
+ * ItemPointer:
+ *
+ * This is a pointer to an item within a disk page of a known file
+ * (for example, a cross-link from an index to its parent table).
+ * ip_blkid tells us which block, ip_posid tells us which entry in
+ * the linp (ItemIdData) array we want.
+ *
+ * Note: because there is an item pointer in each tuple header and index
+ * tuple header on disk, it's very important not to waste space with
+ * structure padding bytes. The struct is designed to be six bytes long
+ * (it contains three int16 fields) but a few compilers will pad it to
+ * eight bytes unless coerced. We apply appropriate persuasion where
+ * possible. If your compiler can't be made to play along, you'll waste
+ * lots of space.
+ */
+typedef struct ItemPointerData
+{
+ BlockIdData ip_blkid;
+ OffsetNumber ip_posid;
+}
+
+/* If compiler understands packed and aligned pragmas, use those */
+#if defined(pg_attribute_packed) && defined(pg_attribute_aligned)
+ pg_attribute_packed()
+ pg_attribute_aligned(2)
+#endif
+ItemPointerData;
+
+typedef ItemPointerData *ItemPointer;
+
+/* ----------------
+ * special values used in heap tuples (t_ctid)
+ * ----------------
+ */
+
+/*
+ * If a heap tuple holds a speculative insertion token rather than a real
+ * TID, ip_posid is set to SpecTokenOffsetNumber, and the token is stored in
+ * ip_blkid. SpecTokenOffsetNumber must be higher than MaxOffsetNumber, so
+ * that it can be distinguished from a valid offset number in a regular item
+ * pointer.
+ */
+#define SpecTokenOffsetNumber 0xfffe
+
+/*
+ * When a tuple is moved to a different partition by UPDATE, the t_ctid of
+ * the old tuple version is set to this magic value.
+ */
+#define MovedPartitionsOffsetNumber 0xfffd
+#define MovedPartitionsBlockNumber InvalidBlockNumber
+
+
+/* ----------------
+ * support macros
+ * ----------------
+ */
+
+/*
+ * ItemPointerIsValid
+ * True iff the disk item pointer is not NULL.
+ */
+#define ItemPointerIsValid(pointer) \
+ ((bool) (PointerIsValid(pointer) && ((pointer)->ip_posid != 0)))
+
+/*
+ * ItemPointerGetBlockNumberNoCheck
+ * Returns the block number of a disk item pointer.
+ */
+#define ItemPointerGetBlockNumberNoCheck(pointer) \
+( \
+ BlockIdGetBlockNumber(&(pointer)->ip_blkid) \
+)
+
+/*
+ * ItemPointerGetBlockNumber
+ * As above, but verifies that the item pointer looks valid.
+ */
+#define ItemPointerGetBlockNumber(pointer) \
+( \
+ AssertMacro(ItemPointerIsValid(pointer)), \
+ ItemPointerGetBlockNumberNoCheck(pointer) \
+)
+
+/*
+ * ItemPointerGetOffsetNumberNoCheck
+ * Returns the offset number of a disk item pointer.
+ */
+#define ItemPointerGetOffsetNumberNoCheck(pointer) \
+( \
+ (pointer)->ip_posid \
+)
+
+/*
+ * ItemPointerGetOffsetNumber
+ * As above, but verifies that the item pointer looks valid.
+ */
+#define ItemPointerGetOffsetNumber(pointer) \
+( \
+ AssertMacro(ItemPointerIsValid(pointer)), \
+ ItemPointerGetOffsetNumberNoCheck(pointer) \
+)
+
+/*
+ * ItemPointerSet
+ * Sets a disk item pointer to the specified block and offset.
+ */
+#define ItemPointerSet(pointer, blockNumber, offNum) \
+( \
+ AssertMacro(PointerIsValid(pointer)), \
+ BlockIdSet(&((pointer)->ip_blkid), blockNumber), \
+ (pointer)->ip_posid = offNum \
+)
+
+/*
+ * ItemPointerSetBlockNumber
+ * Sets a disk item pointer to the specified block.
+ */
+#define ItemPointerSetBlockNumber(pointer, blockNumber) \
+( \
+ AssertMacro(PointerIsValid(pointer)), \
+ BlockIdSet(&((pointer)->ip_blkid), blockNumber) \
+)
+
+/*
+ * ItemPointerSetOffsetNumber
+ * Sets a disk item pointer to the specified offset.
+ */
+#define ItemPointerSetOffsetNumber(pointer, offsetNumber) \
+( \
+ AssertMacro(PointerIsValid(pointer)), \
+ (pointer)->ip_posid = (offsetNumber) \
+)
+
+/*
+ * ItemPointerCopy
+ * Copies the contents of one disk item pointer to another.
+ *
+ * Should there ever be padding in an ItemPointer this would need to be handled
+ * differently as it's used as hash key.
+ */
+#define ItemPointerCopy(fromPointer, toPointer) \
+( \
+ AssertMacro(PointerIsValid(toPointer)), \
+ AssertMacro(PointerIsValid(fromPointer)), \
+ *(toPointer) = *(fromPointer) \
+)
+
+/*
+ * ItemPointerSetInvalid
+ * Sets a disk item pointer to be invalid.
+ */
+#define ItemPointerSetInvalid(pointer) \
+( \
+ AssertMacro(PointerIsValid(pointer)), \
+ BlockIdSet(&((pointer)->ip_blkid), InvalidBlockNumber), \
+ (pointer)->ip_posid = InvalidOffsetNumber \
+)
+
+/*
+ * ItemPointerIndicatesMovedPartitions
+ * True iff the block number indicates the tuple has moved to another
+ * partition.
+ */
+#define ItemPointerIndicatesMovedPartitions(pointer) \
+( \
+ ItemPointerGetOffsetNumber(pointer) == MovedPartitionsOffsetNumber && \
+ ItemPointerGetBlockNumberNoCheck(pointer) == MovedPartitionsBlockNumber \
+)
+
+/*
+ * ItemPointerSetMovedPartitions
+ * Indicate that the item referenced by the itempointer has moved into a
+ * different partition.
+ */
+#define ItemPointerSetMovedPartitions(pointer) \
+ ItemPointerSet((pointer), MovedPartitionsBlockNumber, MovedPartitionsOffsetNumber)
+
+/* ----------------
+ * externs
+ * ----------------
+ */
+
+extern bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2);
+extern int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2);
+extern void ItemPointerInc(ItemPointer pointer);
+extern void ItemPointerDec(ItemPointer pointer);
+
+#endif /* ITEMPTR_H */
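A small sketch of building and decoding a TID with the macros above; the block and offset values are arbitrary. The StaticAssertStmt merely illustrates the size concern from the struct comment (6 bytes when the compiler honors the packing attributes, 8 otherwise).

    static void
    itemptr_demo(void)
    {
        ItemPointerData tid;

        ItemPointerSet(&tid, (BlockNumber) 42, (OffsetNumber) 7);

        Assert(ItemPointerIsValid(&tid));
        Assert(ItemPointerGetBlockNumber(&tid) == 42);
        Assert(ItemPointerGetOffsetNumber(&tid) == 7);

        StaticAssertStmt(sizeof(ItemPointerData) == 6 || sizeof(ItemPointerData) == 8,
                         "unexpected ItemPointerData size");

        ItemPointerSetInvalid(&tid);
        Assert(!ItemPointerIsValid(&tid));
    }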
diff --git a/src/include/storage/large_object.h b/src/include/storage/large_object.h
new file mode 100644
index 0000000..b826a7d
--- /dev/null
+++ b/src/include/storage/large_object.h
@@ -0,0 +1,100 @@
+/*-------------------------------------------------------------------------
+ *
+ * large_object.h
+ * Declarations for PostgreSQL large objects. POSTGRES 4.2 supported
+ * zillions of large objects (internal, external, jaquith, inversion).
+ * Now we only support inversion.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/large_object.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LARGE_OBJECT_H
+#define LARGE_OBJECT_H
+
+#include "utils/snapshot.h"
+
+
+/*----------
+ * Data about a currently-open large object.
+ *
+ * id is the logical OID of the large object
+ * snapshot is the snapshot to use for read/write operations
+ * subid is the subtransaction that opened the desc (or currently owns it)
+ * offset is the current seek offset within the LO
+ * flags contains some flag bits
+ *
+ * NOTE: as of v11, permission checks are made when the large object is
+ * opened; therefore IFS_RDLOCK/IFS_WRLOCK indicate that read or write mode
+ * has been requested *and* the corresponding permission has been checked.
+ *
+ * NOTE: before 7.1, we also had to store references to the separate table
+ * and index of a specific large object. Now they all live in pg_largeobject
+ * and are accessed via a common relation descriptor.
+ *----------
+ */
+typedef struct LargeObjectDesc
+{
+ Oid id; /* LO's identifier */
+ Snapshot snapshot; /* snapshot to use */
+ SubTransactionId subid; /* owning subtransaction ID */
+ uint64 offset; /* current seek pointer */
+ int flags; /* see flag bits below */
+
+/* bits in flags: */
+#define IFS_RDLOCK (1 << 0) /* LO was opened for reading */
+#define IFS_WRLOCK (1 << 1) /* LO was opened for writing */
+
+} LargeObjectDesc;
+
+
+/*
+ * Each "page" (tuple) of a large object can hold this much data
+ *
+ * We could set this as high as BLCKSZ less some overhead, but it seems
+ * better to make it a smaller value, so that not as much space is used
+ * up when a page-tuple is updated. Note that the value is deliberately
+ * chosen large enough to trigger the tuple toaster, so that we will
+ * attempt to compress page tuples in-line. (But they won't be moved off
+ * unless the user creates a toast-table for pg_largeobject...)
+ *
+ * Also, it seems to be a smart move to make the page size be a power of 2,
+ * since clients will often be written to send data in power-of-2 blocks.
+ * This avoids unnecessary tuple updates caused by partial-page writes.
+ *
+ * NB: Changing LOBLKSIZE requires an initdb.
+ */
+#define LOBLKSIZE (BLCKSZ / 4)
+
+/*
+ * Maximum length in bytes for a large object. To make this larger, we'd
+ * have to widen pg_largeobject.pageno as well as various internal variables.
+ */
+#define MAX_LARGE_OBJECT_SIZE ((int64) INT_MAX * LOBLKSIZE)
+
+
+/*
+ * GUC: backwards-compatibility flag to suppress LO permission checks
+ */
+extern PGDLLIMPORT bool lo_compat_privileges;
+
+/*
+ * Function definitions...
+ */
+
+/* inversion stuff in inv_api.c */
+extern void close_lo_relation(bool isCommit);
+extern Oid inv_create(Oid lobjId);
+extern LargeObjectDesc *inv_open(Oid lobjId, int flags, MemoryContext mcxt);
+extern void inv_close(LargeObjectDesc *obj_desc);
+extern int inv_drop(Oid lobjId);
+extern int64 inv_seek(LargeObjectDesc *obj_desc, int64 offset, int whence);
+extern int64 inv_tell(LargeObjectDesc *obj_desc);
+extern int inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes);
+extern int inv_write(LargeObjectDesc *obj_desc, const char *buf, int nbytes);
+extern void inv_truncate(LargeObjectDesc *obj_desc, int64 len);
+
+#endif /* LARGE_OBJECT_H */
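A sketch of reading a large object through the inv_* API follows. It assumes backend code where INV_READ is available from libpq/libpq-fs.h and where CurrentMemoryContext is an acceptable context for the descriptor; the function and its error handling are simplified relative to what the lo_* SQL functions do.

    #include "libpq/libpq-fs.h"

    /* Sketch: read the first nbytes of a large object into buf. */
    static int
    read_lo_prefix(Oid loid, char *buf, int nbytes)
    {
        LargeObjectDesc *lod;
        int         nread;

        lod = inv_open(loid, INV_READ, CurrentMemoryContext);
        nread = inv_read(lod, buf, nbytes);
        inv_close(lod);

        return nread;
    }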
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
new file mode 100644
index 0000000..68ab740
--- /dev/null
+++ b/src/include/storage/latch.h
@@ -0,0 +1,186 @@
+/*-------------------------------------------------------------------------
+ *
+ * latch.h
+ * Routines for interprocess latches
+ *
+ * A latch is a boolean variable, with operations that let processes sleep
+ * until it is set. A latch can be set from another process, or a signal
+ * handler within the same process.
+ *
+ * The latch interface is a reliable replacement for the common pattern of
+ * using pg_usleep() or select() to wait until a signal arrives, where the
+ * signal handler sets a flag variable. Because on some platforms an
+ * incoming signal doesn't interrupt sleep, and even on platforms where it
+ * does there is a race condition if the signal arrives just before
+ * entering the sleep, the common pattern must periodically wake up and
+ * poll the flag variable. The pselect() system call was invented to solve
+ * this problem, but it is not portable enough. Latches are designed to
+ * overcome these limitations, allowing you to sleep without polling and
+ * ensuring quick response to signals from other processes.
+ *
+ * There are two kinds of latches: local and shared. A local latch is
+ * initialized by InitLatch, and can only be set from the same process.
+ * A local latch can be used to wait for a signal to arrive, by calling
+ * SetLatch in the signal handler. A shared latch resides in shared memory,
+ * and must be initialized at postmaster startup by InitSharedLatch. Before
+ * a shared latch can be waited on, it must be associated with a process
+ * with OwnLatch. Only the process owning the latch can wait on it, but any
+ * process can set it.
+ *
+ * There are three basic operations on a latch:
+ *
+ * SetLatch - Sets the latch
+ * ResetLatch - Clears the latch, allowing it to be set again
+ * WaitLatch - Waits for the latch to become set
+ *
+ * WaitLatch includes a provision for timeouts (which should be avoided
+ * when possible, as they incur extra overhead) and a provision for
+ * postmaster child processes to wake up immediately on postmaster death.
+ * See latch.c for detailed specifications for the exported functions.
+ *
+ * The correct pattern to wait for event(s) is:
+ *
+ * for (;;)
+ * {
+ * ResetLatch();
+ * if (work to do)
+ * Do Stuff();
+ * WaitLatch();
+ * }
+ *
+ * It's important to reset the latch *before* checking if there's work to
+ * do. Otherwise, if someone sets the latch between the check and the
+ * ResetLatch call, you will miss it and Wait will incorrectly block.
+ *
+ * Another valid coding pattern looks like:
+ *
+ * for (;;)
+ * {
+ * if (work to do)
+ * Do Stuff(); // in particular, exit loop if some condition satisfied
+ * WaitLatch();
+ * ResetLatch();
+ * }
+ *
+ * This is useful to reduce latch traffic if it's expected that the loop's
+ * termination condition will often be satisfied in the first iteration;
+ * the cost is an extra loop iteration before blocking when it is not.
+ * What must be avoided is placing any checks for asynchronous events after
+ * WaitLatch and before ResetLatch, as that creates a race condition.
+ *
+ * To wake up the waiter, you must first set a global flag or something
+ * else that the wait loop tests in the "if (work to do)" part, and call
+ * SetLatch *after* that. SetLatch is designed to return quickly if the
+ * latch is already set.
+ *
+ * On some platforms, signals will not interrupt the latch wait primitive
+ * by themselves. Therefore, it is critical that any signal handler that
+ * is meant to terminate a WaitLatch wait calls SetLatch.
+ *
+ * Note that use of the process latch (PGPROC.procLatch) is generally better
+ * than an ad-hoc shared latch for signaling auxiliary processes. This is
+ * because generic signal handlers will call SetLatch on the process latch
+ * only, so using any latch other than the process latch effectively precludes
+ * use of any generic handler.
+ *
+ *
+ * WaitEventSets allow waiting for latches being set and for additional events
+ * - currently postmaster death and readiness of several sockets - at the
+ * same time. On many platforms using a long-lived event set is more
+ * efficient than using WaitLatch or WaitLatchOrSocket.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/latch.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LATCH_H
+#define LATCH_H
+
+#include <signal.h>
+
+/*
+ * Latch structure should be treated as opaque and only accessed through
+ * the public functions. It is defined here to allow embedding Latches as
+ * part of bigger structs.
+ */
+typedef struct Latch
+{
+ sig_atomic_t is_set;
+ sig_atomic_t maybe_sleeping;
+ bool is_shared;
+ int owner_pid;
+#ifdef WIN32
+ HANDLE event;
+#endif
+} Latch;
+
+/*
+ * Bitmasks for events that may wake up WaitLatch(), WaitLatchOrSocket(), or
+ * WaitEventSetWait().
+ */
+#define WL_LATCH_SET (1 << 0)
+#define WL_SOCKET_READABLE (1 << 1)
+#define WL_SOCKET_WRITEABLE (1 << 2)
+#define WL_TIMEOUT (1 << 3) /* not for WaitEventSetWait() */
+#define WL_POSTMASTER_DEATH (1 << 4)
+#define WL_EXIT_ON_PM_DEATH (1 << 5)
+#ifdef WIN32
+#define WL_SOCKET_CONNECTED (1 << 6)
+#else
+/* avoid having to deal with case on platforms not requiring it */
+#define WL_SOCKET_CONNECTED WL_SOCKET_WRITEABLE
+#endif
+#define WL_SOCKET_CLOSED (1 << 7)
+#define WL_SOCKET_MASK (WL_SOCKET_READABLE | \
+ WL_SOCKET_WRITEABLE | \
+ WL_SOCKET_CONNECTED | \
+ WL_SOCKET_CLOSED)
+
+typedef struct WaitEvent
+{
+ int pos; /* position in the event data structure */
+ uint32 events; /* triggered events */
+ pgsocket fd; /* socket fd associated with event */
+ void *user_data; /* pointer provided in AddWaitEventToSet */
+#ifdef WIN32
+ bool reset; /* Is reset of the event required? */
+#endif
+} WaitEvent;
+
+/* forward declaration to avoid exposing latch.c implementation details */
+typedef struct WaitEventSet WaitEventSet;
+
+/*
+ * prototypes for functions in latch.c
+ */
+extern void InitializeLatchSupport(void);
+extern void InitLatch(Latch *latch);
+extern void InitSharedLatch(Latch *latch);
+extern void OwnLatch(Latch *latch);
+extern void DisownLatch(Latch *latch);
+extern void SetLatch(Latch *latch);
+extern void ResetLatch(Latch *latch);
+extern void ShutdownLatchSupport(void);
+
+extern WaitEventSet *CreateWaitEventSet(MemoryContext context, int nevents);
+extern void FreeWaitEventSet(WaitEventSet *set);
+extern int AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd,
+ Latch *latch, void *user_data);
+extern void ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch);
+
+extern int WaitEventSetWait(WaitEventSet *set, long timeout,
+ WaitEvent *occurred_events, int nevents,
+ uint32 wait_event_info);
+extern int WaitLatch(Latch *latch, int wakeEvents, long timeout,
+ uint32 wait_event_info);
+extern int WaitLatchOrSocket(Latch *latch, int wakeEvents,
+ pgsocket sock, long timeout, uint32 wait_event_info);
+extern void InitializeLatchWaitSet(void);
+extern int GetNumRegisteredWaitEvents(WaitEventSet *set);
+extern bool WaitEventSetCanReportClosed(void);
+
+#endif /* LATCH_H */
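The recommended wait loop from the header comment, spelled out as code: the sketch below uses the process latch MyLatch (declared in miscadmin.h), work_available() and do_work() are hypothetical placeholders for the flag that the signaling side sets before calling SetLatch and the work it requests, and wait_event_info is expected to be a value from utils/wait_event.h.

    static void
    main_wait_loop(uint32 wait_event_info)
    {
        for (;;)
        {
            ResetLatch(MyLatch);        /* reset *before* checking for work */

            if (work_available())       /* hypothetical flag set before SetLatch */
                do_work();              /* hypothetical */

            (void) WaitLatch(MyLatch,
                             WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
                             -1L,       /* no timeout */
                             wait_event_info);
            CHECK_FOR_INTERRUPTS();
        }
    }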
diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h
new file mode 100644
index 0000000..be1d2c9
--- /dev/null
+++ b/src/include/storage/lmgr.h
@@ -0,0 +1,115 @@
+/*-------------------------------------------------------------------------
+ *
+ * lmgr.h
+ * POSTGRES lock manager definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/lmgr.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LMGR_H
+#define LMGR_H
+
+#include "lib/stringinfo.h"
+#include "storage/itemptr.h"
+#include "storage/lock.h"
+#include "utils/rel.h"
+
+
+/* XactLockTableWait operations */
+typedef enum XLTW_Oper
+{
+ XLTW_None,
+ XLTW_Update,
+ XLTW_Delete,
+ XLTW_Lock,
+ XLTW_LockUpdated,
+ XLTW_InsertIndex,
+ XLTW_InsertIndexUnique,
+ XLTW_FetchUpdated,
+ XLTW_RecheckExclusionConstr
+} XLTW_Oper;
+
+extern void RelationInitLockInfo(Relation relation);
+
+/* Lock a relation */
+extern void LockRelationOid(Oid relid, LOCKMODE lockmode);
+extern void LockRelationId(LockRelId *relid, LOCKMODE lockmode);
+extern bool ConditionalLockRelationOid(Oid relid, LOCKMODE lockmode);
+extern void UnlockRelationId(LockRelId *relid, LOCKMODE lockmode);
+extern void UnlockRelationOid(Oid relid, LOCKMODE lockmode);
+
+extern void LockRelation(Relation relation, LOCKMODE lockmode);
+extern bool ConditionalLockRelation(Relation relation, LOCKMODE lockmode);
+extern void UnlockRelation(Relation relation, LOCKMODE lockmode);
+extern bool CheckRelationLockedByMe(Relation relation, LOCKMODE lockmode,
+ bool orstronger);
+extern bool LockHasWaitersRelation(Relation relation, LOCKMODE lockmode);
+
+extern void LockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode);
+extern void UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode);
+
+/* Lock a relation for extension */
+extern void LockRelationForExtension(Relation relation, LOCKMODE lockmode);
+extern void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode);
+extern bool ConditionalLockRelationForExtension(Relation relation,
+ LOCKMODE lockmode);
+extern int RelationExtensionLockWaiterCount(Relation relation);
+
+/* Lock to recompute pg_database.datfrozenxid in the current database */
+extern void LockDatabaseFrozenIds(LOCKMODE lockmode);
+
+/* Lock a page (currently only used within indexes) */
+extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
+extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
+extern void UnlockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode);
+
+/* Lock a tuple (see heap_lock_tuple before assuming you understand this) */
+extern void LockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode);
+extern bool ConditionalLockTuple(Relation relation, ItemPointer tid,
+ LOCKMODE lockmode);
+extern void UnlockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode);
+
+/* Lock an XID (used to wait for a transaction to finish) */
+extern void XactLockTableInsert(TransactionId xid);
+extern void XactLockTableDelete(TransactionId xid);
+extern void XactLockTableWait(TransactionId xid, Relation rel,
+ ItemPointer ctid, XLTW_Oper oper);
+extern bool ConditionalXactLockTableWait(TransactionId xid);
+
+/* Lock VXIDs, specified by conflicting locktags */
+extern void WaitForLockers(LOCKTAG heaplocktag, LOCKMODE lockmode, bool progress);
+extern void WaitForLockersMultiple(List *locktags, LOCKMODE lockmode, bool progress);
+
+/* Lock an XID for tuple insertion (used to wait for an insertion to finish) */
+extern uint32 SpeculativeInsertionLockAcquire(TransactionId xid);
+extern void SpeculativeInsertionLockRelease(TransactionId xid);
+extern void SpeculativeInsertionWait(TransactionId xid, uint32 token);
+
+/* Lock a general object (other than a relation) of the current database */
+extern void LockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode);
+extern void UnlockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode);
+
+/* Lock a shared-across-databases object (other than a relation) */
+extern void LockSharedObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode);
+extern void UnlockSharedObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode);
+
+extern void LockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode);
+extern void UnlockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode);
+
+/* Describe a locktag for error messages */
+extern void DescribeLockTag(StringInfo buf, const LOCKTAG *tag);
+
+extern const char *GetLockNameFromTagType(uint16 locktag_type);
+
+#endif /* LMGR_H */
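For reference, the relation-level calls above are normally paired as in this minimal sketch; the helper function is hypothetical, and real backend code usually holds the lock until transaction end rather than unlocking explicitly.

#include "postgres.h"
#include "storage/lmgr.h"

static void
touch_relation(Oid relid)
{
    LockRelationOid(relid, RowExclusiveLock);

    /* ... open and use the relation ... */

    UnlockRelationOid(relid, RowExclusiveLock);
}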
diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h
new file mode 100644
index 0000000..e4e1495
--- /dev/null
+++ b/src/include/storage/lock.h
@@ -0,0 +1,616 @@
+/*-------------------------------------------------------------------------
+ *
+ * lock.h
+ * POSTGRES low-level lock mechanism
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/lock.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LOCK_H_
+#define LOCK_H_
+
+#ifdef FRONTEND
+#error "lock.h may not be included from frontend code"
+#endif
+
+#include "storage/backendid.h"
+#include "storage/lockdefs.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "utils/timestamp.h"
+
+/* struct PGPROC is declared in proc.h, but we must forward-reference it here */
+typedef struct PGPROC PGPROC;
+
+typedef struct PROC_QUEUE
+{
+ SHM_QUEUE links; /* head of list of PGPROC objects */
+ int size; /* number of entries in list */
+} PROC_QUEUE;
+
+/* GUC variables */
+extern PGDLLIMPORT int max_locks_per_xact;
+
+#ifdef LOCK_DEBUG
+extern PGDLLIMPORT int Trace_lock_oidmin;
+extern PGDLLIMPORT bool Trace_locks;
+extern PGDLLIMPORT bool Trace_userlocks;
+extern PGDLLIMPORT int Trace_lock_table;
+extern PGDLLIMPORT bool Debug_deadlocks;
+#endif /* LOCK_DEBUG */
+
+
+/*
+ * Top-level transactions are identified by VirtualTransactionIDs comprising
+ * PGPROC fields backendId and lxid. For recovered prepared transactions, the
+ * LocalTransactionId is an ordinary XID; LOCKTAG_VIRTUALTRANSACTION never
+ * refers to that kind. These are guaranteed unique over the short term, but
+ * will be reused after a database restart or XID wraparound; hence they
+ * should never be stored on disk.
+ *
+ * Note that struct VirtualTransactionId can not be assumed to be atomically
+ * assignable as a whole. However, type LocalTransactionId is assumed to
+ * be atomically assignable, and the backend ID doesn't change often enough
+ * to be a problem, so we can fetch or assign the two fields separately.
+ * We deliberately refrain from using the struct within PGPROC, to prevent
+ * coding errors from trying to use struct assignment with it; instead use
+ * GET_VXID_FROM_PGPROC().
+ */
+typedef struct
+{
+ BackendId backendId; /* backendId from PGPROC */
+ LocalTransactionId localTransactionId; /* lxid from PGPROC */
+} VirtualTransactionId;
+
+#define InvalidLocalTransactionId 0
+#define LocalTransactionIdIsValid(lxid) ((lxid) != InvalidLocalTransactionId)
+#define VirtualTransactionIdIsValid(vxid) \
+ (LocalTransactionIdIsValid((vxid).localTransactionId))
+#define VirtualTransactionIdIsRecoveredPreparedXact(vxid) \
+ ((vxid).backendId == InvalidBackendId)
+#define VirtualTransactionIdEquals(vxid1, vxid2) \
+ ((vxid1).backendId == (vxid2).backendId && \
+ (vxid1).localTransactionId == (vxid2).localTransactionId)
+#define SetInvalidVirtualTransactionId(vxid) \
+ ((vxid).backendId = InvalidBackendId, \
+ (vxid).localTransactionId = InvalidLocalTransactionId)
+#define GET_VXID_FROM_PGPROC(vxid, proc) \
+ ((vxid).backendId = (proc).backendId, \
+ (vxid).localTransactionId = (proc).lxid)
+
+/* MAX_LOCKMODES cannot be larger than the # of bits in LOCKMASK */
+#define MAX_LOCKMODES 10
+
+#define LOCKBIT_ON(lockmode) (1 << (lockmode))
+#define LOCKBIT_OFF(lockmode) (~(1 << (lockmode)))
+
+
+/*
+ * This data structure defines the locking semantics associated with a
+ * "lock method". The semantics specify the meaning of each lock mode
+ * (by defining which lock modes it conflicts with).
+ * All of this data is constant and is kept in const tables.
+ *
+ * numLockModes -- number of lock modes (READ,WRITE,etc) that
+ * are defined in this lock method. Must be less than MAX_LOCKMODES.
+ *
+ * conflictTab -- this is an array of bitmasks showing lock
+ * mode conflicts. conflictTab[i] is a mask with the j-th bit
+ * turned on if lock modes i and j conflict. Lock modes are
+ * numbered 1..numLockModes; conflictTab[0] is unused.
+ *
+ * lockModeNames -- ID strings for debug printouts.
+ *
+ * trace_flag -- pointer to GUC trace flag for this lock method. (The
+ * GUC variable is not constant, but we use "const" here to denote that
+ * it can't be changed through this reference.)
+ */
+typedef struct LockMethodData
+{
+ int numLockModes;
+ const LOCKMASK *conflictTab;
+ const char *const *lockModeNames;
+ const bool *trace_flag;
+} LockMethodData;
+
+typedef const LockMethodData *LockMethod;
+
+/*
+ * Lock methods are identified by LOCKMETHODID. (Despite the declaration as
+ * uint16, we are constrained to 256 lockmethods by the layout of LOCKTAG.)
+ */
+typedef uint16 LOCKMETHODID;
+
+/* These identify the known lock methods */
+#define DEFAULT_LOCKMETHOD 1
+#define USER_LOCKMETHOD 2
+
+/*
+ * LOCKTAG is the key information needed to look up a LOCK item in the
+ * lock hashtable. A LOCKTAG value uniquely identifies a lockable object.
+ *
+ * The LockTagType enum defines the different kinds of objects we can lock.
+ * We can handle up to 256 different LockTagTypes.
+ */
+typedef enum LockTagType
+{
+ LOCKTAG_RELATION, /* whole relation */
+ LOCKTAG_RELATION_EXTEND, /* the right to extend a relation */
+ LOCKTAG_DATABASE_FROZEN_IDS, /* pg_database.datfrozenxid */
+ LOCKTAG_PAGE, /* one page of a relation */
+ LOCKTAG_TUPLE, /* one physical tuple */
+ LOCKTAG_TRANSACTION, /* transaction (for waiting for xact done) */
+ LOCKTAG_VIRTUALTRANSACTION, /* virtual transaction (ditto) */
+ LOCKTAG_SPECULATIVE_TOKEN, /* speculative insertion Xid and token */
+ LOCKTAG_OBJECT, /* non-relation database object */
+ LOCKTAG_USERLOCK, /* reserved for old contrib/userlock code */
+ LOCKTAG_ADVISORY /* advisory user locks */
+} LockTagType;
+
+#define LOCKTAG_LAST_TYPE LOCKTAG_ADVISORY
+
+extern PGDLLIMPORT const char *const LockTagTypeNames[];
+
+/*
+ * The LOCKTAG struct is defined with malice aforethought to fit into 16
+ * bytes with no padding. Note that this would need adjustment if we were
+ * to widen Oid, BlockNumber, or TransactionId to more than 32 bits.
+ *
+ * We include lockmethodid in the locktag so that a single hash table in
+ * shared memory can store locks of different lockmethods.
+ */
+typedef struct LOCKTAG
+{
+ uint32 locktag_field1; /* a 32-bit ID field */
+ uint32 locktag_field2; /* a 32-bit ID field */
+ uint32 locktag_field3; /* a 32-bit ID field */
+ uint16 locktag_field4; /* a 16-bit ID field */
+ uint8 locktag_type; /* see enum LockTagType */
+ uint8 locktag_lockmethodid; /* lockmethod indicator */
+} LOCKTAG;
+
+/*
+ * These macros define how we map logical IDs of lockable objects into
+ * the physical fields of LOCKTAG. Use these to set up LOCKTAG values,
+ * rather than accessing the fields directly. Note multiple eval of target!
+ */
+
+/* ID info for a relation is DB OID + REL OID; DB OID = 0 if shared */
+#define SET_LOCKTAG_RELATION(locktag,dboid,reloid) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = (reloid), \
+ (locktag).locktag_field3 = 0, \
+ (locktag).locktag_field4 = 0, \
+ (locktag).locktag_type = LOCKTAG_RELATION, \
+ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
+/* same ID info as RELATION */
+#define SET_LOCKTAG_RELATION_EXTEND(locktag,dboid,reloid) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = (reloid), \
+ (locktag).locktag_field3 = 0, \
+ (locktag).locktag_field4 = 0, \
+ (locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \
+ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
+/* ID info for frozen IDs is DB OID */
+#define SET_LOCKTAG_DATABASE_FROZEN_IDS(locktag,dboid) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = 0, \
+ (locktag).locktag_field3 = 0, \
+ (locktag).locktag_field4 = 0, \
+ (locktag).locktag_type = LOCKTAG_DATABASE_FROZEN_IDS, \
+ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
+/* ID info for a page is RELATION info + BlockNumber */
+#define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = (reloid), \
+ (locktag).locktag_field3 = (blocknum), \
+ (locktag).locktag_field4 = 0, \
+ (locktag).locktag_type = LOCKTAG_PAGE, \
+ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
+/* ID info for a tuple is PAGE info + OffsetNumber */
+#define SET_LOCKTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = (reloid), \
+ (locktag).locktag_field3 = (blocknum), \
+ (locktag).locktag_field4 = (offnum), \
+ (locktag).locktag_type = LOCKTAG_TUPLE, \
+ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
+/* ID info for a transaction is its TransactionId */
+#define SET_LOCKTAG_TRANSACTION(locktag,xid) \
+ ((locktag).locktag_field1 = (xid), \
+ (locktag).locktag_field2 = 0, \
+ (locktag).locktag_field3 = 0, \
+ (locktag).locktag_field4 = 0, \
+ (locktag).locktag_type = LOCKTAG_TRANSACTION, \
+ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
+/* ID info for a virtual transaction is its VirtualTransactionId */
+#define SET_LOCKTAG_VIRTUALTRANSACTION(locktag,vxid) \
+ ((locktag).locktag_field1 = (vxid).backendId, \
+ (locktag).locktag_field2 = (vxid).localTransactionId, \
+ (locktag).locktag_field3 = 0, \
+ (locktag).locktag_field4 = 0, \
+ (locktag).locktag_type = LOCKTAG_VIRTUALTRANSACTION, \
+ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
+/*
+ * ID info for a speculative insert is TRANSACTION info +
+ * its speculative insert counter.
+ */
+#define SET_LOCKTAG_SPECULATIVE_INSERTION(locktag,xid,token) \
+ ((locktag).locktag_field1 = (xid), \
+ (locktag).locktag_field2 = (token), \
+ (locktag).locktag_field3 = 0, \
+ (locktag).locktag_field4 = 0, \
+ (locktag).locktag_type = LOCKTAG_SPECULATIVE_TOKEN, \
+ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
+/*
+ * ID info for an object is DB OID + CLASS OID + OBJECT OID + SUBID
+ *
+ * Note: object ID has same representation as in pg_depend and
+ * pg_description, but notice that we are constraining SUBID to 16 bits.
+ * Also, we use DB OID = 0 for shared objects such as tablespaces.
+ */
+#define SET_LOCKTAG_OBJECT(locktag,dboid,classoid,objoid,objsubid) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = (classoid), \
+ (locktag).locktag_field3 = (objoid), \
+ (locktag).locktag_field4 = (objsubid), \
+ (locktag).locktag_type = LOCKTAG_OBJECT, \
+ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
+#define SET_LOCKTAG_ADVISORY(locktag,id1,id2,id3,id4) \
+ ((locktag).locktag_field1 = (id1), \
+ (locktag).locktag_field2 = (id2), \
+ (locktag).locktag_field3 = (id3), \
+ (locktag).locktag_field4 = (id4), \
+ (locktag).locktag_type = LOCKTAG_ADVISORY, \
+ (locktag).locktag_lockmethodid = USER_LOCKMETHOD)
+
+
+/*
+ * Per-locked-object lock information:
+ *
+ * tag -- uniquely identifies the object being locked
+ * grantMask -- bitmask for all lock types currently granted on this object.
+ * waitMask -- bitmask for all lock types currently awaited on this object.
+ * procLocks -- list of PROCLOCK objects for this lock.
+ * waitProcs -- queue of processes waiting for this lock.
+ * requested -- count of each lock type currently requested on the lock
+ * (includes requests already granted!!).
+ * nRequested -- total requested locks of all types.
+ * granted -- count of each lock type currently granted on the lock.
+ * nGranted -- total granted locks of all types.
+ *
+ * Note: these counts count 1 for each backend. Internally to a backend,
+ * there may be multiple grabs on a particular lock, but this is not reflected
+ * into shared memory.
+ */
+typedef struct LOCK
+{
+ /* hash key */
+ LOCKTAG tag; /* unique identifier of lockable object */
+
+ /* data */
+ LOCKMASK grantMask; /* bitmask for lock types already granted */
+ LOCKMASK waitMask; /* bitmask for lock types awaited */
+ SHM_QUEUE procLocks; /* list of PROCLOCK objects assoc. with lock */
+ PROC_QUEUE waitProcs; /* list of PGPROC objects waiting on lock */
+ int requested[MAX_LOCKMODES]; /* counts of requested locks */
+ int nRequested; /* total of requested[] array */
+ int granted[MAX_LOCKMODES]; /* counts of granted locks */
+ int nGranted; /* total of granted[] array */
+} LOCK;
+
+#define LOCK_LOCKMETHOD(lock) ((LOCKMETHODID) (lock).tag.locktag_lockmethodid)
+#define LOCK_LOCKTAG(lock) ((LockTagType) (lock).tag.locktag_type)
+
+
+/*
+ * We may have several different backends holding or awaiting locks
+ * on the same lockable object. We need to store some per-holder/waiter
+ * information for each such holder (or would-be holder). This is kept in
+ * a PROCLOCK struct.
+ *
+ * PROCLOCKTAG is the key information needed to look up a PROCLOCK item in the
+ * proclock hashtable. A PROCLOCKTAG value uniquely identifies the combination
+ * of a lockable object and a holder/waiter for that object. (We can use
+ * pointers here because the PROCLOCKTAG need only be unique for the lifespan
+ * of the PROCLOCK, and it will never outlive the lock or the proc.)
+ *
+ * Internally to a backend, it is possible for the same lock to be held
+ * for different purposes: the backend tracks transaction locks separately
+ * from session locks. However, this is not reflected in the shared-memory
+ * state: we only track which backend(s) hold the lock. This is OK since a
+ * backend can never block itself.
+ *
+ * The holdMask field shows the already-granted locks represented by this
+ * proclock. Note that there will be a proclock object, possibly with
+ * zero holdMask, for any lock that the process is currently waiting on.
+ * Otherwise, proclock objects whose holdMasks are zero are recycled
+ * as soon as convenient.
+ *
+ * releaseMask is workspace for LockReleaseAll(): it shows the locks due
+ * to be released during the current call. This must only be examined or
+ * set by the backend owning the PROCLOCK.
+ *
+ * Each PROCLOCK object is linked into lists for both the associated LOCK
+ * object and the owning PGPROC object. Note that the PROCLOCK is entered
+ * into these lists as soon as it is created, even if no lock has yet been
+ * granted. A PGPROC that is waiting for a lock to be granted will also be
+ * linked into the lock's waitProcs queue.
+ */
+typedef struct PROCLOCKTAG
+{
+ /* NB: we assume this struct contains no padding! */
+ LOCK *myLock; /* link to per-lockable-object information */
+ PGPROC *myProc; /* link to PGPROC of owning backend */
+} PROCLOCKTAG;
+
+typedef struct PROCLOCK
+{
+ /* tag */
+ PROCLOCKTAG tag; /* unique identifier of proclock object */
+
+ /* data */
+ PGPROC *groupLeader; /* proc's lock group leader, or proc itself */
+ LOCKMASK holdMask; /* bitmask for lock types currently held */
+ LOCKMASK releaseMask; /* bitmask for lock types to be released */
+ SHM_QUEUE lockLink; /* list link in LOCK's list of proclocks */
+ SHM_QUEUE procLink; /* list link in PGPROC's list of proclocks */
+} PROCLOCK;
+
+#define PROCLOCK_LOCKMETHOD(proclock) \
+ LOCK_LOCKMETHOD(*((proclock).tag.myLock))
+
+/*
+ * Each backend also maintains a local hash table with information about each
+ * lock it is currently interested in. In particular the local table counts
+ * the number of times that lock has been acquired. This allows multiple
+ * requests for the same lock to be executed without additional accesses to
+ * shared memory. We also track the number of lock acquisitions per
+ * ResourceOwner, so that we can release just those locks belonging to a
+ * particular ResourceOwner.
+ *
+ * When holding a lock taken "normally", the lock and proclock fields always
+ * point to the associated objects in shared memory. However, if we acquired
+ * the lock via the fast-path mechanism, the lock and proclock fields are set
+ * to NULL, since there probably aren't any such objects in shared memory.
+ * (If the lock later gets promoted to normal representation, we may eventually
+ * update our locallock's lock/proclock fields after finding the shared
+ * objects.)
+ *
+ * Caution: a locallock object can be left over from a failed lock acquisition
+ * attempt. In this case its lock/proclock fields are untrustworthy, since
+ * the shared lock object is neither held nor awaited, and hence is available
+ * to be reclaimed. If nLocks > 0 then these pointers must either be valid or
+ * NULL, but when nLocks == 0 they should be considered garbage.
+ */
+typedef struct LOCALLOCKTAG
+{
+ LOCKTAG lock; /* identifies the lockable object */
+ LOCKMODE mode; /* lock mode for this table entry */
+} LOCALLOCKTAG;
+
+typedef struct LOCALLOCKOWNER
+{
+ /*
+ * Note: if owner is NULL then the lock is held on behalf of the session;
+ * otherwise it is held on behalf of my current transaction.
+ *
+ * Must use a forward struct reference to avoid circularity.
+ */
+ struct ResourceOwnerData *owner;
+ int64 nLocks; /* # of times held by this owner */
+} LOCALLOCKOWNER;
+
+typedef struct LOCALLOCK
+{
+ /* tag */
+ LOCALLOCKTAG tag; /* unique identifier of locallock entry */
+
+ /* data */
+ uint32 hashcode; /* copy of LOCKTAG's hash value */
+ LOCK *lock; /* associated LOCK object, if any */
+ PROCLOCK *proclock; /* associated PROCLOCK object, if any */
+ int64 nLocks; /* total number of times lock is held */
+ int numLockOwners; /* # of relevant ResourceOwners */
+ int maxLockOwners; /* allocated size of array */
+ LOCALLOCKOWNER *lockOwners; /* dynamically resizable array */
+ bool holdsStrongLockCount; /* bumped FastPathStrongRelationLocks */
+ bool lockCleared; /* we read all sinval msgs for lock */
+} LOCALLOCK;
+
+#define LOCALLOCK_LOCKMETHOD(llock) ((llock).tag.lock.locktag_lockmethodid)
+#define LOCALLOCK_LOCKTAG(llock) ((LockTagType) (llock).tag.lock.locktag_type)
+
+
+/*
+ * These structures hold information passed from lmgr internals to the lock
+ * listing user-level functions (in lockfuncs.c).
+ */
+
+typedef struct LockInstanceData
+{
+ LOCKTAG locktag; /* tag for locked object */
+ LOCKMASK holdMask; /* locks held by this PGPROC */
+ LOCKMODE waitLockMode; /* lock awaited by this PGPROC, if any */
+ BackendId backend; /* backend ID of this PGPROC */
+ LocalTransactionId lxid; /* local transaction ID of this PGPROC */
+ TimestampTz waitStart; /* time at which this PGPROC started waiting
+ * for lock */
+ int pid; /* pid of this PGPROC */
+ int leaderPid; /* pid of group leader; = pid if no group */
+ bool fastpath; /* taken via fastpath? */
+} LockInstanceData;
+
+typedef struct LockData
+{
+ int nelements; /* The length of the array */
+ LockInstanceData *locks; /* Array of per-PROCLOCK information */
+} LockData;
+
+typedef struct BlockedProcData
+{
+ int pid; /* pid of a blocked PGPROC */
+ /* Per-PROCLOCK information about PROCLOCKs of the lock the pid awaits */
+ /* (these fields refer to indexes in BlockedProcsData.locks[]) */
+ int first_lock; /* index of first relevant LockInstanceData */
+ int num_locks; /* number of relevant LockInstanceDatas */
+ /* PIDs of PGPROCs that are ahead of "pid" in the lock's wait queue */
+ /* (these fields refer to indexes in BlockedProcsData.waiter_pids[]) */
+ int first_waiter; /* index of first preceding waiter */
+ int num_waiters; /* number of preceding waiters */
+} BlockedProcData;
+
+typedef struct BlockedProcsData
+{
+ BlockedProcData *procs; /* Array of per-blocked-proc information */
+ LockInstanceData *locks; /* Array of per-PROCLOCK information */
+ int *waiter_pids; /* Array of PIDs of other blocked PGPROCs */
+ int nprocs; /* # of valid entries in procs[] array */
+ int maxprocs; /* Allocated length of procs[] array */
+ int nlocks; /* # of valid entries in locks[] array */
+ int maxlocks; /* Allocated length of locks[] array */
+ int npids; /* # of valid entries in waiter_pids[] array */
+ int maxpids; /* Allocated length of waiter_pids[] array */
+} BlockedProcsData;
+
+
+/* Result codes for LockAcquire() */
+typedef enum
+{
+ LOCKACQUIRE_NOT_AVAIL, /* lock not available, and dontWait=true */
+ LOCKACQUIRE_OK, /* lock successfully acquired */
+ LOCKACQUIRE_ALREADY_HELD, /* incremented count for lock already held */
+ LOCKACQUIRE_ALREADY_CLEAR /* incremented count for lock already clear */
+} LockAcquireResult;
+
+/* Deadlock states identified by DeadLockCheck() */
+typedef enum
+{
+ DS_NOT_YET_CHECKED, /* no deadlock check has run yet */
+ DS_NO_DEADLOCK, /* no deadlock detected */
+ DS_SOFT_DEADLOCK, /* deadlock avoided by queue rearrangement */
+ DS_HARD_DEADLOCK, /* deadlock, no way out but ERROR */
+ DS_BLOCKED_BY_AUTOVACUUM /* no deadlock; queue blocked by autovacuum
+ * worker */
+} DeadLockState;
+
+/*
+ * The lockmgr's shared hash tables are partitioned to reduce contention.
+ * To determine which partition a given locktag belongs to, compute the tag's
+ * hash code with LockTagHashCode(), then apply one of these macros.
+ * NB: NUM_LOCK_PARTITIONS must be a power of 2!
+ */
+#define LockHashPartition(hashcode) \
+ ((hashcode) % NUM_LOCK_PARTITIONS)
+#define LockHashPartitionLock(hashcode) \
+ (&MainLWLockArray[LOCK_MANAGER_LWLOCK_OFFSET + \
+ LockHashPartition(hashcode)].lock)
+#define LockHashPartitionLockByIndex(i) \
+ (&MainLWLockArray[LOCK_MANAGER_LWLOCK_OFFSET + (i)].lock)
+
+/*
+ * The deadlock detector needs to be able to access lockGroupLeader and
+ * related fields in the PGPROC, so we arrange for those fields to be protected
+ * by one of the lock hash partition locks. Since the deadlock detector
+ * acquires all such locks anyway, this makes it safe for it to access these
+ * fields without doing anything extra. To avoid contention as much as
+ * possible, we map different PGPROCs to different partition locks. The lock
+ * used for a given lock group is determined by the group leader's pgprocno.
+ */
+#define LockHashPartitionLockByProc(leader_pgproc) \
+ LockHashPartitionLock((leader_pgproc)->pgprocno)
+
+/*
+ * function prototypes
+ */
+extern void InitLocks(void);
+extern LockMethod GetLocksMethodTable(const LOCK *lock);
+extern LockMethod GetLockTagsMethodTable(const LOCKTAG *locktag);
+extern uint32 LockTagHashCode(const LOCKTAG *locktag);
+extern bool DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2);
+extern LockAcquireResult LockAcquire(const LOCKTAG *locktag,
+ LOCKMODE lockmode,
+ bool sessionLock,
+ bool dontWait);
+extern LockAcquireResult LockAcquireExtended(const LOCKTAG *locktag,
+ LOCKMODE lockmode,
+ bool sessionLock,
+ bool dontWait,
+ bool reportMemoryError,
+ LOCALLOCK **locallockp);
+extern void AbortStrongLockAcquire(void);
+extern void MarkLockClear(LOCALLOCK *locallock);
+extern bool LockRelease(const LOCKTAG *locktag,
+ LOCKMODE lockmode, bool sessionLock);
+extern void LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks);
+extern void LockReleaseSession(LOCKMETHODID lockmethodid);
+extern void LockReleaseCurrentOwner(LOCALLOCK **locallocks, int nlocks);
+extern void LockReassignCurrentOwner(LOCALLOCK **locallocks, int nlocks);
+extern bool LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode);
+#ifdef USE_ASSERT_CHECKING
+extern HTAB *GetLockMethodLocalHash(void);
+#endif
+extern bool LockHasWaiters(const LOCKTAG *locktag,
+ LOCKMODE lockmode, bool sessionLock);
+extern VirtualTransactionId *GetLockConflicts(const LOCKTAG *locktag,
+ LOCKMODE lockmode, int *countp);
+extern void AtPrepare_Locks(void);
+extern void PostPrepare_Locks(TransactionId xid);
+extern bool LockCheckConflicts(LockMethod lockMethodTable,
+ LOCKMODE lockmode,
+ LOCK *lock, PROCLOCK *proclock);
+extern void GrantLock(LOCK *lock, PROCLOCK *proclock, LOCKMODE lockmode);
+extern void GrantAwaitedLock(void);
+extern void RemoveFromWaitQueue(PGPROC *proc, uint32 hashcode);
+extern Size LockShmemSize(void);
+extern LockData *GetLockStatusData(void);
+extern BlockedProcsData *GetBlockerStatusData(int blocked_pid);
+
+extern xl_standby_lock *GetRunningTransactionLocks(int *nlocks);
+extern const char *GetLockmodeName(LOCKMETHODID lockmethodid, LOCKMODE mode);
+
+extern void lock_twophase_recover(TransactionId xid, uint16 info,
+ void *recdata, uint32 len);
+extern void lock_twophase_postcommit(TransactionId xid, uint16 info,
+ void *recdata, uint32 len);
+extern void lock_twophase_postabort(TransactionId xid, uint16 info,
+ void *recdata, uint32 len);
+extern void lock_twophase_standby_recover(TransactionId xid, uint16 info,
+ void *recdata, uint32 len);
+
+extern DeadLockState DeadLockCheck(PGPROC *proc);
+extern PGPROC *GetBlockingAutoVacuumPgproc(void);
+extern void DeadLockReport(void) pg_attribute_noreturn();
+extern void RememberSimpleDeadLock(PGPROC *proc1,
+ LOCKMODE lockmode,
+ LOCK *lock,
+ PGPROC *proc2);
+extern void InitDeadLockChecking(void);
+
+extern int LockWaiterCount(const LOCKTAG *locktag);
+
+#ifdef LOCK_DEBUG
+extern void DumpLocks(PGPROC *proc);
+extern void DumpAllLocks(void);
+#endif
+
+/* Lock a VXID (used to wait for a transaction to finish) */
+extern void VirtualXactLockTableInsert(VirtualTransactionId vxid);
+extern void VirtualXactLockTableCleanup(void);
+extern bool VirtualXactLock(VirtualTransactionId vxid, bool wait);
+
+#endif /* LOCK_H_ */
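To make the LOCKTAG machinery concrete, here is a hedged sketch of acquiring and releasing a heavyweight lock on an arbitrary tag, in the style the advisory-lock functions use; the key values and the function name are made up for illustration.

#include "postgres.h"
#include "miscadmin.h"          /* MyDatabaseId */
#include "storage/lock.h"

/* sessionLock = false ties the lock to the current transaction;
 * dontWait = false means block until the lock is granted. */
static void
grab_made_up_advisory_lock(void)
{
    LOCKTAG     tag;

    SET_LOCKTAG_ADVISORY(tag, MyDatabaseId, 1234, 5678, 1);

    (void) LockAcquire(&tag, ExclusiveLock, false, false);

    /* ... critical section ... */

    (void) LockRelease(&tag, ExclusiveLock, false);
}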
diff --git a/src/include/storage/lockdefs.h b/src/include/storage/lockdefs.h
new file mode 100644
index 0000000..350ddd4
--- /dev/null
+++ b/src/include/storage/lockdefs.h
@@ -0,0 +1,59 @@
+/*-------------------------------------------------------------------------
+ *
+ * lockdefs.h
+ * Frontend exposed parts of postgres' low level lock mechanism
+ *
+ * The split between lockdefs.h and lock.h is not very principled. This file
+ * contains definitions that have to (indirectly) be available when included by
+ * FRONTEND code.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/lockdefs.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LOCKDEFS_H_
+#define LOCKDEFS_H_
+
+/*
+ * LOCKMODE is an integer (1..N) indicating a lock type. LOCKMASK is a bit
+ * mask indicating a set of held or requested lock types (the bit 1<<mode
+ * corresponds to a particular lock mode).
+ */
+typedef int LOCKMASK;
+typedef int LOCKMODE;
+
+/*
+ * These are the valid values of type LOCKMODE for all the standard lock
+ * methods (both DEFAULT and USER).
+ */
+
+/* NoLock is not a lock mode, but a flag value meaning "don't get a lock" */
+#define NoLock 0
+
+#define AccessShareLock 1 /* SELECT */
+#define RowShareLock 2 /* SELECT FOR UPDATE/FOR SHARE */
+#define RowExclusiveLock 3 /* INSERT, UPDATE, DELETE */
+#define ShareUpdateExclusiveLock 4 /* VACUUM (non-FULL), ANALYZE, CREATE INDEX
+ * CONCURRENTLY */
+#define ShareLock 5 /* CREATE INDEX (WITHOUT CONCURRENTLY) */
+#define ShareRowExclusiveLock 6 /* like EXCLUSIVE MODE, but allows ROW
+ * SHARE */
+#define ExclusiveLock 7 /* blocks ROW SHARE/SELECT...FOR UPDATE */
+#define AccessExclusiveLock 8 /* ALTER TABLE, DROP TABLE, VACUUM FULL,
+ * and unqualified LOCK TABLE */
+
+#define MaxLockMode 8 /* highest standard lock mode */
+
+
+/* WAL representation of an AccessExclusiveLock on a table */
+typedef struct xl_standby_lock
+{
+ TransactionId xid; /* xid of holder of AccessExclusiveLock */
+ Oid dbOid; /* DB containing table */
+ Oid relOid; /* OID of table */
+} xl_standby_lock;
+
+#endif /* LOCKDEFS_H_ */
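As the comment above notes, a LOCKMASK is simply a bitmap of lock modes, the bit for a mode being 1 << mode (lock.h wraps this as LOCKBIT_ON()). A hedged sketch with a hypothetical helper:

#include "postgres.h"
#include "storage/lockdefs.h"

static bool
mask_includes_row_exclusive(LOCKMASK holdMask)
{
    return (holdMask & (1 << RowExclusiveLock)) != 0;
}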
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
new file mode 100644
index 0000000..e03d317
--- /dev/null
+++ b/src/include/storage/lwlock.h
@@ -0,0 +1,206 @@
+/*-------------------------------------------------------------------------
+ *
+ * lwlock.h
+ * Lightweight lock manager
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/lwlock.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LWLOCK_H
+#define LWLOCK_H
+
+#ifdef FRONTEND
+#error "lwlock.h may not be included from frontend code"
+#endif
+
+#include "port/atomics.h"
+#include "storage/proclist_types.h"
+
+struct PGPROC;
+
+/*
+ * Code outside of lwlock.c should not manipulate the contents of this
+ * structure directly, but we have to declare it here to allow LWLocks to be
+ * incorporated into other data structures.
+ */
+typedef struct LWLock
+{
+ uint16 tranche; /* tranche ID */
+ pg_atomic_uint32 state; /* state of exclusive/nonexclusive lockers */
+ proclist_head waiters; /* list of waiting PGPROCs */
+#ifdef LOCK_DEBUG
+ pg_atomic_uint32 nwaiters; /* number of waiters */
+ struct PGPROC *owner; /* last exclusive owner of the lock */
+#endif
+} LWLock;
+
+/*
+ * In most cases, it's desirable to force each tranche of LWLocks to be aligned
+ * on a cache line boundary and make the array stride a power of 2. This saves
+ * a few cycles in indexing, but more importantly ensures that individual
+ * LWLocks don't cross cache line boundaries. This reduces cache contention
+ * problems, especially on AMD Opterons. In some cases, it's useful to add
+ * even more padding so that each LWLock takes up an entire cache line; this is
+ * useful, for example, in the main LWLock array, where the overall number of
+ * locks is small but some are heavily contended.
+ */
+#define LWLOCK_PADDED_SIZE PG_CACHE_LINE_SIZE
+
+/* LWLock, padded to a full cache line size */
+typedef union LWLockPadded
+{
+ LWLock lock;
+ char pad[LWLOCK_PADDED_SIZE];
+} LWLockPadded;
+
+extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
+
+/* struct for storing named tranche information */
+typedef struct NamedLWLockTranche
+{
+ int trancheId;
+ char *trancheName;
+} NamedLWLockTranche;
+
+extern PGDLLIMPORT NamedLWLockTranche *NamedLWLockTrancheArray;
+extern PGDLLIMPORT int NamedLWLockTrancheRequests;
+
+/* Names for fixed lwlocks */
+#include "storage/lwlocknames.h"
+
+/*
+ * It's a bit odd to declare NUM_BUFFER_PARTITIONS and NUM_LOCK_PARTITIONS
+ * here, but we need them to figure out offsets within MainLWLockArray, and
+ * having this file include lock.h or bufmgr.h would be backwards.
+ */
+
+/* Number of partitions of the shared buffer mapping hashtable */
+#define NUM_BUFFER_PARTITIONS 128
+
+/* Number of partitions the shared lock tables are divided into */
+#define LOG2_NUM_LOCK_PARTITIONS 4
+#define NUM_LOCK_PARTITIONS (1 << LOG2_NUM_LOCK_PARTITIONS)
+
+/* Number of partitions the shared predicate lock tables are divided into */
+#define LOG2_NUM_PREDICATELOCK_PARTITIONS 4
+#define NUM_PREDICATELOCK_PARTITIONS (1 << LOG2_NUM_PREDICATELOCK_PARTITIONS)
+
+/* Offsets for various chunks of preallocated lwlocks. */
+#define BUFFER_MAPPING_LWLOCK_OFFSET NUM_INDIVIDUAL_LWLOCKS
+#define LOCK_MANAGER_LWLOCK_OFFSET \
+ (BUFFER_MAPPING_LWLOCK_OFFSET + NUM_BUFFER_PARTITIONS)
+#define PREDICATELOCK_MANAGER_LWLOCK_OFFSET \
+ (LOCK_MANAGER_LWLOCK_OFFSET + NUM_LOCK_PARTITIONS)
+#define NUM_FIXED_LWLOCKS \
+ (PREDICATELOCK_MANAGER_LWLOCK_OFFSET + NUM_PREDICATELOCK_PARTITIONS)
+
+typedef enum LWLockMode
+{
+ LW_EXCLUSIVE,
+ LW_SHARED,
+ LW_WAIT_UNTIL_FREE /* A special mode used in PGPROC->lwWaitMode,
+ * when waiting for lock to become free. Not
+ * to be used as LWLockAcquire argument */
+} LWLockMode;
+
+
+#ifdef LOCK_DEBUG
+extern PGDLLIMPORT bool Trace_lwlocks;
+#endif
+
+extern bool LWLockAcquire(LWLock *lock, LWLockMode mode);
+extern bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode);
+extern bool LWLockAcquireOrWait(LWLock *lock, LWLockMode mode);
+extern void LWLockRelease(LWLock *lock);
+extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val);
+extern void LWLockReleaseAll(void);
+extern bool LWLockHeldByMe(LWLock *lock);
+extern bool LWLockAnyHeldByMe(LWLock *lock, int nlocks, size_t stride);
+extern bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode);
+
+extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval);
+extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value);
+
+extern Size LWLockShmemSize(void);
+extern void CreateLWLocks(void);
+extern void InitLWLockAccess(void);
+
+extern const char *GetLWLockIdentifier(uint32 classId, uint16 eventId);
+
+/*
+ * Extensions (or core code) can obtain LWLocks by calling
+ * RequestNamedLWLockTranche() during postmaster startup. Subsequently,
+ * call GetNamedLWLockTranche() to obtain a pointer to an array containing
+ * the requested number of LWLocks.
+ */
+extern void RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks);
+extern LWLockPadded *GetNamedLWLockTranche(const char *tranche_name);
+
+/*
+ * There is another, more flexible method of obtaining lwlocks. First, call
+ * LWLockNewTrancheId just once to obtain a tranche ID; this allocates from
+ * a shared counter. Next, each individual process using the tranche should
+ * call LWLockRegisterTranche() to associate that tranche ID with a name.
+ * Finally, LWLockInitialize should be called just once per lwlock, passing
+ * the tranche ID as an argument.
+ *
+ * It may seem strange that each process using the tranche must register it
+ * separately, but dynamic shared memory segments aren't guaranteed to be
+ * mapped at the same address in all coordinating backends, so storing the
+ * registration in the main shared memory segment wouldn't work for that case.
+ */
+extern int LWLockNewTrancheId(void);
+extern void LWLockRegisterTranche(int tranche_id, const char *tranche_name);
+extern void LWLockInitialize(LWLock *lock, int tranche_id);
+
+/*
+ * Every tranche ID less than NUM_INDIVIDUAL_LWLOCKS is reserved; also,
+ * we reserve additional tranche IDs for builtin tranches not included in
+ * the set of individual LWLocks. A call to LWLockNewTrancheId will never
+ * return a value less than LWTRANCHE_FIRST_USER_DEFINED.
+ */
+typedef enum BuiltinTrancheIds
+{
+ LWTRANCHE_XACT_BUFFER = NUM_INDIVIDUAL_LWLOCKS,
+ LWTRANCHE_COMMITTS_BUFFER,
+ LWTRANCHE_SUBTRANS_BUFFER,
+ LWTRANCHE_MULTIXACTOFFSET_BUFFER,
+ LWTRANCHE_MULTIXACTMEMBER_BUFFER,
+ LWTRANCHE_NOTIFY_BUFFER,
+ LWTRANCHE_SERIAL_BUFFER,
+ LWTRANCHE_WAL_INSERT,
+ LWTRANCHE_BUFFER_CONTENT,
+ LWTRANCHE_REPLICATION_ORIGIN_STATE,
+ LWTRANCHE_REPLICATION_SLOT_IO,
+ LWTRANCHE_LOCK_FASTPATH,
+ LWTRANCHE_BUFFER_MAPPING,
+ LWTRANCHE_LOCK_MANAGER,
+ LWTRANCHE_PREDICATE_LOCK_MANAGER,
+ LWTRANCHE_PARALLEL_HASH_JOIN,
+ LWTRANCHE_PARALLEL_QUERY_DSA,
+ LWTRANCHE_PER_SESSION_DSA,
+ LWTRANCHE_PER_SESSION_RECORD_TYPE,
+ LWTRANCHE_PER_SESSION_RECORD_TYPMOD,
+ LWTRANCHE_SHARED_TUPLESTORE,
+ LWTRANCHE_SHARED_TIDBITMAP,
+ LWTRANCHE_PARALLEL_APPEND,
+ LWTRANCHE_PER_XACT_PREDICATE_LIST,
+ LWTRANCHE_PGSTATS_DSA,
+ LWTRANCHE_PGSTATS_HASH,
+ LWTRANCHE_PGSTATS_DATA,
+ LWTRANCHE_FIRST_USER_DEFINED
+} BuiltinTrancheIds;
+
+/*
+ * Prior to PostgreSQL 9.4, we used an enum type called LWLockId to refer
+ * to LWLocks. New code should instead use LWLock *. However, for the
+ * convenience of third-party code, we include the following typedef.
+ */
+typedef LWLock *LWLockId;
+
+#endif /* LWLOCK_H */
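A hedged sketch of the named-tranche recipe described above, as an extension might use it. The tranche name and function names are made up, and the first two functions are assumed to be installed as the extension's shmem_request_hook and shmem_startup_hook in _PG_init() (hook registration itself is omitted).

#include "postgres.h"
#include "storage/lwlock.h"

static LWLock *my_lock;

static void
my_ext_shmem_request(void)
{
    RequestNamedLWLockTranche("my_ext", 1);
}

static void
my_ext_shmem_startup(void)
{
    LWLockPadded *tranche = GetNamedLWLockTranche("my_ext");

    my_lock = &tranche[0].lock;
}

static void
my_ext_do_work(void)
{
    LWLockAcquire(my_lock, LW_EXCLUSIVE);
    /* ... touch the shared state this lock protects ... */
    LWLockRelease(my_lock);
}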
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
new file mode 100644
index 0000000..ffffa40
--- /dev/null
+++ b/src/include/storage/md.h
@@ -0,0 +1,52 @@
+/*-------------------------------------------------------------------------
+ *
+ * md.h
+ * magnetic disk storage manager public interface declarations.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/md.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef MD_H
+#define MD_H
+
+#include "storage/block.h"
+#include "storage/relfilenode.h"
+#include "storage/smgr.h"
+#include "storage/sync.h"
+
+/* md storage manager functionality */
+extern void mdinit(void);
+extern void mdopen(SMgrRelation reln);
+extern void mdclose(SMgrRelation reln, ForkNumber forknum);
+extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
+extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
+extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
+extern void mdextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, char *buffer, bool skipFsync);
+extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum);
+extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ char *buffer);
+extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, BlockNumber nblocks);
+extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
+extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber nblocks);
+extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
+
+extern void ForgetDatabaseSyncRequests(Oid dbid);
+extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo);
+
+/* md sync callbacks */
+extern int mdsyncfiletag(const FileTag *ftag, char *path);
+extern int mdunlinkfiletag(const FileTag *ftag, char *path);
+extern bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate);
+
+#endif /* MD_H */
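These md* routines are normally reached through the smgr switch rather than called directly. A hedged sketch of the usual indirection, assuming smgropen() and smgrnblocks() from smgr.h; the helper function is hypothetical.

#include "postgres.h"
#include "storage/smgr.h"

static BlockNumber
main_fork_size(RelFileNode rnode)
{
    SMgrRelation reln = smgropen(rnode, InvalidBackendId);

    /* dispatches to mdnblocks() for regular relations */
    return smgrnblocks(reln, MAIN_FORKNUM);
}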
diff --git a/src/include/storage/off.h b/src/include/storage/off.h
new file mode 100644
index 0000000..e6573ac
--- /dev/null
+++ b/src/include/storage/off.h
@@ -0,0 +1,57 @@
+/*-------------------------------------------------------------------------
+ *
+ * off.h
+ * POSTGRES disk "offset" definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/off.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef OFF_H
+#define OFF_H
+
+#include "storage/itemid.h"
+/*
+ * OffsetNumber:
+ *
+ * this is a 1-based index into the linp (ItemIdData) array in the
+ * header of each disk page.
+ */
+typedef uint16 OffsetNumber;
+
+#define InvalidOffsetNumber ((OffsetNumber) 0)
+#define FirstOffsetNumber ((OffsetNumber) 1)
+#define MaxOffsetNumber ((OffsetNumber) (BLCKSZ / sizeof(ItemIdData)))
+
+/* ----------------
+ * support macros
+ * ----------------
+ */
+
+/*
+ * OffsetNumberIsValid
+ * True iff the offset number is valid.
+ */
+#define OffsetNumberIsValid(offsetNumber) \
+ ((bool) ((offsetNumber != InvalidOffsetNumber) && \
+ (offsetNumber <= MaxOffsetNumber)))
+
+/*
+ * OffsetNumberNext
+ * OffsetNumberPrev
+ * Increments/decrements the argument. These macros look pointless
+ * but they help us disambiguate the different manipulations on
+ * OffsetNumbers (e.g., sometimes we subtract one from an
+ * OffsetNumber to move back, and sometimes we do so to form a
+ * real C array index).
+ */
+#define OffsetNumberNext(offsetNumber) \
+ ((OffsetNumber) (1 + (offsetNumber)))
+#define OffsetNumberPrev(offsetNumber) \
+ ((OffsetNumber) (-1 + (offsetNumber)))
+
+#endif /* OFF_H */
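The canonical use of these macros is walking the line pointers of a page, roughly as in this hedged sketch; PageGetMaxOffsetNumber() and PageGetItemId() come from bufpage.h, and the helper function is hypothetical.

#include "postgres.h"
#include "storage/bufpage.h"

static void
scan_page(Page page)
{
    OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
    OffsetNumber offnum;

    for (offnum = FirstOffsetNumber;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemid = PageGetItemId(page, offnum);

        (void) itemid;      /* ... examine PageGetItem(page, itemid) ... */
    }
}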
diff --git a/src/include/storage/pg_sema.h b/src/include/storage/pg_sema.h
new file mode 100644
index 0000000..5ca941a
--- /dev/null
+++ b/src/include/storage/pg_sema.h
@@ -0,0 +1,61 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_sema.h
+ * Platform-independent API for semaphores.
+ *
+ * PostgreSQL requires counting semaphores (the kind that keep track of
+ * multiple unlock operations, and will allow an equal number of subsequent
+ * lock operations before blocking). The underlying implementation is
+ * not the same on every platform. This file defines the API that must
+ * be provided by each port.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/pg_sema.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_SEMA_H
+#define PG_SEMA_H
+
+/*
+ * struct PGSemaphoreData and pointer type PGSemaphore are the data structure
+ * representing an individual semaphore. The contents of PGSemaphoreData vary
+ * across implementations and must never be touched by platform-independent
+ * code; hence, PGSemaphoreData is declared as an opaque struct here.
+ *
+ * However, Windows is sufficiently unlike our other ports that it doesn't
+ * seem worth insisting on ABI compatibility for Windows too. Hence, on
+ * that platform just define PGSemaphore as HANDLE.
+ */
+#ifndef USE_WIN32_SEMAPHORES
+typedef struct PGSemaphoreData *PGSemaphore;
+#else
+typedef HANDLE PGSemaphore;
+#endif
+
+
+/* Report amount of shared memory needed */
+extern Size PGSemaphoreShmemSize(int maxSemas);
+
+/* Module initialization (called during postmaster start or shmem reinit) */
+extern void PGReserveSemaphores(int maxSemas);
+
+/* Allocate a PGSemaphore structure with initial count 1 */
+extern PGSemaphore PGSemaphoreCreate(void);
+
+/* Reset a previously-initialized PGSemaphore to have count 0 */
+extern void PGSemaphoreReset(PGSemaphore sema);
+
+/* Lock a semaphore (decrement count), blocking if count would be < 0 */
+extern void PGSemaphoreLock(PGSemaphore sema);
+
+/* Unlock a semaphore (increment count) */
+extern void PGSemaphoreUnlock(PGSemaphore sema);
+
+/* Lock a semaphore only if able to do so without blocking */
+extern bool PGSemaphoreTryLock(PGSemaphore sema);
+
+#endif /* PG_SEMA_H */
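A minimal sketch of the semaphore lifecycle implied by this API. In the backend these calls are made during shared-memory setup, not from arbitrary code; the wrapper function is hypothetical.

#include "postgres.h"
#include "storage/pg_sema.h"

static void
semaphore_sketch(void)
{
    PGSemaphore sem = PGSemaphoreCreate();  /* initial count 1 */

    PGSemaphoreLock(sem);       /* decrement; would block at 0 */
    /* ... protected work ... */
    PGSemaphoreUnlock(sem);     /* increment, waking one waiter if any */
}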
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
new file mode 100644
index 0000000..da5962e
--- /dev/null
+++ b/src/include/storage/pg_shmem.h
@@ -0,0 +1,92 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_shmem.h
+ * Platform-independent API for shared memory support.
+ *
+ * Every port is expected to support shared memory with approximately
+ * SysV-ish semantics; in particular, a memory block is not anonymous
+ * but has an ID, and we must be able to tell whether there are any
+ * remaining processes attached to a block of a specified ID.
+ *
+ * To simplify life for the SysV implementation, the ID is assumed to
+ * consist of two unsigned long values (these are key and ID in SysV
+ * terms). Other platforms may ignore the second value if they need
+ * only one ID number.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/pg_shmem.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_SHMEM_H
+#define PG_SHMEM_H
+
+#include "storage/dsm_impl.h"
+
+typedef struct PGShmemHeader /* standard header for all Postgres shmem */
+{
+ int32 magic; /* magic # to identify Postgres segments */
+#define PGShmemMagic 679834894
+ pid_t creatorPID; /* PID of creating process (set but unread) */
+ Size totalsize; /* total size of segment */
+ Size freeoffset; /* offset to first free space */
+ dsm_handle dsm_control; /* ID of dynamic shared memory control seg */
+ void *index; /* pointer to ShmemIndex table */
+#ifndef WIN32 /* Windows doesn't have useful inode#s */
+ dev_t device; /* device data directory is on */
+ ino_t inode; /* inode number of data directory */
+#endif
+} PGShmemHeader;
+
+/* GUC variables */
+extern PGDLLIMPORT int shared_memory_type;
+extern PGDLLIMPORT int huge_pages;
+extern PGDLLIMPORT int huge_page_size;
+
+/* Possible values for huge_pages */
+typedef enum
+{
+ HUGE_PAGES_OFF,
+ HUGE_PAGES_ON,
+ HUGE_PAGES_TRY
+} HugePagesType;
+
+/* Possible values for shared_memory_type */
+typedef enum
+{
+ SHMEM_TYPE_WINDOWS,
+ SHMEM_TYPE_SYSV,
+ SHMEM_TYPE_MMAP
+} PGShmemType;
+
+#ifndef WIN32
+extern PGDLLIMPORT unsigned long UsedShmemSegID;
+#else
+extern PGDLLIMPORT HANDLE UsedShmemSegID;
+extern PGDLLIMPORT void *ShmemProtectiveRegion;
+#endif
+extern PGDLLIMPORT void *UsedShmemSegAddr;
+
+#if !defined(WIN32) && !defined(EXEC_BACKEND)
+#define DEFAULT_SHARED_MEMORY_TYPE SHMEM_TYPE_MMAP
+#elif !defined(WIN32)
+#define DEFAULT_SHARED_MEMORY_TYPE SHMEM_TYPE_SYSV
+#else
+#define DEFAULT_SHARED_MEMORY_TYPE SHMEM_TYPE_WINDOWS
+#endif
+
+#ifdef EXEC_BACKEND
+extern void PGSharedMemoryReAttach(void);
+extern void PGSharedMemoryNoReAttach(void);
+#endif
+
+extern PGShmemHeader *PGSharedMemoryCreate(Size size,
+ PGShmemHeader **shim);
+extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2);
+extern void PGSharedMemoryDetach(void);
+extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags);
+
+#endif /* PG_SHMEM_H */
diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h
new file mode 100644
index 0000000..58f4ddf
--- /dev/null
+++ b/src/include/storage/pmsignal.h
@@ -0,0 +1,105 @@
+/*-------------------------------------------------------------------------
+ *
+ * pmsignal.h
+ * routines for signaling between the postmaster and its child processes
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/pmsignal.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PMSIGNAL_H
+#define PMSIGNAL_H
+
+#include <signal.h>
+
+#ifdef HAVE_SYS_PRCTL_H
+#include "sys/prctl.h"
+#endif
+
+#ifdef HAVE_SYS_PROCCTL_H
+#include "sys/procctl.h"
+#endif
+
+/*
+ * Reasons for signaling the postmaster. We can cope with simultaneous
+ * signals for different reasons. If the same reason is signaled multiple
+ * times in quick succession, however, the postmaster is likely to observe
+ * only one notification of it. This is okay for the present uses.
+ */
+typedef enum
+{
+ PMSIGNAL_RECOVERY_STARTED, /* recovery has started */
+ PMSIGNAL_BEGIN_HOT_STANDBY, /* begin Hot Standby */
+ PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */
+ PMSIGNAL_START_AUTOVAC_LAUNCHER, /* start an autovacuum launcher */
+ PMSIGNAL_START_AUTOVAC_WORKER, /* start an autovacuum worker */
+ PMSIGNAL_BACKGROUND_WORKER_CHANGE, /* background worker state change */
+ PMSIGNAL_START_WALRECEIVER, /* start a walreceiver */
+ PMSIGNAL_ADVANCE_STATE_MACHINE, /* advance postmaster's state machine */
+
+ NUM_PMSIGNALS /* Must be last value of enum! */
+} PMSignalReason;
+
+/*
+ * Reasons why the postmaster would send SIGQUIT to its children.
+ */
+typedef enum
+{
+ PMQUIT_NOT_SENT = 0, /* postmaster hasn't sent SIGQUIT */
+ PMQUIT_FOR_CRASH, /* some other backend bought the farm */
+ PMQUIT_FOR_STOP /* immediate stop was commanded */
+} QuitSignalReason;
+
+/* PMSignalData is an opaque struct, details known only within pmsignal.c */
+typedef struct PMSignalData PMSignalData;
+
+/*
+ * prototypes for functions in pmsignal.c
+ */
+extern Size PMSignalShmemSize(void);
+extern void PMSignalShmemInit(void);
+extern void SendPostmasterSignal(PMSignalReason reason);
+extern bool CheckPostmasterSignal(PMSignalReason reason);
+extern void SetQuitSignalReason(QuitSignalReason reason);
+extern QuitSignalReason GetQuitSignalReason(void);
+extern int AssignPostmasterChildSlot(void);
+extern bool ReleasePostmasterChildSlot(int slot);
+extern bool IsPostmasterChildWalSender(int slot);
+extern void MarkPostmasterChildActive(void);
+extern void MarkPostmasterChildInactive(void);
+extern void MarkPostmasterChildWalSender(void);
+extern bool PostmasterIsAliveInternal(void);
+extern void PostmasterDeathSignalInit(void);
+
+
+/*
+ * Do we have a way to ask for a signal on parent death?
+ *
+ * If we do, pmsignal.c will set up a signal handler that sets a flag when
+ * the parent dies. Checking the flag first makes PostmasterIsAlive() a lot
+ * cheaper in the usual case that the postmaster is alive.
+ */
+#if (defined(HAVE_SYS_PRCTL_H) && defined(PR_SET_PDEATHSIG)) || \
+ (defined(HAVE_SYS_PROCCTL_H) && defined(PROC_PDEATHSIG_CTL))
+#define USE_POSTMASTER_DEATH_SIGNAL
+#endif
+
+#ifdef USE_POSTMASTER_DEATH_SIGNAL
+extern PGDLLIMPORT volatile sig_atomic_t postmaster_possibly_dead;
+
+static inline bool
+PostmasterIsAlive(void)
+{
+ if (likely(!postmaster_possibly_dead))
+ return true;
+ return PostmasterIsAliveInternal();
+}
+#else
+#define PostmasterIsAlive() PostmasterIsAliveInternal()
+#endif
+
+#endif /* PMSIGNAL_H */
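For orientation, the child-to-postmaster path above is a simple raise-and-poll protocol: a child records a reason and signals the postmaster, whose SIGUSR1 handling later checks and clears it. A hedged sketch with hypothetical helper names:

#include "postgres.h"
#include "storage/pmsignal.h"

static void
child_side(void)
{
    SendPostmasterSignal(PMSIGNAL_START_WALRECEIVER);
}

static void
postmaster_side(void)
{
    if (CheckPostmasterSignal(PMSIGNAL_START_WALRECEIVER))
    {
        /* ... launch a walreceiver ... */
    }
}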
diff --git a/src/include/storage/predicate.h b/src/include/storage/predicate.h
new file mode 100644
index 0000000..8dfcb39
--- /dev/null
+++ b/src/include/storage/predicate.h
@@ -0,0 +1,87 @@
+/*-------------------------------------------------------------------------
+ *
+ * predicate.h
+ * POSTGRES public predicate locking definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/predicate.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PREDICATE_H
+#define PREDICATE_H
+
+#include "storage/lock.h"
+#include "utils/relcache.h"
+#include "utils/snapshot.h"
+
+
+/*
+ * GUC variables
+ */
+extern PGDLLIMPORT int max_predicate_locks_per_xact;
+extern PGDLLIMPORT int max_predicate_locks_per_relation;
+extern PGDLLIMPORT int max_predicate_locks_per_page;
+
+
+/* Number of SLRU buffers to use for Serial SLRU */
+#define NUM_SERIAL_BUFFERS 16
+
+/*
+ * A handle used for sharing SERIALIZABLEXACT objects between the participants
+ * in a parallel query.
+ */
+typedef void *SerializableXactHandle;
+
+/*
+ * function prototypes
+ */
+
+/* housekeeping for shared memory predicate lock structures */
+extern void InitPredicateLocks(void);
+extern Size PredicateLockShmemSize(void);
+
+extern void CheckPointPredicate(void);
+
+/* predicate lock reporting */
+extern bool PageIsPredicateLocked(Relation relation, BlockNumber blkno);
+
+/* predicate lock maintenance */
+extern Snapshot GetSerializableTransactionSnapshot(Snapshot snapshot);
+extern void SetSerializableTransactionSnapshot(Snapshot snapshot,
+ VirtualTransactionId *sourcevxid,
+ int sourcepid);
+extern void RegisterPredicateLockingXid(TransactionId xid);
+extern void PredicateLockRelation(Relation relation, Snapshot snapshot);
+extern void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot);
+extern void PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot,
+ TransactionId insert_xid);
+extern void PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, BlockNumber newblkno);
+extern void PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, BlockNumber newblkno);
+extern void TransferPredicateLocksToHeapRelation(Relation relation);
+extern void ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe);
+
+/* conflict detection (may also trigger rollback) */
+extern bool CheckForSerializableConflictOutNeeded(Relation relation, Snapshot snapshot);
+extern void CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot);
+extern void CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno);
+extern void CheckTableForSerializableConflictIn(Relation relation);
+
+/* final rollback checking */
+extern void PreCommit_CheckForSerializationFailure(void);
+
+/* two-phase commit support */
+extern void AtPrepare_PredicateLocks(void);
+extern void PostPrepare_PredicateLocks(TransactionId xid);
+extern void PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit);
+extern void predicatelock_twophase_recover(TransactionId xid, uint16 info,
+ void *recdata, uint32 len);
+
+/* parallel query support */
+extern SerializableXactHandle ShareSerializableXact(void);
+extern void AttachSerializableXact(SerializableXactHandle handle);
+
+#endif /* PREDICATE_H */
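As a rough illustration, access methods call the maintenance functions above while reading under a serializable snapshot, and call the conflict checks just before writing. A hedged sketch; the helper function and its arguments are hypothetical.

#include "postgres.h"
#include "storage/predicate.h"

static void
predicate_usage_sketch(Relation rel, Snapshot snapshot,
                       ItemPointer tid, BlockNumber blkno)
{
    /* reader side: a sequential scan covers the whole relation up front */
    PredicateLockRelation(rel, snapshot);

    /* writer side: check for rw-conflicts with concurrent readers */
    CheckForSerializableConflictIn(rel, tid, blkno);
}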
diff --git a/src/include/storage/predicate_internals.h b/src/include/storage/predicate_internals.h
new file mode 100644
index 0000000..2416d3c
--- /dev/null
+++ b/src/include/storage/predicate_internals.h
@@ -0,0 +1,494 @@
+/*-------------------------------------------------------------------------
+ *
+ * predicate_internals.h
+ * POSTGRES internal predicate locking definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/predicate_internals.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PREDICATE_INTERNALS_H
+#define PREDICATE_INTERNALS_H
+
+#include "storage/lock.h"
+#include "storage/lwlock.h"
+
+/*
+ * Commit number.
+ */
+typedef uint64 SerCommitSeqNo;
+
+/*
+ * Reserved commit sequence numbers:
+ * - 0 is reserved to indicate a non-existent SLRU entry; it cannot be
+ * used as a SerCommitSeqNo, even an invalid one
+ * - InvalidSerCommitSeqNo is used to indicate a transaction that
+ * hasn't committed yet, so use a number greater than all valid
+ * ones to make comparison do the expected thing
+ * - RecoverySerCommitSeqNo is used to refer to transactions that
+ * happened before a crash/recovery, since we restart the sequence
+ * at that point. It's earlier than all normal sequence numbers,
+ * and is only used by recovered prepared transactions
+ */
+#define InvalidSerCommitSeqNo ((SerCommitSeqNo) PG_UINT64_MAX)
+#define RecoverySerCommitSeqNo ((SerCommitSeqNo) 1)
+#define FirstNormalSerCommitSeqNo ((SerCommitSeqNo) 2)
+
+/*
+ * The SERIALIZABLEXACT struct contains information needed for each
+ * serializable database transaction to support SSI techniques.
+ *
+ * A home-grown list is maintained in shared memory to manage these.
+ * An entry is used when the serializable transaction acquires a snapshot.
+ * Unless the transaction is rolled back, this entry must generally remain
+ * until all concurrent transactions have completed. (There are special
+ * optimizations for READ ONLY transactions which often allow them to be
+ * cleaned up earlier.) A transaction which is rolled back is cleaned up
+ * as soon as possible.
+ *
+ * Eligibility for cleanup of committed transactions is generally determined
+ * by comparing the transaction's finishedBefore field to
+ * SxactGlobalXmin.
+ */
+typedef struct SERIALIZABLEXACT
+{
+ VirtualTransactionId vxid; /* The executing process always has one of
+ * these. */
+
+ /*
+ * We use two numbers to track the order that transactions commit. Before
+ * commit, a transaction is marked as prepared, and prepareSeqNo is set.
+ * Shortly after commit, it's marked as committed, and commitSeqNo is set.
+ * This doesn't give a strict commit order, but these two values together
+ * are good enough for us, as we can always err on the safe side and
+ * assume that there's a conflict, if we can't be sure of the exact
+ * ordering of two commits.
+ *
+ * Note that a transaction is marked as prepared for a short period during
+ * commit processing, even if two-phase commit is not used. But with
+ * two-phase commit, a transaction can stay in prepared state for some
+ * time.
+ */
+ SerCommitSeqNo prepareSeqNo;
+ SerCommitSeqNo commitSeqNo;
+
+ /* these values are not both interesting at the same time */
+ union
+ {
+ SerCommitSeqNo earliestOutConflictCommit; /* when committed with
+ * conflict out */
+ SerCommitSeqNo lastCommitBeforeSnapshot; /* when not committed or
+ * no conflict out */
+ } SeqNo;
+ SHM_QUEUE outConflicts; /* list of write transactions whose data we
+ * couldn't read. */
+ SHM_QUEUE inConflicts; /* list of read transactions which couldn't
+ * see our write. */
+ SHM_QUEUE predicateLocks; /* list of associated PREDICATELOCK objects */
+ SHM_QUEUE finishedLink; /* list link in
+ * FinishedSerializableTransactions */
+
+ /*
+ * perXactPredicateListLock is only used in parallel queries: it protects
+ * this SERIALIZABLEXACT's predicate lock list against other workers of
+ * the same session.
+ */
+ LWLock perXactPredicateListLock;
+
+ /*
+ * for r/o transactions: list of concurrent r/w transactions that we could
+ * potentially have conflicts with, and vice versa for r/w transactions
+ */
+ SHM_QUEUE possibleUnsafeConflicts;
+
+ TransactionId topXid; /* top level xid for the transaction, if one
+ * exists; else invalid */
+ TransactionId finishedBefore; /* invalid means still running; else the
+ * struct expires when no serializable
+ * xids are before this. */
+ TransactionId xmin; /* the transaction's snapshot xmin */
+ uint32 flags; /* OR'd combination of values defined below */
+ int pid; /* pid of associated process */
+ int pgprocno; /* pgprocno of associated process */
+} SERIALIZABLEXACT;
+
+#define SXACT_FLAG_COMMITTED 0x00000001 /* already committed */
+#define SXACT_FLAG_PREPARED 0x00000002 /* about to commit */
+#define SXACT_FLAG_ROLLED_BACK 0x00000004 /* already rolled back */
+#define SXACT_FLAG_DOOMED 0x00000008 /* will roll back */
+/*
+ * The following flag actually means that the flagged transaction has a
+ * conflict out *to a transaction which committed ahead of it*. It's hard
+ * to get that into a name of a reasonable length.
+ */
+#define SXACT_FLAG_CONFLICT_OUT 0x00000010
+#define SXACT_FLAG_READ_ONLY 0x00000020
+#define SXACT_FLAG_DEFERRABLE_WAITING 0x00000040
+#define SXACT_FLAG_RO_SAFE 0x00000080
+#define SXACT_FLAG_RO_UNSAFE 0x00000100
+#define SXACT_FLAG_SUMMARY_CONFLICT_IN 0x00000200
+#define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400
+/*
+ * The following flag means the transaction has been partially released
+ * already, but is being preserved because parallel workers might have a
+ * reference to it. It'll be recycled by the leader at end-of-transaction.
+ */
+#define SXACT_FLAG_PARTIALLY_RELEASED 0x00000800
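+
+/*
+ * These bits are tested with plain bitmask checks.  A sketch of the kind of
+ * helper that can be built on top of them (shown for illustration only):
+ *
+ *		#define SxactIsCommitted(sxact) \
+ *			(((sxact)->flags & SXACT_FLAG_COMMITTED) != 0)
+ *		#define SxactIsDoomed(sxact) \
+ *			(((sxact)->flags & SXACT_FLAG_DOOMED) != 0)
+ */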
+
+/*
+ * The following types are used to provide an ad hoc list for holding
+ * SERIALIZABLEXACT objects. An HTAB is overkill, since there is no need to
+ * access these by key -- there are direct pointers to these objects where
+ * needed. If a shared memory list is created, these types can probably be
+ * eliminated in favor of using the general solution.
+ */
+typedef struct PredXactListElementData
+{
+ SHM_QUEUE link;
+ SERIALIZABLEXACT sxact;
+} PredXactListElementData;
+
+typedef struct PredXactListElementData *PredXactListElement;
+
+#define PredXactListElementDataSize \
+ ((Size)MAXALIGN(sizeof(PredXactListElementData)))
+
+typedef struct PredXactListData
+{
+ SHM_QUEUE availableList;
+ SHM_QUEUE activeList;
+
+ /*
+ * These global variables are maintained when registering and cleaning up
+ * serializable transactions. They must be global across all backends,
+ * but are not needed outside the predicate.c source file. Protected by
+ * SerializableXactHashLock.
+ */
+ TransactionId SxactGlobalXmin; /* global xmin for active serializable
+ * transactions */
+ int SxactGlobalXminCount; /* how many active serializable
+ * transactions have this xmin */
+ int WritableSxactCount; /* how many non-read-only serializable
+ * transactions are active */
+ SerCommitSeqNo LastSxactCommitSeqNo; /* a strictly monotonically
+ * increasing number for commits
+ * of serializable transactions */
+ /* Protected by SerializableXactHashLock. */
+ SerCommitSeqNo CanPartialClearThrough; /* can clear predicate locks and
+ * inConflicts for committed
+ * transactions through this seq
+ * no */
+ /* Protected by SerializableFinishedListLock. */
+ SerCommitSeqNo HavePartialClearedThrough; /* have cleared through this
+ * seq no */
+ SERIALIZABLEXACT *OldCommittedSxact; /* shared copy of dummy sxact */
+
+ PredXactListElement element;
+} PredXactListData;
+
+typedef struct PredXactListData *PredXactList;
+
+#define PredXactListDataSize \
+ ((Size)MAXALIGN(sizeof(PredXactListData)))
+
+
+/*
+ * The following types are used to provide lists of rw-conflicts between
+ * pairs of transactions. Since exactly the same information is needed,
+ * they are also used to record possible unsafe transaction relationships
+ * for purposes of identifying safe snapshots for read-only transactions.
+ *
+ * When a RWConflictData is not in use to record either type of relationship
+ * between a pair of transactions, it is kept on an "available" list. The
+ * outLink field is used for maintaining that list.
+ */
+typedef struct RWConflictData
+{
+ SHM_QUEUE outLink; /* link for list of conflicts out from a sxact */
+ SHM_QUEUE inLink; /* link for list of conflicts in to a sxact */
+ SERIALIZABLEXACT *sxactOut;
+ SERIALIZABLEXACT *sxactIn;
+} RWConflictData;
+
+typedef struct RWConflictData *RWConflict;
+
+#define RWConflictDataSize \
+ ((Size)MAXALIGN(sizeof(RWConflictData)))
+
+typedef struct RWConflictPoolHeaderData
+{
+ SHM_QUEUE availableList;
+ RWConflict element;
+} RWConflictPoolHeaderData;
+
+typedef struct RWConflictPoolHeaderData *RWConflictPoolHeader;
+
+#define RWConflictPoolHeaderDataSize \
+ ((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData)))
+
+
+/*
+ * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable
+ * transaction or any of its subtransactions.
+ */
+typedef struct SERIALIZABLEXIDTAG
+{
+ TransactionId xid;
+} SERIALIZABLEXIDTAG;
+
+/*
+ * The SERIALIZABLEXID struct provides a link from a TransactionId for a
+ * serializable transaction to the related SERIALIZABLEXACT record, even if
+ * the transaction has completed and its connection has been closed.
+ *
+ * These are created as new top level transaction IDs are first assigned to
+ * transactions which are participating in predicate locking. This may
+ * never happen for a particular transaction if it doesn't write anything.
+ * They are removed with their related serializable transaction objects.
+ *
+ * The SubTransGetTopmostTransaction method is used where necessary to get
+ * from an XID which might be from a subtransaction to the top level XID.
+ */
+typedef struct SERIALIZABLEXID
+{
+ /* hash key */
+ SERIALIZABLEXIDTAG tag;
+
+ /* data */
+ SERIALIZABLEXACT *myXact; /* pointer to the top level transaction data */
+} SERIALIZABLEXID;
+
+
+/*
+ * The PREDICATELOCKTARGETTAG struct identifies a database object which can
+ * be the target of predicate locks.
+ *
+ * Note that the hash function being used doesn't properly respect tag
+ * length -- if the length of the structure isn't a multiple of four bytes it
+ * will go to a four byte boundary past the end of the tag. If you change
+ * this struct, make sure any slack space is initialized, so that any random
+ * bytes in the middle or at the end are not included in the hash.
+ *
+ * TODO SSI: If we always use the same fields for the same type of value, we
+ * should rename these. Holding off until it's clear there are no exceptions.
+ * Since indexes are relations with blocks and tuples, it's looking likely that
+ * the rename will be possible. If not, we may need to divide the last field
+ * and use part of it for a target type, so that we know how to interpret the
+ * data.
+ */
+typedef struct PREDICATELOCKTARGETTAG
+{
+ uint32 locktag_field1; /* a 32-bit ID field */
+ uint32 locktag_field2; /* a 32-bit ID field */
+ uint32 locktag_field3; /* a 32-bit ID field */
+ uint32 locktag_field4; /* a 32-bit ID field */
+} PREDICATELOCKTARGETTAG;
+
+/*
+ * The PREDICATELOCKTARGET struct represents a database object on which there
+ * are predicate locks.
+ *
+ * A hash list of these objects is maintained in shared memory. An entry is
+ * added when a predicate lock is requested on an object which doesn't
+ * already have one. An entry is removed when the last lock is removed from
+ * its list.
+ */
+typedef struct PREDICATELOCKTARGET
+{
+ /* hash key */
+ PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
+
+ /* data */
+ SHM_QUEUE predicateLocks; /* list of PREDICATELOCK objects assoc. with
+ * predicate lock target */
+} PREDICATELOCKTARGET;
+
+
+/*
+ * The PREDICATELOCKTAG struct identifies an individual predicate lock.
+ *
+ * It is the combination of predicate lock target (which is a lockable
+ * object) and a serializable transaction which has acquired a lock on that
+ * target.
+ */
+typedef struct PREDICATELOCKTAG
+{
+ PREDICATELOCKTARGET *myTarget;
+ SERIALIZABLEXACT *myXact;
+} PREDICATELOCKTAG;
+
+/*
+ * The PREDICATELOCK struct represents an individual lock.
+ *
+ * An entry can be created here when the related database object is read, or
+ * by promotion of multiple finer-grained targets. All entries related to a
+ * serializable transaction are removed when that serializable transaction is
+ * cleaned up. Entries can also be removed when they are combined into a
+ * single coarser-grained lock entry.
+ */
+typedef struct PREDICATELOCK
+{
+ /* hash key */
+ PREDICATELOCKTAG tag; /* unique identifier of lock */
+
+ /* data */
+ SHM_QUEUE targetLink; /* list link in PREDICATELOCKTARGET's list of
+ * predicate locks */
+ SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of
+ * predicate locks */
+ SerCommitSeqNo commitSeqNo; /* only used for summarized predicate locks */
+} PREDICATELOCK;
+
+
+/*
+ * The LOCALPREDICATELOCK struct represents a local copy of data which is
+ * also present in the PREDICATELOCK table, organized for fast access without
+ * needing to acquire a LWLock. It is strictly for optimization.
+ *
+ * Each serializable transaction creates its own local hash table to hold a
+ * collection of these. This information is used to determine when a number
+ * of fine-grained locks should be promoted to a single coarser-grained lock.
+ * The information is maintained more-or-less in parallel to the
+ * PREDICATELOCK data, but because this data is not protected by locks and is
+ * only used in an optimization heuristic, it is allowed to drift in a few
+ * corner cases where maintaining exact data would be expensive.
+ *
+ * The hash table is created when the serializable transaction acquires its
+ * snapshot, and its memory is released upon completion of the transaction.
+ */
+typedef struct LOCALPREDICATELOCK
+{
+ /* hash key */
+ PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
+
+ /* data */
+ bool held; /* is lock held, or just its children? */
+ int childLocks; /* number of child locks currently held */
+} LOCALPREDICATELOCK;
+
+
+/*
+ * The types of predicate locks which can be acquired.
+ */
+typedef enum PredicateLockTargetType
+{
+ PREDLOCKTAG_RELATION,
+ PREDLOCKTAG_PAGE,
+ PREDLOCKTAG_TUPLE
+ /* TODO SSI: Other types may be needed for index locking */
+} PredicateLockTargetType;
+
+
+/*
+ * This structure is used to quickly capture a copy of all predicate
+ * locks. This is currently used only by the pg_lock_status function,
+ * which in turn is used by the pg_locks view.
+ */
+typedef struct PredicateLockData
+{
+ int nelements;
+ PREDICATELOCKTARGETTAG *locktags;
+ SERIALIZABLEXACT *xacts;
+} PredicateLockData;
+
+
+/*
+ * These macros define how we map logical IDs of lockable objects into the
+ * physical fields of PREDICATELOCKTARGETTAG. Use these to set up values,
+ * rather than accessing the fields directly. Note multiple eval of target!
+ */
+#define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = (reloid), \
+ (locktag).locktag_field3 = InvalidBlockNumber, \
+ (locktag).locktag_field4 = InvalidOffsetNumber)
+
+#define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = (reloid), \
+ (locktag).locktag_field3 = (blocknum), \
+ (locktag).locktag_field4 = InvalidOffsetNumber)
+
+#define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \
+ ((locktag).locktag_field1 = (dboid), \
+ (locktag).locktag_field2 = (reloid), \
+ (locktag).locktag_field3 = (blocknum), \
+ (locktag).locktag_field4 = (offnum))
+
+#define GET_PREDICATELOCKTARGETTAG_DB(locktag) \
+ ((Oid) (locktag).locktag_field1)
+#define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \
+ ((Oid) (locktag).locktag_field2)
+#define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \
+ ((BlockNumber) (locktag).locktag_field3)
+#define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \
+ ((OffsetNumber) (locktag).locktag_field4)
+#define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \
+ (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \
+ (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE : \
+ PREDLOCKTAG_RELATION))
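+
+/*
+ * Usage sketch (illustrative only; "reloid" and "blkno" stand for values the
+ * caller already has): build a page-level target tag and read it back.
+ *
+ *		PREDICATELOCKTARGETTAG tag;
+ *
+ *		SET_PREDICATELOCKTARGETTAG_PAGE(tag, MyDatabaseId, reloid, blkno);
+ *		Assert(GET_PREDICATELOCKTARGETTAG_TYPE(tag) == PREDLOCKTAG_PAGE);
+ *		Assert(GET_PREDICATELOCKTARGETTAG_PAGE(tag) == blkno);
+ */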
+
+/*
+ * Two-phase commit statefile records. There are two types: for each
+ * transaction, we generate one per-transaction record and a variable
+ * number of per-predicate-lock records.
+ */
+typedef enum TwoPhasePredicateRecordType
+{
+ TWOPHASEPREDICATERECORD_XACT,
+ TWOPHASEPREDICATERECORD_LOCK
+} TwoPhasePredicateRecordType;
+
+/*
+ * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not
+ * much is needed because most of it is not meaningful for a recovered
+ * prepared transaction.
+ *
+ * In particular, we do not record the in and out conflict lists for a
+ * prepared transaction because the associated SERIALIZABLEXACTs will
+ * not be available after recovery. Instead, we simply record the
+ * existence of each type of conflict by setting the transaction's
+ * summary conflict in/out flag.
+ */
+typedef struct TwoPhasePredicateXactRecord
+{
+ TransactionId xmin;
+ uint32 flags;
+} TwoPhasePredicateXactRecord;
+
+/* Per-lock state */
+typedef struct TwoPhasePredicateLockRecord
+{
+ PREDICATELOCKTARGETTAG target;
+ uint32 filler; /* to avoid length change in back-patched fix */
+} TwoPhasePredicateLockRecord;
+
+typedef struct TwoPhasePredicateRecord
+{
+ TwoPhasePredicateRecordType type;
+ union
+ {
+ TwoPhasePredicateXactRecord xactRecord;
+ TwoPhasePredicateLockRecord lockRecord;
+ } data;
+} TwoPhasePredicateRecord;
+
+/*
+ * Define a macro to use for an "empty" SERIALIZABLEXACT reference.
+ */
+#define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL)
+
+
+/*
+ * Function definitions for functions needing awareness of predicate
+ * locking internals.
+ */
+extern PredicateLockData *GetPredicateLockStatusData(void);
+extern int GetSafeSnapshotBlockingPids(int blocked_pid,
+ int *output, int output_size);
+
+#endif /* PREDICATE_INTERNALS_H */
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
new file mode 100644
index 0000000..2579e61
--- /dev/null
+++ b/src/include/storage/proc.h
@@ -0,0 +1,461 @@
+/*-------------------------------------------------------------------------
+ *
+ * proc.h
+ * per-process shared memory data structures
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/proc.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _PROC_H_
+#define _PROC_H_
+
+#include "access/clog.h"
+#include "access/xlogdefs.h"
+#include "lib/ilist.h"
+#include "storage/latch.h"
+#include "storage/lock.h"
+#include "storage/pg_sema.h"
+#include "storage/proclist_types.h"
+
+/*
+ * Each backend advertises up to PGPROC_MAX_CACHED_SUBXIDS TransactionIds
+ * for non-aborted subtransactions of its current top transaction. These
+ * have to be treated as running XIDs by other backends.
+ *
+ * We also keep track of whether the cache overflowed (ie, the transaction has
+ * generated at least one subtransaction that didn't fit in the cache).
+ * If none of the caches have overflowed, we can assume that an XID that's not
+ * listed anywhere in the PGPROC array is not a running transaction. Else we
+ * have to look at pg_subtrans.
+ */
+#define PGPROC_MAX_CACHED_SUBXIDS 64 /* XXX guessed-at value */
+
+typedef struct XidCacheStatus
+{
+ /* number of cached subxids, never more than PGPROC_MAX_CACHED_SUBXIDS */
+ uint8 count;
+ /* has PGPROC->subxids overflowed */
+ bool overflowed;
+} XidCacheStatus;
+
+struct XidCache
+{
+ TransactionId xids[PGPROC_MAX_CACHED_SUBXIDS];
+};
+
+/*
+ * Flags for PGPROC->statusFlags and PROC_HDR->statusFlags[]
+ */
+#define PROC_IS_AUTOVACUUM 0x01 /* is it an autovac worker? */
+#define PROC_IN_VACUUM 0x02 /* currently running lazy vacuum */
+#define PROC_IN_SAFE_IC 0x04 /* currently running CREATE INDEX
+ * CONCURRENTLY or REINDEX
+ * CONCURRENTLY on non-expressional,
+ * non-partial index */
+#define PROC_VACUUM_FOR_WRAPAROUND 0x08 /* set by autovac only */
+#define PROC_IN_LOGICAL_DECODING 0x10 /* currently doing logical
+ * decoding outside xact */
+#define PROC_AFFECTS_ALL_HORIZONS 0x20 /* this proc's xmin must be
+ * included in vacuum horizons
+ * in all databases */
+
+/* flags reset at EOXact */
+#define PROC_VACUUM_STATE_MASK \
+ (PROC_IN_VACUUM | PROC_IN_SAFE_IC | PROC_VACUUM_FOR_WRAPAROUND)
+
+/*
+ * Xmin-related flags. Make sure any flags that affect how the process' Xmin
+ * value is interpreted by VACUUM are included here.
+ */
+#define PROC_XMIN_FLAGS (PROC_IN_VACUUM | PROC_IN_SAFE_IC)
+
+/*
+ * We allow a small number of "weak" relation locks (AccessShareLock,
+ * RowShareLock, RowExclusiveLock) to be recorded in the PGPROC structure
+ * rather than the main lock table. This eases contention on the lock
+ * manager LWLocks. See storage/lmgr/README for additional details.
+ */
+#define FP_LOCK_SLOTS_PER_BACKEND 16
+
+/*
+ * An invalid pgprocno. Must be larger than the maximum number of PGPROC
+ * structures we could possibly have. See comments for MAX_BACKENDS.
+ */
+#define INVALID_PGPROCNO PG_INT32_MAX
+
+/*
+ * Flags for PGPROC.delayChkpt
+ *
+ * These flags can be used to delay the start or completion of a checkpoint
+ * for short periods. A flag is in effect if the corresponding bit is set in
+ * the PGPROC of any backend.
+ *
+ * For our purposes here, a checkpoint has three phases: (1) determine the
+ * location to which the redo pointer will be moved, (2) write all the
+ * data durably to disk, and (3) WAL-log the checkpoint.
+ *
+ * Setting DELAY_CHKPT_START prevents the system from moving from phase 1
+ * to phase 2. This is useful when we are performing a WAL-logged modification
+ * of data that will be flushed to disk in phase 2. By setting this flag
+ * before writing WAL and clearing it after we've both written WAL and
+ * performed the corresponding modification, we ensure that if the WAL record
+ * is inserted prior to the new redo point, the corresponding data changes will
+ * also be flushed to disk before the checkpoint can complete. (In the
+ * extremely common case where the data being modified is in shared buffers
+ * and we acquire an exclusive content lock on the relevant buffers before
+ * writing WAL, this mechanism is not needed, because phase 2 will block
+ * until we release the content lock and then flush the modified data to
+ * disk.)
+ *
+ * Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2
+ * to phase 3. This is useful if we are performing a WAL-logged operation that
+ * might invalidate buffers, such as relation truncation. In this case, we need
+ * to ensure that any buffers which were invalidated and thus not flushed by
+ * the checkpoint are actually destroyed on disk. Replay can cope with a file
+ * or block that doesn't exist, but not with a block that has the wrong
+ * contents.
+ */
+#define DELAY_CHKPT_START (1<<0)
+#define DELAY_CHKPT_COMPLETE (1<<1)
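+
+/*
+ * Usage sketch (illustrative only; error handling omitted): a backend that
+ * needs the redo pointer to stay put while it WAL-logs and applies a data
+ * change sets the flag around that window, roughly like this:
+ *
+ *		Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
+ *		MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+ *		... insert the WAL record and perform the corresponding change ...
+ *		MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+ */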
+
+typedef enum
+{
+ PROC_WAIT_STATUS_OK,
+ PROC_WAIT_STATUS_WAITING,
+ PROC_WAIT_STATUS_ERROR,
+} ProcWaitStatus;
+
+/*
+ * Each backend has a PGPROC struct in shared memory. There is also a list of
+ * currently-unused PGPROC structs that will be reallocated to new backends.
+ *
+ * links: list link for any list the PGPROC is in. When waiting for a lock,
+ * the PGPROC is linked into that lock's waitProcs queue. A recycled PGPROC
+ * is linked into ProcGlobal's freeProcs list.
+ *
+ * Note: twophase.c also sets up a dummy PGPROC struct for each currently
+ * prepared transaction. These PGPROCs appear in the ProcArray data structure
+ * so that the prepared transactions appear to be still running and are
+ * correctly shown as holding locks. A prepared transaction PGPROC can be
+ * distinguished from a real one at need by the fact that it has pid == 0.
+ * The semaphore and lock-activity fields in a prepared-xact PGPROC are unused,
+ * but its myProcLocks[] lists are valid.
+ *
+ * We allow many fields of this struct to be accessed without locks, such as
+ * delayChkpt and isBackgroundWorker. However, keep in mind that writing
+ * mirrored ones (see below) requires holding ProcArrayLock or XidGenLock in
+ * at least shared mode, so that pgxactoff does not change concurrently.
+ *
+ * Mirrored fields:
+ *
+ * Some fields in PGPROC (see "mirrored in ..." comment) are mirrored into an
+ * element of more densely packed ProcGlobal arrays. These arrays are indexed
+ * by PGPROC->pgxactoff. Both copies need to be maintained coherently.
+ *
+ * NB: The pgxactoff indexed value can *never* be accessed without holding
+ * locks.
+ *
+ * See PROC_HDR for details.
+ */
+struct PGPROC
+{
+ /* proc->links MUST BE FIRST IN STRUCT (see ProcSleep,ProcWakeup,etc) */
+ SHM_QUEUE links; /* list link if process is in a list */
+ PGPROC **procgloballist; /* procglobal list that owns this PGPROC */
+
+ PGSemaphore sem; /* ONE semaphore to sleep on */
+ ProcWaitStatus waitStatus;
+
+ Latch procLatch; /* generic latch for process */
+
+ TransactionId xid; /* id of top-level transaction currently being
+ * executed by this proc, if running and XID
+ * is assigned; else InvalidTransactionId.
+ * mirrored in ProcGlobal->xids[pgxactoff] */
+
+ TransactionId xmin; /* minimal running XID as it was when we were
+ * starting our xact, excluding LAZY VACUUM:
+ * vacuum must not remove tuples deleted by
+ * xid >= xmin ! */
+
+ LocalTransactionId lxid; /* local id of top-level transaction currently
+ * being executed by this proc, if running;
+ * else InvalidLocalTransactionId */
+ int pid; /* Backend's process ID; 0 if prepared xact */
+
+ int pgxactoff; /* offset into various ProcGlobal->arrays with
+ * data mirrored from this PGPROC */
+ int pgprocno;
+
+ /* These fields are zero while a backend is still starting up: */
+ BackendId backendId; /* This backend's backend ID (if assigned) */
+ Oid databaseId; /* OID of database this backend is using */
+ Oid roleId; /* OID of role using this backend */
+
+ Oid tempNamespaceId; /* OID of temp schema this backend is
+ * using */
+
+ bool isBackgroundWorker; /* true if background worker. */
+
+ /*
+ * While in hot standby mode, shows that a conflict signal has been sent
+ * for the current transaction. Set/cleared while holding ProcArrayLock,
+ * though not required. Accessed without lock, if needed.
+ */
+ bool recoveryConflictPending;
+
+ /* Info about LWLock the process is currently waiting for, if any. */
+ bool lwWaiting; /* true if waiting for an LW lock */
+ uint8 lwWaitMode; /* lwlock mode being waited for */
+ proclist_node lwWaitLink; /* position in LW lock wait list */
+
+ /* Support for condition variables. */
+ proclist_node cvWaitLink; /* position in CV wait list */
+
+ /* Info about lock the process is currently waiting for, if any. */
+ /* waitLock and waitProcLock are NULL if not currently waiting. */
+ LOCK *waitLock; /* Lock object we're sleeping on ... */
+ PROCLOCK *waitProcLock; /* Per-holder info for awaited lock */
+ LOCKMODE waitLockMode; /* type of lock we're waiting for */
+ LOCKMASK heldLocks; /* bitmask for lock types already held on this
+ * lock object by this backend */
+ pg_atomic_uint64 waitStart; /* time at which wait for lock acquisition
+ * started */
+
+ int delayChkptFlags; /* for DELAY_CHKPT_* flags */
+
+ uint8 statusFlags; /* this backend's status flags, see PROC_*
+ * above. mirrored in
+ * ProcGlobal->statusFlags[pgxactoff] */
+
+ /*
+ * Info to allow us to wait for synchronous replication, if needed.
+ * waitLSN is InvalidXLogRecPtr if not waiting; set only by user backend.
+ * syncRepState must not be touched except by owning process or WALSender.
+ * syncRepLinks used only while holding SyncRepLock.
+ */
+ XLogRecPtr waitLSN; /* waiting for this LSN or higher */
+ int syncRepState; /* wait state for sync rep */
+ SHM_QUEUE syncRepLinks; /* list link if process is in syncrep queue */
+
+ /*
+ * All PROCLOCK objects for locks held or awaited by this backend are
+ * linked into one of these lists, according to the partition number of
+ * their lock.
+ */
+ SHM_QUEUE myProcLocks[NUM_LOCK_PARTITIONS];
+
+ XidCacheStatus subxidStatus; /* mirrored with
+ * ProcGlobal->subxidStates[i] */
+ struct XidCache subxids; /* cache for subtransaction XIDs */
+
+ /* Support for group XID clearing. */
+ /* true, if member of ProcArray group waiting for XID clear */
+ bool procArrayGroupMember;
+ /* next ProcArray group member waiting for XID clear */
+ pg_atomic_uint32 procArrayGroupNext;
+
+ /*
+ * latest transaction id among the transaction's main XID and
+ * subtransactions
+ */
+ TransactionId procArrayGroupMemberXid;
+
+ uint32 wait_event_info; /* proc's wait information */
+
+ /* Support for group transaction status update. */
+ bool clogGroupMember; /* true, if member of clog group */
+ pg_atomic_uint32 clogGroupNext; /* next clog group member */
+ TransactionId clogGroupMemberXid; /* transaction id of clog group member */
+ XidStatus clogGroupMemberXidStatus; /* transaction status of clog
+ * group member */
+ int clogGroupMemberPage; /* clog page corresponding to
+ * transaction id of clog group member */
+ XLogRecPtr clogGroupMemberLsn; /* WAL location of commit record for clog
+ * group member */
+
+ /* Lock manager data, recording fast-path locks taken by this backend. */
+ LWLock fpInfoLock; /* protects per-backend fast-path state */
+ uint64 fpLockBits; /* lock modes held for each fast-path slot */
+ Oid fpRelId[FP_LOCK_SLOTS_PER_BACKEND]; /* slots for rel oids */
+ bool fpVXIDLock; /* are we holding a fast-path VXID lock? */
+ LocalTransactionId fpLocalTransactionId; /* lxid for fast-path VXID
+ * lock */
+
+ /*
+ * Support for lock groups. Use LockHashPartitionLockByProc on the group
+ * leader to get the LWLock protecting these fields.
+ */
+ PGPROC *lockGroupLeader; /* lock group leader, if I'm a member */
+ dlist_head lockGroupMembers; /* list of members, if I'm a leader */
+ dlist_node lockGroupLink; /* my member link, if I'm a member */
+};
+
+/* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */
+
+
+extern PGDLLIMPORT PGPROC *MyProc;
+
+/*
+ * There is one ProcGlobal struct for the whole database cluster.
+ *
+ * Adding/Removing an entry into the procarray requires holding *both*
+ * ProcArrayLock and XidGenLock in exclusive mode (in that order). Both are
+ * needed because the dense arrays (see below) are accessed from
+ * GetNewTransactionId() and GetSnapshotData(), and we don't want to add
+ * further contention by both using the same lock. Adding/Removing a procarray
+ * entry is much less frequent.
+ *
+ * Some fields in PGPROC are mirrored into more densely packed arrays (e.g.
+ * xids), with one entry for each backend. These arrays only contain entries
+ * for PGPROCs that have been added to the shared array with ProcArrayAdd()
+ * (in contrast to PGPROC array which has unused PGPROCs interspersed).
+ *
+ * The dense arrays are indexed by PGPROC->pgxactoff. Any concurrent
+ * ProcArrayAdd() / ProcArrayRemove() can lead to pgxactoff of a procarray
+ * member to change. Therefore it is only safe to use PGPROC->pgxactoff to
+ * access the dense array while holding either ProcArrayLock or XidGenLock.
+ *
+ * As long as a PGPROC is in the procarray, the mirrored values need to be
+ * maintained in both places in a coherent manner.
+ *
+ * The denser separate arrays are beneficial for three main reasons: First, to
+ * keep the loops that access the data as tight as possible. Second, to prevent
+ * updates of frequently changing data (e.g. xmin) from invalidating
+ * cachelines also containing less frequently changing data (e.g. xid,
+ * statusFlags). Third, to condense frequently accessed data into as few
+ * cachelines as possible.
+ *
+ * There are two main reasons to have the data mirrored between these dense
+ * arrays and PGPROC. First, as explained above, a PGPROC's array entries can
+ * only be accessed with either ProcArrayLock or XidGenLock held, whereas the
+ * PGPROC entries do not require that (obviously there may still be locking
+ * requirements around the individual field, separate from the concerns
+ * here). That is particularly important for a backend that needs to check
+ * its own values efficiently, which it can often do safely without locking.
+ * Second, the PGPROC fields let us avoid unnecessary accesses to and
+ * modifications of the
+ * dense arrays. A backend's own PGPROC is more likely to be in a local cache,
+ * whereas the cachelines for the dense array will be modified by other
+ * backends (often removing it from the cache for other cores/sockets). At
+ * commit/abort time a check of the PGPROC value can avoid accessing/dirtying
+ * the corresponding array value.
+ *
+ * Basically it makes sense to access the PGPROC variable when checking a
+ * single backend's data, especially when we are already looking at the
+ * PGPROC for other reasons. It makes sense to look at the "dense" arrays if we
+ * need to look at many / most entries, because we then benefit from the
+ * reduced indirection and better cross-process cache-ability.
+ *
+ * When entering a PGPROC for 2PC transactions with ProcArrayAdd(), the data
+ * in the dense arrays is initialized from the PGPROC while it already holds
+ * ProcArrayLock.
+ */
+typedef struct PROC_HDR
+{
+ /* Array of PGPROC structures (not including dummies for prepared txns) */
+ PGPROC *allProcs;
+
+ /* Array mirroring PGPROC.xid for each PGPROC currently in the procarray */
+ TransactionId *xids;
+
+ /*
+ * Array mirroring PGPROC.subxidStatus for each PGPROC currently in the
+ * procarray.
+ */
+ XidCacheStatus *subxidStates;
+
+ /*
+ * Array mirroring PGPROC.statusFlags for each PGPROC currently in the
+ * procarray.
+ */
+ uint8 *statusFlags;
+
+ /* Length of allProcs array */
+ uint32 allProcCount;
+ /* Head of list of free PGPROC structures */
+ PGPROC *freeProcs;
+ /* Head of list of autovacuum's free PGPROC structures */
+ PGPROC *autovacFreeProcs;
+ /* Head of list of bgworker free PGPROC structures */
+ PGPROC *bgworkerFreeProcs;
+ /* Head of list of walsender free PGPROC structures */
+ PGPROC *walsenderFreeProcs;
+ /* First pgproc waiting for group XID clear */
+ pg_atomic_uint32 procArrayGroupFirst;
+ /* First pgproc waiting for group transaction status update */
+ pg_atomic_uint32 clogGroupFirst;
+ /* WALWriter process's latch */
+ Latch *walwriterLatch;
+ /* Checkpointer process's latch */
+ Latch *checkpointerLatch;
+ /* Current shared estimate of appropriate spins_per_delay value */
+ int spins_per_delay;
+ /* Buffer id of the buffer that Startup process waits for pin on, or -1 */
+ int startupBufferPinWaitBufId;
+} PROC_HDR;
+
+extern PGDLLIMPORT PROC_HDR *ProcGlobal;
+
+extern PGDLLIMPORT PGPROC *PreparedXactProcs;
+
+/* Accessor for PGPROC given a pgprocno. */
+#define GetPGProcByNumber(n) (&ProcGlobal->allProcs[(n)])
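+
+/*
+ * Usage sketch (illustrative only): a pgprocno maps back to its PGPROC, so a
+ * backend's own entry round-trips through the accessor.
+ *
+ *		PGPROC	   *proc = GetPGProcByNumber(MyProc->pgprocno);
+ *
+ *		Assert(proc == MyProc);
+ */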
+
+/*
+ * We set aside some extra PGPROC structures for auxiliary processes,
+ * ie things that aren't full-fledged backends but need shmem access.
+ *
+ * Background writer, checkpointer, WAL writer and archiver run during normal
+ * operation. Startup process and WAL receiver also consume 2 slots, but WAL
+ * writer is launched only after startup has exited, so we only need 5 slots.
+ */
+#define NUM_AUXILIARY_PROCS 5
+
+/* configurable options */
+extern PGDLLIMPORT int DeadlockTimeout;
+extern PGDLLIMPORT int StatementTimeout;
+extern PGDLLIMPORT int LockTimeout;
+extern PGDLLIMPORT int IdleInTransactionSessionTimeout;
+extern PGDLLIMPORT int IdleSessionTimeout;
+extern PGDLLIMPORT bool log_lock_waits;
+
+
+/*
+ * Function Prototypes
+ */
+extern int ProcGlobalSemas(void);
+extern Size ProcGlobalShmemSize(void);
+extern void InitProcGlobal(void);
+extern void InitProcess(void);
+extern void InitProcessPhase2(void);
+extern void InitAuxiliaryProcess(void);
+
+extern void SetStartupBufferPinWaitBufId(int bufid);
+extern int GetStartupBufferPinWaitBufId(void);
+
+extern bool HaveNFreeProcs(int n);
+extern void ProcReleaseLocks(bool isCommit);
+
+extern void ProcQueueInit(PROC_QUEUE *queue);
+extern ProcWaitStatus ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable);
+extern PGPROC *ProcWakeup(PGPROC *proc, ProcWaitStatus waitStatus);
+extern void ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock);
+extern void CheckDeadLockAlert(void);
+extern bool IsWaitingForLock(void);
+extern void LockErrorCleanup(void);
+
+extern void ProcWaitForSignal(uint32 wait_event_info);
+extern void ProcSendSignal(int pgprocno);
+
+extern PGPROC *AuxiliaryPidGetProc(int pid);
+
+extern void BecomeLockGroupLeader(void);
+extern bool BecomeLockGroupMember(PGPROC *leader, int pid);
+
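+/*
+ * Usage sketch (illustrative only; how the leader's PGPROC pointer and pid
+ * reach the worker, e.g. via the parallel DSM segment, is assumed): a
+ * parallel leader forms a lock group and each worker joins it.
+ *
+ *		In the leader, before launching workers:
+ *			BecomeLockGroupLeader();
+ *
+ *		In each worker, during startup:
+ *			if (!BecomeLockGroupMember(leader_pgproc, leader_pid))
+ *				... the leader has already exited, so give up ...
+ */
+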
+#endif /* _PROC_H_ */
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
new file mode 100644
index 0000000..781e3f6
--- /dev/null
+++ b/src/include/storage/procarray.h
@@ -0,0 +1,99 @@
+/*-------------------------------------------------------------------------
+ *
+ * procarray.h
+ * POSTGRES process array definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/procarray.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PROCARRAY_H
+#define PROCARRAY_H
+
+#include "storage/lock.h"
+#include "storage/standby.h"
+#include "utils/relcache.h"
+#include "utils/snapshot.h"
+
+
+extern Size ProcArrayShmemSize(void);
+extern void CreateSharedProcArray(void);
+extern void ProcArrayAdd(PGPROC *proc);
+extern void ProcArrayRemove(PGPROC *proc, TransactionId latestXid);
+
+extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid);
+extern void ProcArrayClearTransaction(PGPROC *proc);
+
+extern void ProcArrayInitRecovery(TransactionId initializedUptoXID);
+extern void ProcArrayApplyRecoveryInfo(RunningTransactions running);
+extern void ProcArrayApplyXidAssignment(TransactionId topxid,
+ int nsubxids, TransactionId *subxids);
+
+extern void RecordKnownAssignedTransactionIds(TransactionId xid);
+extern void ExpireTreeKnownAssignedTransactionIds(TransactionId xid,
+ int nsubxids, TransactionId *subxids,
+ TransactionId max_xid);
+extern void ExpireAllKnownAssignedTransactionIds(void);
+extern void ExpireOldKnownAssignedTransactionIds(TransactionId xid);
+extern void KnownAssignedTransactionIdsIdleMaintenance(void);
+
+extern int GetMaxSnapshotXidCount(void);
+extern int GetMaxSnapshotSubxidCount(void);
+
+extern Snapshot GetSnapshotData(Snapshot snapshot);
+
+extern bool ProcArrayInstallImportedXmin(TransactionId xmin,
+ VirtualTransactionId *sourcevxid);
+extern bool ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc);
+
+extern RunningTransactions GetRunningTransactionData(void);
+
+extern bool TransactionIdIsInProgress(TransactionId xid);
+extern bool TransactionIdIsActive(TransactionId xid);
+extern TransactionId GetOldestNonRemovableTransactionId(Relation rel);
+extern TransactionId GetOldestTransactionIdConsideredRunning(void);
+extern TransactionId GetOldestActiveTransactionId(void);
+extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly);
+extern void GetReplicationHorizons(TransactionId *slot_xmin, TransactionId *catalog_xmin);
+
+extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids, int type);
+extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids,
+ int nvxids, int type);
+
+extern PGPROC *BackendPidGetProc(int pid);
+extern PGPROC *BackendPidGetProcWithLock(int pid);
+extern int BackendXidGetPid(TransactionId xid);
+extern bool IsBackendPid(int pid);
+
+extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin,
+ bool excludeXmin0, bool allDbs, int excludeVacuum,
+ int *nvxids);
+extern VirtualTransactionId *GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid);
+extern pid_t CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode);
+extern pid_t SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode,
+ bool conflictPending);
+
+extern bool MinimumActiveBackends(int min);
+extern int CountDBBackends(Oid databaseid);
+extern int CountDBConnections(Oid databaseid);
+extern void CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending);
+extern int CountUserBackends(Oid roleid);
+extern bool CountOtherDBBackends(Oid databaseId,
+ int *nbackends, int *nprepared);
+extern void TerminateOtherDBBackends(Oid databaseId);
+
+extern void XidCacheRemoveRunningXids(TransactionId xid,
+ int nxids, const TransactionId *xids,
+ TransactionId latestXid);
+
+extern void ProcArraySetReplicationSlotXmin(TransactionId xmin,
+ TransactionId catalog_xmin, bool already_locked);
+
+extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
+ TransactionId *catalog_xmin);
+
+#endif /* PROCARRAY_H */
diff --git a/src/include/storage/proclist.h b/src/include/storage/proclist.h
new file mode 100644
index 0000000..509e341
--- /dev/null
+++ b/src/include/storage/proclist.h
@@ -0,0 +1,219 @@
+/*-------------------------------------------------------------------------
+ *
+ * proclist.h
+ * operations on doubly-linked lists of pgprocnos
+ *
+ * The interface is similar to dlist from ilist.h, but uses pgprocno instead
+ * of pointers. This allows proclist_head to be mapped at different addresses
+ * in different backends.
+ *
+ * See proclist_types.h for the structs that these functions operate on. They
+ * are separated to break a header dependency cycle with proc.h.
+ *
+ * Portions Copyright (c) 2016-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/include/storage/proclist.h
+ *-------------------------------------------------------------------------
+ */
+#ifndef PROCLIST_H
+#define PROCLIST_H
+
+#include "storage/proc.h"
+#include "storage/proclist_types.h"
+
+/*
+ * Initialize a proclist.
+ */
+static inline void
+proclist_init(proclist_head *list)
+{
+ list->head = list->tail = INVALID_PGPROCNO;
+}
+
+/*
+ * Is the list empty?
+ */
+static inline bool
+proclist_is_empty(proclist_head *list)
+{
+ return list->head == INVALID_PGPROCNO;
+}
+
+/*
+ * Get a pointer to a proclist_node inside a given PGPROC, given a procno and
+ * the proclist_node field's offset within struct PGPROC.
+ */
+static inline proclist_node *
+proclist_node_get(int procno, size_t node_offset)
+{
+ char *entry = (char *) GetPGProcByNumber(procno);
+
+ return (proclist_node *) (entry + node_offset);
+}
+
+/*
+ * Insert a process at the beginning of a list.
+ */
+static inline void
+proclist_push_head_offset(proclist_head *list, int procno, size_t node_offset)
+{
+ proclist_node *node = proclist_node_get(procno, node_offset);
+
+ Assert(node->next == 0 && node->prev == 0);
+
+ if (list->head == INVALID_PGPROCNO)
+ {
+ Assert(list->tail == INVALID_PGPROCNO);
+ node->next = node->prev = INVALID_PGPROCNO;
+ list->head = list->tail = procno;
+ }
+ else
+ {
+ Assert(list->tail != INVALID_PGPROCNO);
+ Assert(list->head != procno);
+ Assert(list->tail != procno);
+ node->next = list->head;
+ proclist_node_get(node->next, node_offset)->prev = procno;
+ node->prev = INVALID_PGPROCNO;
+ list->head = procno;
+ }
+}
+
+/*
+ * Insert a process at the end of a list.
+ */
+static inline void
+proclist_push_tail_offset(proclist_head *list, int procno, size_t node_offset)
+{
+ proclist_node *node = proclist_node_get(procno, node_offset);
+
+ Assert(node->next == 0 && node->prev == 0);
+
+ if (list->tail == INVALID_PGPROCNO)
+ {
+ Assert(list->head == INVALID_PGPROCNO);
+ node->next = node->prev = INVALID_PGPROCNO;
+ list->head = list->tail = procno;
+ }
+ else
+ {
+ Assert(list->head != INVALID_PGPROCNO);
+ Assert(list->head != procno);
+ Assert(list->tail != procno);
+ node->prev = list->tail;
+ proclist_node_get(node->prev, node_offset)->next = procno;
+ node->next = INVALID_PGPROCNO;
+ list->tail = procno;
+ }
+}
+
+/*
+ * Delete a process from a list --- it must be in the list!
+ */
+static inline void
+proclist_delete_offset(proclist_head *list, int procno, size_t node_offset)
+{
+ proclist_node *node = proclist_node_get(procno, node_offset);
+
+ Assert(node->next != 0 || node->prev != 0);
+
+ if (node->prev == INVALID_PGPROCNO)
+ {
+ Assert(list->head == procno);
+ list->head = node->next;
+ }
+ else
+ proclist_node_get(node->prev, node_offset)->next = node->next;
+
+ if (node->next == INVALID_PGPROCNO)
+ {
+ Assert(list->tail == procno);
+ list->tail = node->prev;
+ }
+ else
+ proclist_node_get(node->next, node_offset)->prev = node->prev;
+
+ node->next = node->prev = 0;
+}
+
+/*
+ * Check if a process is currently in a list. It must be known that the
+ * process is not in any _other_ proclist that uses the same proclist_node,
+ * so that the only possibilities are that it is in this list or none.
+ */
+static inline bool
+proclist_contains_offset(proclist_head *list, int procno,
+ size_t node_offset)
+{
+ proclist_node *node = proclist_node_get(procno, node_offset);
+
+ /* If it's not in any list, it's definitely not in this one. */
+ if (node->prev == 0 && node->next == 0)
+ return false;
+
+ /*
+ * It must, in fact, be in this list. Ideally, in assert-enabled builds,
+ * we'd verify that. But since this function is typically used while
+ * holding a spinlock, crawling the whole list is unacceptable. However,
+ * we can verify matters in O(1) time when the node is a list head or
+ * tail, and that seems worth doing, since in practice that should often
+ * be enough to catch mistakes.
+ */
+ Assert(node->prev != INVALID_PGPROCNO || list->head == procno);
+ Assert(node->next != INVALID_PGPROCNO || list->tail == procno);
+
+ return true;
+}
+
+/*
+ * Remove and return the first process from a list (there must be one).
+ */
+static inline PGPROC *
+proclist_pop_head_node_offset(proclist_head *list, size_t node_offset)
+{
+ PGPROC *proc;
+
+ Assert(!proclist_is_empty(list));
+ proc = GetPGProcByNumber(list->head);
+ proclist_delete_offset(list, list->head, node_offset);
+ return proc;
+}
+
+/*
+ * Helper macros to avoid repetition of offsetof(PGPROC, <member>).
+ * 'link_member' is the name of a proclist_node member in PGPROC.
+ */
+#define proclist_delete(list, procno, link_member) \
+ proclist_delete_offset((list), (procno), offsetof(PGPROC, link_member))
+#define proclist_push_head(list, procno, link_member) \
+ proclist_push_head_offset((list), (procno), offsetof(PGPROC, link_member))
+#define proclist_push_tail(list, procno, link_member) \
+ proclist_push_tail_offset((list), (procno), offsetof(PGPROC, link_member))
+#define proclist_pop_head_node(list, link_member) \
+ proclist_pop_head_node_offset((list), offsetof(PGPROC, link_member))
+#define proclist_contains(list, procno, link_member) \
+ proclist_contains_offset((list), (procno), offsetof(PGPROC, link_member))
+
+/*
+ * Iterate through the list pointed at by 'lhead', storing the current
+ * position in 'iter'. 'link_member' is the name of a proclist_node member in
+ * PGPROC. Access the current position with iter.cur.
+ *
+ * The only list modification allowed while iterating is deleting the current
+ * node with proclist_delete(list, iter.cur, node_offset).
+ */
+#define proclist_foreach_modify(iter, lhead, link_member) \
+ for (AssertVariableIsOfTypeMacro(iter, proclist_mutable_iter), \
+ AssertVariableIsOfTypeMacro(lhead, proclist_head *), \
+ (iter).cur = (lhead)->head, \
+ (iter).next = (iter).cur == INVALID_PGPROCNO ? INVALID_PGPROCNO : \
+ proclist_node_get((iter).cur, \
+ offsetof(PGPROC, link_member))->next; \
+ (iter).cur != INVALID_PGPROCNO; \
+ (iter).cur = (iter).next, \
+ (iter).next = (iter).cur == INVALID_PGPROCNO ? INVALID_PGPROCNO : \
+ proclist_node_get((iter).cur, \
+ offsetof(PGPROC, link_member))->next)
+
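+/*
+ * Usage sketch (illustrative only; "wakeup_list" is a hypothetical
+ * proclist_head and lwWaitLink is used as the example link member): walk a
+ * list, removing each element as it is visited.
+ *
+ *		proclist_mutable_iter iter;
+ *
+ *		proclist_foreach_modify(iter, &wakeup_list, lwWaitLink)
+ *		{
+ *			PGPROC	   *proc = GetPGProcByNumber(iter.cur);
+ *
+ *			proclist_delete(&wakeup_list, iter.cur, lwWaitLink);
+ *			SetLatch(&proc->procLatch);
+ *		}
+ */
+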
+#endif /* PROCLIST_H */
diff --git a/src/include/storage/proclist_types.h b/src/include/storage/proclist_types.h
new file mode 100644
index 0000000..5232679
--- /dev/null
+++ b/src/include/storage/proclist_types.h
@@ -0,0 +1,51 @@
+/*-------------------------------------------------------------------------
+ *
+ * proclist_types.h
+ * doubly-linked lists of pgprocnos
+ *
+ * See proclist.h for functions that operate on these types.
+ *
+ * Portions Copyright (c) 2016-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/include/storage/proclist_types.h
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef PROCLIST_TYPES_H
+#define PROCLIST_TYPES_H
+
+/*
+ * A node in a doubly-linked list of processes. The link fields contain
+ * the 0-based PGPROC indexes of the next and previous process, or
+ * INVALID_PGPROCNO in the next-link of the last node and the prev-link
+ * of the first node. A node that is currently not in any list
+ * should have next == prev == 0; this is not a possible state for a node
+ * that is in a list, because we disallow circularity.
+ */
+typedef struct proclist_node
+{
+ int next; /* pgprocno of the next PGPROC */
+ int prev; /* pgprocno of the prev PGPROC */
+} proclist_node;
+
+/*
+ * Header of a doubly-linked list of PGPROCs, identified by pgprocno.
+ * An empty list is represented by head == tail == INVALID_PGPROCNO.
+ */
+typedef struct proclist_head
+{
+ int head; /* pgprocno of the head PGPROC */
+ int tail; /* pgprocno of the tail PGPROC */
+} proclist_head;
+
+/*
+ * List iterator allowing some modifications while iterating.
+ */
+typedef struct proclist_mutable_iter
+{
+ int cur; /* pgprocno of the current PGPROC */
+ int next; /* pgprocno of the next PGPROC */
+} proclist_mutable_iter;
+
+#endif /* PROCLIST_TYPES_H */
diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h
new file mode 100644
index 0000000..ee63690
--- /dev/null
+++ b/src/include/storage/procsignal.h
@@ -0,0 +1,71 @@
+/*-------------------------------------------------------------------------
+ *
+ * procsignal.h
+ * Routines for interprocess signaling
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/procsignal.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PROCSIGNAL_H
+#define PROCSIGNAL_H
+
+#include "storage/backendid.h"
+
+
+/*
+ * Reasons for signaling a Postgres child process (a backend or an auxiliary
+ * process, like checkpointer). We can cope with concurrent signals for different
+ * reasons. However, if the same reason is signaled multiple times in quick
+ * succession, the process is likely to observe only one notification of it.
+ * This is okay for the present uses.
+ *
+ * Also, because of race conditions, it's important that all the signals be
+ * defined so that no harm is done if a process mistakenly receives one.
+ */
+typedef enum
+{
+ PROCSIG_CATCHUP_INTERRUPT, /* sinval catchup interrupt */
+ PROCSIG_NOTIFY_INTERRUPT, /* listen/notify interrupt */
+ PROCSIG_PARALLEL_MESSAGE, /* message from cooperating parallel backend */
+ PROCSIG_WALSND_INIT_STOPPING, /* ask walsenders to prepare for shutdown */
+ PROCSIG_BARRIER, /* global barrier interrupt */
+ PROCSIG_LOG_MEMORY_CONTEXT, /* ask backend to log the memory contexts */
+
+ /* Recovery conflict reasons */
+ PROCSIG_RECOVERY_CONFLICT_DATABASE,
+ PROCSIG_RECOVERY_CONFLICT_TABLESPACE,
+ PROCSIG_RECOVERY_CONFLICT_LOCK,
+ PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
+ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
+ PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
+
+ NUM_PROCSIGNALS /* Must be last! */
+} ProcSignalReason;
+
+typedef enum
+{
+ PROCSIGNAL_BARRIER_SMGRRELEASE /* ask smgr to close files */
+} ProcSignalBarrierType;
+
+/*
+ * prototypes for functions in procsignal.c
+ */
+extern Size ProcSignalShmemSize(void);
+extern void ProcSignalShmemInit(void);
+
+extern void ProcSignalInit(int pss_idx);
+extern int SendProcSignal(pid_t pid, ProcSignalReason reason,
+ BackendId backendId);
+
+extern uint64 EmitProcSignalBarrier(ProcSignalBarrierType type);
+extern void WaitForProcSignalBarrier(uint64 generation);
+extern void ProcessProcSignalBarrier(void);
+
+extern void procsignal_sigusr1_handler(SIGNAL_ARGS);
+
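+/*
+ * Usage sketch (illustrative only): emit a global barrier and wait until
+ * every live backend has absorbed it.
+ *
+ *		uint64		generation;
+ *
+ *		generation = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE);
+ *		WaitForProcSignalBarrier(generation);
+ */
+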
+#endif /* PROCSIGNAL_H */
diff --git a/src/include/storage/reinit.h b/src/include/storage/reinit.h
new file mode 100644
index 0000000..bf2c10d
--- /dev/null
+++ b/src/include/storage/reinit.h
@@ -0,0 +1,28 @@
+/*-------------------------------------------------------------------------
+ *
+ * reinit.h
+ * Reinitialization of unlogged relations
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/reinit.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef REINIT_H
+#define REINIT_H
+
+#include "common/relpath.h"
+
+
+extern void ResetUnloggedRelations(int op);
+extern bool parse_filename_for_nontemp_relation(const char *name,
+ int *oidchars, ForkNumber *fork);
+
+#define UNLOGGED_RELATION_CLEANUP 0x0001
+#define UNLOGGED_RELATION_INIT 0x0002
+
+#endif /* REINIT_H */
diff --git a/src/include/storage/relfilenode.h b/src/include/storage/relfilenode.h
new file mode 100644
index 0000000..4fdc606
--- /dev/null
+++ b/src/include/storage/relfilenode.h
@@ -0,0 +1,99 @@
+/*-------------------------------------------------------------------------
+ *
+ * relfilenode.h
+ * Physical access information for relations.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/relfilenode.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef RELFILENODE_H
+#define RELFILENODE_H
+
+#include "common/relpath.h"
+#include "storage/backendid.h"
+
+/*
+ * RelFileNode must provide all that we need to know to physically access
+ * a relation, with the exception of the backend ID, which can be provided
+ * separately. Note, however, that a "physical" relation is comprised of
+ * multiple files on the filesystem, as each fork is stored as a separate
+ * file, and each fork can be divided into multiple segments. See md.c.
+ *
+ * spcNode identifies the tablespace of the relation. It corresponds to
+ * pg_tablespace.oid.
+ *
+ * dbNode identifies the database of the relation. It is zero for
+ * "shared" relations (those common to all databases of a cluster).
+ * Nonzero dbNode values correspond to pg_database.oid.
+ *
+ * relNode identifies the specific relation. relNode corresponds to
+ * pg_class.relfilenode (NOT pg_class.oid, because we need to be able
+ * to assign new physical files to relations in some situations).
+ * Notice that relNode is only unique within a database in a particular
+ * tablespace.
+ *
+ * Note: spcNode must be GLOBALTABLESPACE_OID if and only if dbNode is
+ * zero. We support shared relations only in the "global" tablespace.
+ *
+ * Note: in pg_class we allow reltablespace == 0 to denote that the
+ * relation is stored in its database's "default" tablespace (as
+ * identified by pg_database.dattablespace). However this shorthand
+ * is NOT allowed in RelFileNode structs --- the real tablespace ID
+ * must be supplied when setting spcNode.
+ *
+ * Note: in pg_class, relfilenode can be zero to denote that the relation
+ * is a "mapped" relation, whose current true filenode number is available
+ * from relmapper.c. Again, this case is NOT allowed in RelFileNodes.
+ *
+ * Note: various places use RelFileNode in hashtable keys. Therefore,
+ * there *must not* be any unused padding bytes in this struct. That
+ * should be safe as long as all the fields are of type Oid.
+ */
+typedef struct RelFileNode
+{
+ Oid spcNode; /* tablespace */
+ Oid dbNode; /* database */
+ Oid relNode; /* relation */
+} RelFileNode;
+
+/*
+ * Augmenting a relfilenode with the backend ID provides all the information
+ * we need to locate the physical storage. The backend ID is InvalidBackendId
+ * for regular relations (those accessible to more than one backend), or the
+ * owning backend's ID for backend-local relations. Backend-local relations
+ * are always transient and removed in case of a database crash; they are
+ * never WAL-logged or fsync'd.
+ */
+typedef struct RelFileNodeBackend
+{
+ RelFileNode node;
+ BackendId backend;
+} RelFileNodeBackend;
+
+#define RelFileNodeBackendIsTemp(rnode) \
+ ((rnode).backend != InvalidBackendId)
+
+/*
+ * Note: RelFileNodeEquals and RelFileNodeBackendEquals compare relNode first
+ * since that is most likely to be different in two unequal RelFileNodes. It
+ * is probably redundant to compare spcNode if the other fields are found equal,
+ * but do it anyway to be sure. Likewise for checking the backend ID in
+ * RelFileNodeBackendEquals.
+ */
+#define RelFileNodeEquals(node1, node2) \
+ ((node1).relNode == (node2).relNode && \
+ (node1).dbNode == (node2).dbNode && \
+ (node1).spcNode == (node2).spcNode)
+
+#define RelFileNodeBackendEquals(node1, node2) \
+ ((node1).node.relNode == (node2).node.relNode && \
+ (node1).node.dbNode == (node2).node.dbNode && \
+ (node1).backend == (node2).backend && \
+ (node1).node.spcNode == (node2).node.spcNode)
+
+#endif /* RELFILENODE_H */
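A small sketch of how the pieces above combine; it assumes the usual backend environment (postgres.h supplies Oid, and BackendId arrives via the includes at the top of this header), and the helper names are illustrative only.

/* Do two RelFileNodes name the same physical relation? */
static bool
same_physical_relation(RelFileNode a, RelFileNode b)
{
	/* the macro checks relNode first, since that differs most often */
	return RelFileNodeEquals(a, b);
}

/* Is this storage backend-local (and therefore transient, never WAL-logged)? */
static bool
storage_is_temp(RelFileNodeBackend rnode)
{
	return RelFileNodeBackendIsTemp(rnode);
}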
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
new file mode 100644
index 0000000..4d3ffc7
--- /dev/null
+++ b/src/include/storage/s_lock.h
@@ -0,0 +1,1110 @@
+/*-------------------------------------------------------------------------
+ *
+ * s_lock.h
+ * Hardware-dependent implementation of spinlocks.
+ *
+ * NOTE: none of the macros in this file are intended to be called directly.
+ * Call them through the hardware-independent macros in spin.h.
+ *
+ * The following hardware-dependent macros must be provided for each
+ * supported platform:
+ *
+ * void S_INIT_LOCK(slock_t *lock)
+ * Initialize a spinlock (to the unlocked state).
+ *
+ * int S_LOCK(slock_t *lock)
+ * Acquire a spinlock, waiting if necessary.
+ * Time out and abort() if unable to acquire the lock in a
+ * "reasonable" amount of time --- typically ~ 1 minute.
+ * Should return number of "delays"; see s_lock.c
+ *
+ * void S_UNLOCK(slock_t *lock)
+ * Unlock a previously acquired lock.
+ *
+ * bool S_LOCK_FREE(slock_t *lock)
+ * Tests if the lock is free. Returns true if free, false if locked.
+ * This does *not* change the state of the lock.
+ *
+ * void SPIN_DELAY(void)
+ * Delay operation to occur inside spinlock wait loop.
+ *
+ * Note to implementors: there are default implementations for all these
+ * macros at the bottom of the file. Check if your platform can use
+ * these or needs to override them.
+ *
+ * Usually, S_LOCK() is implemented in terms of even lower-level macros
+ * TAS() and TAS_SPIN():
+ *
+ * int TAS(slock_t *lock)
+ * Atomic test-and-set instruction. Attempt to acquire the lock,
+ * but do *not* wait. Returns 0 if successful, nonzero if unable
+ * to acquire the lock.
+ *
+ * int TAS_SPIN(slock_t *lock)
+ * Like TAS(), but this version is used when waiting for a lock
+ * previously found to be contended. By default, this is the
+ * same as TAS(), but on some architectures it's better to poll a
+ * contended lock using an unlocked instruction and retry the
+ * atomic test-and-set only when it appears free.
+ *
+ * TAS() and TAS_SPIN() are NOT part of the API, and should never be called
+ * directly.
+ *
+ * CAUTION: on some platforms TAS() and/or TAS_SPIN() may sometimes report
+ * failure to acquire a lock even when the lock is not locked. For example,
+ * on Alpha TAS() will "fail" if interrupted. Therefore a retry loop must
+ * always be used, even if you are certain the lock is free.
+ *
+ * It is the responsibility of these macros to make sure that the compiler
+ * does not re-order accesses to shared memory to precede the actual lock
+ * acquisition, or follow the lock release. Prior to PostgreSQL 9.5, this
+ * was the caller's responsibility, which meant that callers had to use
+ * volatile-qualified pointers to refer to both the spinlock itself and the
+ * shared data being accessed within the spinlocked critical section. This
+ * was notationally awkward, easy to forget (and thus error-prone), and
+ * prevented some useful compiler optimizations. For these reasons, we
+ * now require that the macros themselves prevent compiler re-ordering,
+ * so that the caller doesn't need to take special precautions.
+ *
+ * On platforms with weak memory ordering, the TAS(), TAS_SPIN(), and
+ * S_UNLOCK() macros must further include hardware-level memory fence
+ * instructions to prevent similar re-ordering at the hardware level.
+ * TAS() and TAS_SPIN() must guarantee that loads and stores issued after
+ * the macro are not executed until the lock has been obtained. Conversely,
+ * S_UNLOCK() must guarantee that loads and stores issued before the macro
+ * have been executed before the lock is released.
+ *
+ * On most supported platforms, TAS() uses a tas() function written
+ * in assembly language to execute a hardware atomic-test-and-set
+ * instruction. Equivalent OS-supplied mutex routines could be used too.
+ *
+ * If no system-specific TAS() is available (ie, HAVE_SPINLOCKS is not
+ * defined), then we fall back on an emulation that uses SysV semaphores
+ * (see spin.c). This emulation will be MUCH MUCH slower than a proper TAS()
+ * implementation, because of the cost of a kernel call per lock or unlock.
+ * An old report is that Postgres spends around 40% of its time in semop(2)
+ * when using the SysV semaphore code.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/s_lock.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef S_LOCK_H
+#define S_LOCK_H
+
+#ifdef FRONTEND
+#error "s_lock.h may not be included from frontend code"
+#endif
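Before the platform-specific sections, a rough, hypothetical sketch of the shape the contract above implies may help. It assumes the slock_t, TAS(), TAS_SPIN(), and SPIN_DELAY() definitions that appear later in this file, and it omits the timeout and backoff that the real out-of-line loop in s_lock.c adds. The point is the unconditional retry loop: as noted above, TAS() may report failure even when the lock is free, so a single attempt is never enough.

/* Hypothetical illustration only -- not part of the header. */
static int
my_spinlock_acquire(volatile slock_t *lock)
{
	int			delays = 0;

	if (TAS(lock) == 0)
		return 0;				/* fast path: got the lock on the first try */

	while (TAS_SPIN(lock))		/* must loop: TAS may fail spuriously */
	{
		SPIN_DELAY();			/* e.g. PAUSE/ISB, easing the busy wait */
		delays++;				/* the real code also sleeps and times out */
	}
	return delays;
}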
+
+#ifdef HAVE_SPINLOCKS /* skip spinlocks if requested */
+
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+/*************************************************************************
+ * All the gcc inlines
+ * Gcc consistently defines the CPU as __cpu__.
+ * Other compilers use __cpu or __cpu__ so we test for both in those cases.
+ */
+
+/*----------
+ * Standard gcc asm format (assuming "volatile slock_t *lock"):
+
+ __asm__ __volatile__(
+ " instruction \n"
+ " instruction \n"
+ " instruction \n"
+: "=r"(_res), "+m"(*lock) // return register, in/out lock value
+: "r"(lock) // lock pointer, in input register
+: "memory", "cc"); // show clobbered registers here
+
+ * The output-operands list (after first colon) should always include
+ * "+m"(*lock), whether or not the asm code actually refers to this
+ * operand directly. This ensures that gcc believes the value in the
+ * lock variable is used and set by the asm code. Also, the clobbers
+ * list (after third colon) should always include "memory"; this prevents
+ * gcc from thinking it can cache the values of shared-memory fields
+ * across the asm code. Add "cc" if your asm code changes the condition
+ * code register, and also list any temp registers the code uses.
+ *----------
+ */
+
+
+#ifdef __i386__ /* 32-bit i386 */
+#define HAS_TEST_AND_SET
+
+typedef unsigned char slock_t;
+
+#define TAS(lock) tas(lock)
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ register slock_t _res = 1;
+
+ /*
+ * Use a non-locking test before asserting the bus lock. Note that the
+ * extra test appears to be a small loss on some x86 platforms and a small
+ * win on others; it's by no means clear that we should keep it.
+ *
+ * When this was last tested, we didn't have separate TAS() and TAS_SPIN()
+ * macros. Nowadays it probably would be better to do a non-locking test
+ * in TAS_SPIN() but not in TAS(), like on x86_64, but no-one's done the
+ * testing to verify that. Without some empirical evidence, better to
+ * leave it alone.
+ */
+ __asm__ __volatile__(
+ " cmpb $0,%1 \n"
+ " jne 1f \n"
+ " lock \n"
+ " xchgb %0,%1 \n"
+ "1: \n"
+: "+q"(_res), "+m"(*lock)
+: /* no inputs */
+: "memory", "cc");
+ return (int) _res;
+}
+
+#define SPIN_DELAY() spin_delay()
+
+static __inline__ void
+spin_delay(void)
+{
+ /*
+ * This sequence is equivalent to the PAUSE instruction ("rep" is
+ * ignored by old IA32 processors if the following instruction is
+ * not a string operation); the IA-32 Architecture Software
+ * Developer's Manual, Vol. 3, Section 7.7.2 describes why using
+ * PAUSE in the inner loop of a spin lock is necessary for good
+ * performance:
+ *
+ * The PAUSE instruction improves the performance of IA-32
+ * processors supporting Hyper-Threading Technology when
+ * executing spin-wait loops and other routines where one
+ * thread is accessing a shared lock or semaphore in a tight
+ * polling loop. When executing a spin-wait loop, the
+ * processor can suffer a severe performance penalty when
+ * exiting the loop because it detects a possible memory order
+ * violation and flushes the core processor's pipeline. The
+ * PAUSE instruction provides a hint to the processor that the
+ * code sequence is a spin-wait loop. The processor uses this
+ * hint to avoid the memory order violation and prevent the
+ * pipeline flush. In addition, the PAUSE instruction
+ * de-pipelines the spin-wait loop to prevent it from
+ * consuming execution resources excessively.
+ */
+ __asm__ __volatile__(
+ " rep; nop \n");
+}
+
+#endif /* __i386__ */
+
+
+#ifdef __x86_64__ /* AMD Opteron, Intel EM64T */
+#define HAS_TEST_AND_SET
+
+typedef unsigned char slock_t;
+
+#define TAS(lock) tas(lock)
+
+/*
+ * On Intel EM64T, it's a win to use a non-locking test before the xchg proper,
+ * but only when spinning.
+ *
+ * See also Implementing Scalable Atomic Locks for Multi-Core Intel(tm) EM64T
+ * and IA32, by Michael Chynoweth and Mary R. Lee. As of this writing, it is
+ * available at:
+ * http://software.intel.com/en-us/articles/implementing-scalable-atomic-locks-for-multi-core-intel-em64t-and-ia32-architectures
+ */
+#define TAS_SPIN(lock) (*(lock) ? 1 : TAS(lock))
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ register slock_t _res = 1;
+
+ __asm__ __volatile__(
+ " lock \n"
+ " xchgb %0,%1 \n"
+: "+q"(_res), "+m"(*lock)
+: /* no inputs */
+: "memory", "cc");
+ return (int) _res;
+}
+
+#define SPIN_DELAY() spin_delay()
+
+static __inline__ void
+spin_delay(void)
+{
+ /*
+ * Adding a PAUSE in the spin delay loop is demonstrably a no-op on
+ * Opteron, but it may be of some use on EM64T, so we keep it.
+ */
+ __asm__ __volatile__(
+ " rep; nop \n");
+}
+
+#endif /* __x86_64__ */
+
+
+#if defined(__ia64__) || defined(__ia64)
+/*
+ * Intel Itanium, gcc or Intel's compiler.
+ *
+ * Itanium has weak memory ordering, but we rely on the compiler to enforce
+ * strict ordering of accesses to volatile data. In particular, while the
+ * xchg instruction implicitly acts as a memory barrier with 'acquire'
+ * semantics, we do not have an explicit memory fence instruction in the
+ * S_UNLOCK macro. We use a regular assignment to clear the spinlock, and
+ * trust that the compiler marks the generated store instruction with the
+ * ".rel" opcode.
+ *
+ * Testing shows that assumption to hold on gcc, although I could not find
+ * any explicit statement on that in the gcc manual. In Intel's compiler,
+ * the -m[no-]serialize-volatile option controls that, and testing shows that
+ * it is enabled by default.
+ *
+ * While icc accepts gcc asm blocks on x86[_64], this is not true on ia64
+ * (at least not in icc versions before 12.x). So we have to carry a separate
+ * compiler-intrinsic-based implementation for it.
+ */
+#define HAS_TEST_AND_SET
+
+typedef unsigned int slock_t;
+
+#define TAS(lock) tas(lock)
+
+/* On IA64, it's a win to use a non-locking test before the xchg proper */
+#define TAS_SPIN(lock) (*(lock) ? 1 : TAS(lock))
+
+#ifndef __INTEL_COMPILER
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ long int ret;
+
+ __asm__ __volatile__(
+ " xchg4 %0=%1,%2 \n"
+: "=r"(ret), "+m"(*lock)
+: "r"(1)
+: "memory");
+ return (int) ret;
+}
+
+#else /* __INTEL_COMPILER */
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ int ret;
+
+ ret = _InterlockedExchange(lock,1); /* this is a xchg asm macro */
+
+ return ret;
+}
+
+/* icc can't use the regular gcc S_UNLOCK() macro either in this case */
+#define S_UNLOCK(lock) \
+ do { __memory_barrier(); *(lock) = 0; } while (0)
+
+#endif /* __INTEL_COMPILER */
+#endif /* __ia64__ || __ia64 */
+
+
+/*
+ * On ARM and ARM64, we use __sync_lock_test_and_set(int *, int) if available.
+ *
+ * We use the int-width variant of the builtin because it works on more chips
+ * than other widths.
+ */
+#if defined(__arm__) || defined(__arm) || defined(__aarch64__) || defined(__aarch64)
+#ifdef HAVE_GCC__SYNC_INT32_TAS
+#define HAS_TEST_AND_SET
+
+#define TAS(lock) tas(lock)
+
+typedef int slock_t;
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ return __sync_lock_test_and_set(lock, 1);
+}
+
+#define S_UNLOCK(lock) __sync_lock_release(lock)
+
+/*
+ * Using an ISB instruction to delay in spinlock loops appears beneficial on
+ * high-core-count ARM64 processors. It seems mostly a wash for smaller gear,
+ * and ISB doesn't exist at all on pre-v7 ARM chips.
+ */
+#if defined(__aarch64__) || defined(__aarch64)
+
+#define SPIN_DELAY() spin_delay()
+
+static __inline__ void
+spin_delay(void)
+{
+ __asm__ __volatile__(
+ " isb; \n");
+}
+
+#endif /* __aarch64__ || __aarch64 */
+#endif /* HAVE_GCC__SYNC_INT32_TAS */
+#endif /* __arm__ || __arm || __aarch64__ || __aarch64 */
+
+
+/* S/390 and S/390x Linux (32- and 64-bit zSeries) */
+#if defined(__s390__) || defined(__s390x__)
+#define HAS_TEST_AND_SET
+
+typedef unsigned int slock_t;
+
+#define TAS(lock) tas(lock)
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ int _res = 0;
+
+ __asm__ __volatile__(
+ " cs %0,%3,0(%2) \n"
+: "+d"(_res), "+m"(*lock)
+: "a"(lock), "d"(1)
+: "memory", "cc");
+ return _res;
+}
+
+#endif /* __s390__ || __s390x__ */
+
+
+#if defined(__sparc__) /* Sparc */
+/*
+ * Solaris has always run sparc processors in TSO (total store) mode, but
+ * linux didn't use to and the *BSDs still don't. So, be careful about
+ * acquire/release semantics. The CPU will treat superfluous membars as
+ * NOPs, so it's just code space.
+ */
+#define HAS_TEST_AND_SET
+
+typedef unsigned char slock_t;
+
+#define TAS(lock) tas(lock)
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ register slock_t _res;
+
+ /*
+ * See comment in src/backend/port/tas/sunstudio_sparc.s for why this
+ * uses "ldstub", and that file uses "cas". gcc currently generates
+ * sparcv7-targeted binaries, so "cas" use isn't possible.
+ */
+ __asm__ __volatile__(
+ " ldstub [%2], %0 \n"
+: "=r"(_res), "+m"(*lock)
+: "r"(lock)
+: "memory");
+#if defined(__sparcv7) || defined(__sparc_v7__)
+ /*
+ * No stbar or membar available, luckily no actually produced hardware
+ * requires a barrier.
+ */
+#elif defined(__sparcv8) || defined(__sparc_v8__)
+ /* stbar is available (and required for both PSO, RMO), membar isn't */
+ __asm__ __volatile__ ("stbar \n":::"memory");
+#else
+ /*
+ * #LoadStore (RMO) | #LoadLoad (RMO) together are the appropriate acquire
+ * barrier for sparcv8+ upwards.
+ */
+ __asm__ __volatile__ ("membar #LoadStore | #LoadLoad \n":::"memory");
+#endif
+ return (int) _res;
+}
+
+#if defined(__sparcv7) || defined(__sparc_v7__)
+/*
+ * No stbar or membar available, luckily no actually produced hardware
+ * requires a barrier. We fall through to the default gcc definition of
+ * S_UNLOCK in this case.
+ */
+#elif defined(__sparcv8) || defined(__sparc_v8__)
+/* stbar is available (and required for both PSO, RMO), membar isn't */
+#define S_UNLOCK(lock) \
+do \
+{ \
+ __asm__ __volatile__ ("stbar \n":::"memory"); \
+ *((volatile slock_t *) (lock)) = 0; \
+} while (0)
+#else
+/*
+ * #LoadStore (RMO) | #StoreStore (RMO, PSO) together are the appropriate
+ * release barrier for sparcv8+ upwards.
+ */
+#define S_UNLOCK(lock) \
+do \
+{ \
+ __asm__ __volatile__ ("membar #LoadStore | #StoreStore \n":::"memory"); \
+ *((volatile slock_t *) (lock)) = 0; \
+} while (0)
+#endif
+
+#endif /* __sparc__ */
+
+
+/* PowerPC */
+#if defined(__ppc__) || defined(__powerpc__) || defined(__ppc64__) || defined(__powerpc64__)
+#define HAS_TEST_AND_SET
+
+typedef unsigned int slock_t;
+
+#define TAS(lock) tas(lock)
+
+/* On PPC, it's a win to use a non-locking test before the lwarx */
+#define TAS_SPIN(lock) (*(lock) ? 1 : TAS(lock))
+
+/*
+ * The second operand of addi can hold a constant zero or a register number,
+ * hence constraint "=&b" to avoid allocating r0. "b" stands for "address
+ * base register"; most operands having this register-or-zero property are
+ * address bases, e.g. the second operand of lwax.
+ *
+ * NOTE: per the Enhanced PowerPC Architecture manual, v1.0 dated 7-May-2002,
+ * an isync is a sufficient synchronization barrier after a lwarx/stwcx loop.
+ * On newer machines, we can use lwsync instead for better performance.
+ *
+ * Ordinarily, we'd code the branches here using GNU-style local symbols, that
+ * is "1f" referencing "1:" and so on. But some people run gcc on AIX with
+ * IBM's assembler as backend, and IBM's assembler doesn't do local symbols.
+ * So hand-code the branch offsets; fortunately, all PPC instructions are
+ * exactly 4 bytes each, so it's not too hard to count.
+ */
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ slock_t _t;
+ int _res;
+
+ __asm__ __volatile__(
+#ifdef USE_PPC_LWARX_MUTEX_HINT
+" lwarx %0,0,%3,1 \n"
+#else
+" lwarx %0,0,%3 \n"
+#endif
+" cmpwi %0,0 \n"
+" bne $+16 \n" /* branch to li %1,1 */
+" addi %0,%0,1 \n"
+" stwcx. %0,0,%3 \n"
+" beq $+12 \n" /* branch to lwsync/isync */
+" li %1,1 \n"
+" b $+12 \n" /* branch to end of asm sequence */
+#ifdef USE_PPC_LWSYNC
+" lwsync \n"
+#else
+" isync \n"
+#endif
+" li %1,0 \n"
+
+: "=&b"(_t), "=r"(_res), "+m"(*lock)
+: "r"(lock)
+: "memory", "cc");
+ return _res;
+}
+
+/*
+ * PowerPC S_UNLOCK is almost standard but requires a "sync" instruction.
+ * On newer machines, we can use lwsync instead for better performance.
+ */
+#ifdef USE_PPC_LWSYNC
+#define S_UNLOCK(lock) \
+do \
+{ \
+ __asm__ __volatile__ (" lwsync \n" ::: "memory"); \
+ *((volatile slock_t *) (lock)) = 0; \
+} while (0)
+#else
+#define S_UNLOCK(lock) \
+do \
+{ \
+ __asm__ __volatile__ (" sync \n" ::: "memory"); \
+ *((volatile slock_t *) (lock)) = 0; \
+} while (0)
+#endif /* USE_PPC_LWSYNC */
+
+#endif /* powerpc */
+
+
+/* Linux Motorola 68k */
+#if (defined(__mc68000__) || defined(__m68k__)) && defined(__linux__)
+#define HAS_TEST_AND_SET
+
+typedef unsigned char slock_t;
+
+#define TAS(lock) tas(lock)
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ register int rv;
+
+ __asm__ __volatile__(
+ " clrl %0 \n"
+ " tas %1 \n"
+ " sne %0 \n"
+: "=d"(rv), "+m"(*lock)
+: /* no inputs */
+: "memory", "cc");
+ return rv;
+}
+
+#endif /* (__mc68000__ || __m68k__) && __linux__ */
+
+
+/* Motorola 88k */
+#if defined(__m88k__)
+#define HAS_TEST_AND_SET
+
+typedef unsigned int slock_t;
+
+#define TAS(lock) tas(lock)
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ register slock_t _res = 1;
+
+ __asm__ __volatile__(
+ " xmem %0, %2, %%r0 \n"
+: "+r"(_res), "+m"(*lock)
+: "r"(lock)
+: "memory");
+ return (int) _res;
+}
+
+#endif /* __m88k__ */
+
+
+/*
+ * VAXen -- even multiprocessor ones
+ * (thanks to Tom Ivar Helbekkmo)
+ */
+#if defined(__vax__)
+#define HAS_TEST_AND_SET
+
+typedef unsigned char slock_t;
+
+#define TAS(lock) tas(lock)
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ register int _res;
+
+ __asm__ __volatile__(
+ " movl $1, %0 \n"
+ " bbssi $0, (%2), 1f \n"
+ " clrl %0 \n"
+ "1: \n"
+: "=&r"(_res), "+m"(*lock)
+: "r"(lock)
+: "memory");
+ return _res;
+}
+
+#endif /* __vax__ */
+
+
+#if defined(__mips__) && !defined(__sgi) /* non-SGI MIPS */
+#define HAS_TEST_AND_SET
+
+typedef unsigned int slock_t;
+
+#define TAS(lock) tas(lock)
+
+/*
+ * Original MIPS-I processors lacked the LL/SC instructions, but if we are
+ * so unfortunate as to be running on one of those, we expect that the kernel
+ * will handle the illegal-instruction traps and emulate them for us. On
+ * anything newer (and really, MIPS-I is extinct) LL/SC is the only sane
+ * choice because any other synchronization method must involve a kernel
+ * call. Unfortunately, many toolchains still default to MIPS-I as the
+ * codegen target; if the symbol __mips shows that that's the case, we
+ * have to force the assembler to accept LL/SC.
+ *
+ * R10000 and up processors require a separate SYNC, which has the same
+ * issues as LL/SC.
+ */
+#if __mips < 2
+#define MIPS_SET_MIPS2 " .set mips2 \n"
+#else
+#define MIPS_SET_MIPS2
+#endif
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ register volatile slock_t *_l = lock;
+ register int _res;
+ register int _tmp;
+
+ __asm__ __volatile__(
+ " .set push \n"
+ MIPS_SET_MIPS2
+ " .set noreorder \n"
+ " .set nomacro \n"
+ " ll %0, %2 \n"
+ " or %1, %0, 1 \n"
+ " sc %1, %2 \n"
+ " xori %1, 1 \n"
+ " or %0, %0, %1 \n"
+ " sync \n"
+ " .set pop "
+: "=&r" (_res), "=&r" (_tmp), "+R" (*_l)
+: /* no inputs */
+: "memory");
+ return _res;
+}
+
+/* MIPS S_UNLOCK is almost standard but requires a "sync" instruction */
+#define S_UNLOCK(lock) \
+do \
+{ \
+ __asm__ __volatile__( \
+ " .set push \n" \
+ MIPS_SET_MIPS2 \
+ " .set noreorder \n" \
+ " .set nomacro \n" \
+ " sync \n" \
+ " .set pop " \
+: /* no outputs */ \
+: /* no inputs */ \
+: "memory"); \
+ *((volatile slock_t *) (lock)) = 0; \
+} while (0)
+
+#endif /* __mips__ && !__sgi */
+
+
+#if defined(__m32r__) && defined(HAVE_SYS_TAS_H) /* Renesas' M32R */
+#define HAS_TEST_AND_SET
+
+#include <sys/tas.h>
+
+typedef int slock_t;
+
+#define TAS(lock) tas(lock)
+
+#endif /* __m32r__ */
+
+
+#if defined(__sh__) /* Renesas' SuperH */
+#define HAS_TEST_AND_SET
+
+typedef unsigned char slock_t;
+
+#define TAS(lock) tas(lock)
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ register int _res;
+
+ /*
+ * This asm is coded as if %0 could be any register, but actually SuperH
+ * restricts the target of xor-immediate to be R0. That's handled by
+ * the "z" constraint on _res.
+ */
+ __asm__ __volatile__(
+ " tas.b @%2 \n"
+ " movt %0 \n"
+ " xor #1,%0 \n"
+: "=z"(_res), "+m"(*lock)
+: "r"(lock)
+: "memory", "t");
+ return _res;
+}
+
+#endif /* __sh__ */
+
+
+/* These live in s_lock.c, but only for gcc */
+
+
+#if defined(__m68k__) && !defined(__linux__) /* non-Linux Motorola 68k */
+#define HAS_TEST_AND_SET
+
+typedef unsigned char slock_t;
+#endif
+
+
+/*
+ * If we have no platform-specific knowledge, but we found that the compiler
+ * provides __sync_lock_test_and_set(), use that. Prefer the int-width
+ * version over the char-width version if we have both, on the rather dubious
+ * grounds that that's known to be more likely to work in the ARM ecosystem.
+ * (But we dealt with ARM above.)
+ */
+#if !defined(HAS_TEST_AND_SET)
+
+#if defined(HAVE_GCC__SYNC_INT32_TAS)
+#define HAS_TEST_AND_SET
+
+#define TAS(lock) tas(lock)
+
+typedef int slock_t;
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ return __sync_lock_test_and_set(lock, 1);
+}
+
+#define S_UNLOCK(lock) __sync_lock_release(lock)
+
+#elif defined(HAVE_GCC__SYNC_CHAR_TAS)
+#define HAS_TEST_AND_SET
+
+#define TAS(lock) tas(lock)
+
+typedef char slock_t;
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ return __sync_lock_test_and_set(lock, 1);
+}
+
+#define S_UNLOCK(lock) __sync_lock_release(lock)
+
+#endif /* HAVE_GCC__SYNC_INT32_TAS */
+
+#endif /* !defined(HAS_TEST_AND_SET) */
+
+
+/*
+ * Default implementation of S_UNLOCK() for gcc/icc.
+ *
+ * Note that this implementation is unsafe for any platform that can reorder
+ * a memory access (either load or store) after a following store. That
+ * happens not to be possible on x86 and most legacy architectures (some are
+ * single-processor!), but many modern systems have weaker memory ordering.
+ * Those that do must define their own version of S_UNLOCK() rather than
+ * relying on this one.
+ */
+#if !defined(S_UNLOCK)
+#define S_UNLOCK(lock) \
+ do { __asm__ __volatile__("" : : : "memory"); *(lock) = 0; } while (0)
+#endif
+
+#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
+
+
+
+/*
+ * ---------------------------------------------------------------------
+ * Platforms that use non-gcc inline assembly:
+ * ---------------------------------------------------------------------
+ */
+
+#if !defined(HAS_TEST_AND_SET) /* We didn't trigger above, let's try here */
+
+
+#if defined(__hppa) || defined(__hppa__) /* HP PA-RISC, GCC and HP compilers */
+/*
+ * HP's PA-RISC
+ *
+ * See src/backend/port/hpux/tas.c.template for details about LDCWX. Because
+ * LDCWX requires a 16-byte-aligned address, we declare slock_t as a 16-byte
+ * struct. The active word in the struct is whichever has the aligned address;
+ * the other three words just sit at -1.
+ *
+ * When using gcc, we can inline the required assembly code.
+ */
+#define HAS_TEST_AND_SET
+
+typedef struct
+{
+ int sema[4];
+} slock_t;
+
+#define TAS_ACTIVE_WORD(lock) ((volatile int *) (((uintptr_t) (lock) + 15) & ~15))
+
+#if defined(__GNUC__)
+
+static __inline__ int
+tas(volatile slock_t *lock)
+{
+ volatile int *lockword = TAS_ACTIVE_WORD(lock);
+ register int lockval;
+
+ __asm__ __volatile__(
+ " ldcwx 0(0,%2),%0 \n"
+: "=r"(lockval), "+m"(*lockword)
+: "r"(lockword)
+: "memory");
+ return (lockval == 0);
+}
+
+/*
+ * The hppa implementation doesn't follow the rules of this file and provides
+ * a gcc-specific implementation outside of the defined(__GNUC__) section
+ * above. It does so to avoid duplication between the HP compiler and gcc.
+ * So undefine the generic fallback S_UNLOCK from above.
+ */
+#ifdef S_UNLOCK
+#undef S_UNLOCK
+#endif
+#define S_UNLOCK(lock) \
+ do { \
+ __asm__ __volatile__("" : : : "memory"); \
+ *TAS_ACTIVE_WORD(lock) = -1; \
+ } while (0)
+
+#endif /* __GNUC__ */
+
+#define S_INIT_LOCK(lock) \
+ do { \
+ volatile slock_t *lock_ = (lock); \
+ lock_->sema[0] = -1; \
+ lock_->sema[1] = -1; \
+ lock_->sema[2] = -1; \
+ lock_->sema[3] = -1; \
+ } while (0)
+
+#define S_LOCK_FREE(lock) (*TAS_ACTIVE_WORD(lock) != 0)
+
+#endif /* __hppa || __hppa__ */
+
+
+#if defined(__hpux) && defined(__ia64) && !defined(__GNUC__)
+/*
+ * HP-UX on Itanium, non-gcc/icc compiler
+ *
+ * We assume that the compiler enforces strict ordering of loads/stores on
+ * volatile data (see comments on the gcc-version earlier in this file).
+ * Note that this assumption does *not* hold if you use the
+ * +Ovolatile=__unordered option on the HP-UX compiler, so don't do that.
+ *
+ * See also Implementing Spinlocks on the Intel Itanium Architecture and
+ * PA-RISC, by Tor Ekqvist and David Graves, for more information. As of
+ * this writing, version 1.0 of the manual is available at:
+ * http://h21007.www2.hp.com/portal/download/files/unprot/itanium/spinlocks.pdf
+ */
+#define HAS_TEST_AND_SET
+
+typedef unsigned int slock_t;
+
+#include <ia64/sys/inline.h>
+#define TAS(lock) _Asm_xchg(_SZ_W, lock, 1, _LDHINT_NONE)
+/* On IA64, it's a win to use a non-locking test before the xchg proper */
+#define TAS_SPIN(lock) (*(lock) ? 1 : TAS(lock))
+#define S_UNLOCK(lock) \
+ do { _Asm_mf(); (*(lock)) = 0; } while (0)
+
+#endif /* HPUX on IA64, non gcc/icc */
+
+#if defined(_AIX) /* AIX */
+/*
+ * AIX (POWER)
+ */
+#define HAS_TEST_AND_SET
+
+#include <sys/atomic_op.h>
+
+typedef int slock_t;
+
+#define TAS(lock) _check_lock((slock_t *) (lock), 0, 1)
+#define S_UNLOCK(lock) _clear_lock((slock_t *) (lock), 0)
+#endif /* _AIX */
+
+
+/* These are in sunstudio_(sparc|x86).s */
+
+#if defined(__SUNPRO_C) && (defined(__i386) || defined(__x86_64__) || defined(__sparc__) || defined(__sparc))
+#define HAS_TEST_AND_SET
+
+#if defined(__i386) || defined(__x86_64__) || defined(__sparcv9) || defined(__sparcv8plus)
+typedef unsigned int slock_t;
+#else
+typedef unsigned char slock_t;
+#endif
+
+extern slock_t pg_atomic_cas(volatile slock_t *lock, slock_t with,
+ slock_t cmp);
+
+#define TAS(a) (pg_atomic_cas((a), 1, 0) != 0)
+#endif
+
+
+#ifdef _MSC_VER
+typedef LONG slock_t;
+
+#define HAS_TEST_AND_SET
+#define TAS(lock) (InterlockedCompareExchange(lock, 1, 0))
+
+#define SPIN_DELAY() spin_delay()
+
+/* If using Visual C++ on Win64, inline assembly is unavailable.
+ * Use a _mm_pause intrinsic instead of rep nop.
+ */
+#if defined(_WIN64)
+static __forceinline void
+spin_delay(void)
+{
+ _mm_pause();
+}
+#else
+static __forceinline void
+spin_delay(void)
+{
+ /* See comment for gcc code. Same code, MASM syntax */
+ __asm rep nop;
+}
+#endif
+
+#include <intrin.h>
+#pragma intrinsic(_ReadWriteBarrier)
+
+#define S_UNLOCK(lock) \
+ do { _ReadWriteBarrier(); (*(lock)) = 0; } while (0)
+
+#endif
+
+
+#endif /* !defined(HAS_TEST_AND_SET) */
+
+
+/* Blow up if we didn't have any way to do spinlocks */
+#ifndef HAS_TEST_AND_SET
+#error PostgreSQL does not have native spinlock support on this platform. To continue the compilation, rerun configure using --disable-spinlocks. However, performance will be poor. Please report this to pgsql-bugs@lists.postgresql.org.
+#endif
+
+
+#else /* !HAVE_SPINLOCKS */
+
+
+/*
+ * Fake spinlock implementation using semaphores --- slow and prone
+ * to fall foul of kernel limits on number of semaphores, so don't use this
+ * unless you must! The subroutines appear in spin.c.
+ */
+typedef int slock_t;
+
+extern bool s_lock_free_sema(volatile slock_t *lock);
+extern void s_unlock_sema(volatile slock_t *lock);
+extern void s_init_lock_sema(volatile slock_t *lock, bool nested);
+extern int tas_sema(volatile slock_t *lock);
+
+#define S_LOCK_FREE(lock) s_lock_free_sema(lock)
+#define S_UNLOCK(lock) s_unlock_sema(lock)
+#define S_INIT_LOCK(lock) s_init_lock_sema(lock, false)
+#define TAS(lock) tas_sema(lock)
+
+
+#endif /* HAVE_SPINLOCKS */
+
+
+/*
+ * Default Definitions - override these above as needed.
+ */
+
+#if !defined(S_LOCK)
+#define S_LOCK(lock) \
+ (TAS(lock) ? s_lock((lock), __FILE__, __LINE__, PG_FUNCNAME_MACRO) : 0)
+#endif /* S_LOCK */
+
+#if !defined(S_LOCK_FREE)
+#define S_LOCK_FREE(lock) (*(lock) == 0)
+#endif /* S_LOCK_FREE */
+
+#if !defined(S_UNLOCK)
+/*
+ * Our default implementation of S_UNLOCK is essentially *(lock) = 0. This
+ * is unsafe if the platform can reorder a memory access (either load or
+ * store) after a following store; platforms where this is possible must
+ * define their own S_UNLOCK. But CPU reordering is not the only concern:
+ * if we simply defined S_UNLOCK() as an inline macro, the compiler might
+ * reorder instructions from inside the critical section to occur after the
+ * lock release. Since the compiler probably can't know what the external
+ * function s_unlock is doing, putting the same logic there should be adequate.
+ * A sufficiently-smart globally optimizing compiler could break that
+ * assumption, though, and the cost of a function call for every spinlock
+ * release may hurt performance significantly, so we use this implementation
+ * only for platforms where we don't know of a suitable intrinsic. For the
+ * most part, those are relatively obscure platform/compiler combinations to
+ * which the PostgreSQL project does not have access.
+ */
+#define USE_DEFAULT_S_UNLOCK
+extern void s_unlock(volatile slock_t *lock);
+#define S_UNLOCK(lock) s_unlock(lock)
+#endif /* S_UNLOCK */
+
+#if !defined(S_INIT_LOCK)
+#define S_INIT_LOCK(lock) S_UNLOCK(lock)
+#endif /* S_INIT_LOCK */
+
+#if !defined(SPIN_DELAY)
+#define SPIN_DELAY() ((void) 0)
+#endif /* SPIN_DELAY */
+
+#if !defined(TAS)
+extern int tas(volatile slock_t *lock); /* in port/.../tas.s, or
+ * s_lock.c */
+
+#define TAS(lock) tas(lock)
+#endif /* TAS */
+
+#if !defined(TAS_SPIN)
+#define TAS_SPIN(lock) TAS(lock)
+#endif /* TAS_SPIN */
+
+extern PGDLLIMPORT slock_t dummy_spinlock;
+
+/*
+ * Platform-independent out-of-line support routines
+ */
+extern int s_lock(volatile slock_t *lock, const char *file, int line, const char *func);
+
+/* Support for dynamic adjustment of spins_per_delay */
+#define DEFAULT_SPINS_PER_DELAY 100
+
+extern void set_spins_per_delay(int shared_spins_per_delay);
+extern int update_spins_per_delay(int shared_spins_per_delay);
+
+/*
+ * Support for spin delay, which is useful in various places where
+ * spinlock-like procedures take place.
+ */
+typedef struct
+{
+ int spins;
+ int delays;
+ int cur_delay;
+ const char *file;
+ int line;
+ const char *func;
+} SpinDelayStatus;
+
+static inline void
+init_spin_delay(SpinDelayStatus *status,
+ const char *file, int line, const char *func)
+{
+ status->spins = 0;
+ status->delays = 0;
+ status->cur_delay = 0;
+ status->file = file;
+ status->line = line;
+ status->func = func;
+}
+
+#define init_local_spin_delay(status) init_spin_delay(status, __FILE__, __LINE__, PG_FUNCNAME_MACRO)
+extern void perform_spin_delay(SpinDelayStatus *status);
+extern void finish_spin_delay(SpinDelayStatus *status);
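The delay-state helpers above are also meant for code that busy-waits on something spinlock-like without holding a real slock_t. A minimal sketch of that pattern follows; the flag variable and the function name are illustrative only.

/* Hypothetical: wait for another backend to set *flag, with polite backoff. */
static void
wait_for_flag(volatile int *flag)
{
	SpinDelayStatus delayStatus;

	init_local_spin_delay(&delayStatus);
	while (*flag == 0)
		perform_spin_delay(&delayStatus);	/* spins, then sleeps progressively */
	finish_spin_delay(&delayStatus);		/* feeds back into spins_per_delay */
}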
+
+#endif /* S_LOCK_H */
diff --git a/src/include/storage/sharedfileset.h b/src/include/storage/sharedfileset.h
new file mode 100644
index 0000000..b1cde36
--- /dev/null
+++ b/src/include/storage/sharedfileset.h
@@ -0,0 +1,37 @@
+/*-------------------------------------------------------------------------
+ *
+ * sharedfileset.h
+ * Shared temporary file management.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/sharedfileset.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef SHAREDFILESET_H
+#define SHAREDFILESET_H
+
+#include "storage/dsm.h"
+#include "storage/fd.h"
+#include "storage/fileset.h"
+#include "storage/spin.h"
+
+/*
+ * A set of temporary files that can be shared by multiple backends.
+ */
+typedef struct SharedFileSet
+{
+ FileSet fs;
+ slock_t mutex; /* mutex protecting the reference count */
+ int refcnt; /* number of attached backends */
+} SharedFileSet;
+
+extern void SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg);
+extern void SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg);
+extern void SharedFileSetDeleteAll(SharedFileSet *fileset);
+
+#endif
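A rough lifecycle sketch based only on the declarations above; 'fileset' is assumed to live inside the DSM segment 'seg' so that every attached backend sees the same reference count.

/* Leader, while populating the segment (fileset and seg are assumed): */
SharedFileSetInit(fileset, seg);	/* create the set; this backend is attached */

/* Each worker, after mapping the same segment: */
SharedFileSetAttach(fileset, seg);	/* bump refcnt under the spinlock */

/* Any attached backend may remove the files early if desired: */
SharedFileSetDeleteAll(fileset);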
diff --git a/src/include/storage/shm_mq.h b/src/include/storage/shm_mq.h
new file mode 100644
index 0000000..b6fe687
--- /dev/null
+++ b/src/include/storage/shm_mq.h
@@ -0,0 +1,86 @@
+/*-------------------------------------------------------------------------
+ *
+ * shm_mq.h
+ * single-reader, single-writer shared memory message queue
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/shm_mq.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SHM_MQ_H
+#define SHM_MQ_H
+
+#include "postmaster/bgworker.h"
+#include "storage/dsm.h"
+#include "storage/proc.h"
+
+/* The queue itself, in shared memory. */
+struct shm_mq;
+typedef struct shm_mq shm_mq;
+
+/* Backend-private state. */
+struct shm_mq_handle;
+typedef struct shm_mq_handle shm_mq_handle;
+
+/* Descriptors for a single write spanning multiple locations. */
+typedef struct
+{
+ const char *data;
+ Size len;
+} shm_mq_iovec;
+
+/* Possible results of a send or receive operation. */
+typedef enum
+{
+ SHM_MQ_SUCCESS, /* Sent or received a message. */
+ SHM_MQ_WOULD_BLOCK, /* Not completed; retry later. */
+ SHM_MQ_DETACHED /* Other process has detached queue. */
+} shm_mq_result;
+
+/*
+ * Primitives to create a queue and set the sender and receiver.
+ *
+ * Both the sender and the receiver must be set before any messages are read
+ * or written, but they need not be set by the same process. Each must be
+ * set exactly once.
+ */
+extern shm_mq *shm_mq_create(void *address, Size size);
+extern void shm_mq_set_receiver(shm_mq *mq, PGPROC *);
+extern void shm_mq_set_sender(shm_mq *mq, PGPROC *);
+
+/* Accessor methods for sender and receiver. */
+extern PGPROC *shm_mq_get_receiver(shm_mq *);
+extern PGPROC *shm_mq_get_sender(shm_mq *);
+
+/* Set up backend-local queue state. */
+extern shm_mq_handle *shm_mq_attach(shm_mq *mq, dsm_segment *seg,
+ BackgroundWorkerHandle *handle);
+
+/* Associate worker handle with shm_mq. */
+extern void shm_mq_set_handle(shm_mq_handle *, BackgroundWorkerHandle *);
+
+/* Break connection, release handle resources. */
+extern void shm_mq_detach(shm_mq_handle *mqh);
+
+/* Get the shm_mq from handle. */
+extern shm_mq *shm_mq_get_queue(shm_mq_handle *mqh);
+
+/* Send or receive messages. */
+extern shm_mq_result shm_mq_send(shm_mq_handle *mqh,
+ Size nbytes, const void *data, bool nowait,
+ bool force_flush);
+extern shm_mq_result shm_mq_sendv(shm_mq_handle *mqh, shm_mq_iovec *iov,
+ int iovcnt, bool nowait, bool force_flush);
+extern shm_mq_result shm_mq_receive(shm_mq_handle *mqh,
+ Size *nbytesp, void **datap, bool nowait);
+
+/* Wait for our counterparty to attach to the queue. */
+extern shm_mq_result shm_mq_wait_for_attach(shm_mq_handle *mqh);
+
+/* Smallest possible queue. */
+extern PGDLLIMPORT const Size shm_mq_minimum_size;
+
+#endif /* SHM_MQ_H */
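A condensed sketch of the usual setup, based only on the declarations above: the creating backend carves queue space out of a DSM segment, one side is designated receiver and the other sender, and both attach before exchanging messages. 'queue_space', 'queue_size', and 'seg' are placeholders.

/* Creator (say, the leader), which will read from the queue: */
shm_mq	   *mq = shm_mq_create(queue_space, queue_size);
shm_mq_handle *mqh;

shm_mq_set_receiver(mq, MyProc);
mqh = shm_mq_attach(mq, seg, NULL);		/* NULL: no worker handle yet */

/* Counterparty (say, a background worker that mapped the same segment): */
shm_mq_set_sender(mq, MyProc);
mqh = shm_mq_attach(mq, seg, NULL);
(void) shm_mq_send(mqh, 5, "hello", false /* block */ , true /* flush */ );

/* Back on the receiving side: */
{
	Size		nbytes;
	void	   *data;

	if (shm_mq_receive(mqh, &nbytes, &data, false) == SHM_MQ_SUCCESS)
	{
		/* 'data' is only valid until the next receive; copy it if needed */
	}
}
shm_mq_detach(mqh);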
diff --git a/src/include/storage/shm_toc.h b/src/include/storage/shm_toc.h
new file mode 100644
index 0000000..153a57c
--- /dev/null
+++ b/src/include/storage/shm_toc.h
@@ -0,0 +1,58 @@
+/*-------------------------------------------------------------------------
+ *
+ * shm_toc.h
+ * shared memory segment table of contents
+ *
+ * This is intended to provide a simple way to divide a chunk of shared
+ * memory (probably dynamic shared memory allocated via dsm_create) into
+ * a number of regions and keep track of the addresses of those regions or
+ * key data structures within those regions. This is not intended to
+ * scale to a large number of keys and will perform poorly if used that
+ * way; if you need a large number of pointers, store them within some
+ * other data structure within the segment and only put the pointer to
+ * the data structure itself in the table of contents.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/shm_toc.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SHM_TOC_H
+#define SHM_TOC_H
+
+#include "storage/shmem.h" /* for add_size() */
+
+/* shm_toc is an opaque type known only within shm_toc.c */
+typedef struct shm_toc shm_toc;
+
+extern shm_toc *shm_toc_create(uint64 magic, void *address, Size nbytes);
+extern shm_toc *shm_toc_attach(uint64 magic, void *address);
+extern void *shm_toc_allocate(shm_toc *toc, Size nbytes);
+extern Size shm_toc_freespace(shm_toc *toc);
+extern void shm_toc_insert(shm_toc *toc, uint64 key, void *address);
+extern void *shm_toc_lookup(shm_toc *toc, uint64 key, bool noError);
+
+/*
+ * Tools for estimating how large a chunk of shared memory will be needed
+ * to store a TOC and its dependent objects. Note: we don't really support
+ * large numbers of keys, but it's convenient to declare number_of_keys
+ * as a Size anyway.
+ */
+typedef struct
+{
+ Size space_for_chunks;
+ Size number_of_keys;
+} shm_toc_estimator;
+
+#define shm_toc_initialize_estimator(e) \
+ ((e)->space_for_chunks = 0, (e)->number_of_keys = 0)
+#define shm_toc_estimate_chunk(e, sz) \
+ ((e)->space_for_chunks = add_size((e)->space_for_chunks, BUFFERALIGN(sz)))
+#define shm_toc_estimate_keys(e, cnt) \
+ ((e)->number_of_keys = add_size((e)->number_of_keys, cnt))
+
+extern Size shm_toc_estimate(shm_toc_estimator *e);
+
+#endif /* SHM_TOC_H */
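A sketch of the estimate/create/attach cycle implied by the API above. 'MY_MAGIC', 'MySharedState', and 'addr' are placeholders; in practice the memory usually comes from dsm_create().

shm_toc_estimator e;
Size		segsize;
shm_toc    *toc;
void	   *state;

shm_toc_initialize_estimator(&e);
shm_toc_estimate_chunk(&e, sizeof(MySharedState));
shm_toc_estimate_keys(&e, 1);
segsize = shm_toc_estimate(&e);

/* creator: 'addr' points at segsize bytes of shared memory */
toc = shm_toc_create(MY_MAGIC, addr, segsize);
state = shm_toc_allocate(toc, sizeof(MySharedState));
shm_toc_insert(toc, 0, state);

/* any other backend that mapped the same memory at 'addr' */
toc = shm_toc_attach(MY_MAGIC, addr);
state = shm_toc_lookup(toc, 0, false);	/* false: error out if key 0 is missing */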
diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h
new file mode 100644
index 0000000..de9e7c6
--- /dev/null
+++ b/src/include/storage/shmem.h
@@ -0,0 +1,81 @@
+/*-------------------------------------------------------------------------
+ *
+ * shmem.h
+ * shared memory management structures
+ *
+ * Historical note:
+ * A long time ago, Postgres' shared memory region was allowed to be mapped
+ * at a different address in each process, and shared memory "pointers" were
+ * passed around as offsets relative to the start of the shared memory region.
+ * That is no longer the case: each process must map the shared memory region
+ * at the same address. This means shared memory pointers can be passed
+ * around directly between different processes.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/shmem.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SHMEM_H
+#define SHMEM_H
+
+#include "utils/hsearch.h"
+
+
+/* shmqueue.c */
+typedef struct SHM_QUEUE
+{
+ struct SHM_QUEUE *prev;
+ struct SHM_QUEUE *next;
+} SHM_QUEUE;
+
+/* shmem.c */
+extern void InitShmemAccess(void *seghdr);
+extern void InitShmemAllocation(void);
+extern void *ShmemAlloc(Size size);
+extern void *ShmemAllocNoError(Size size);
+extern void *ShmemAllocUnlocked(Size size);
+extern bool ShmemAddrIsValid(const void *addr);
+extern void InitShmemIndex(void);
+extern HTAB *ShmemInitHash(const char *name, long init_size, long max_size,
+ HASHCTL *infoP, int hash_flags);
+extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr);
+extern Size add_size(Size s1, Size s2);
+extern Size mul_size(Size s1, Size s2);
+
+/* ipci.c */
+extern void RequestAddinShmemSpace(Size size);
+
+/* size constants for the shmem index table */
+ /* max size of data structure string name */
+#define SHMEM_INDEX_KEYSIZE (48)
+ /* estimated size of the shmem index table (not a hard limit) */
+#define SHMEM_INDEX_SIZE (64)
+
+/* this is a hash bucket in the shmem index table */
+typedef struct
+{
+ char key[SHMEM_INDEX_KEYSIZE]; /* string name */
+ void *location; /* location in shared mem */
+ Size size; /* # bytes requested for the structure */
+ Size allocated_size; /* # bytes actually allocated */
+} ShmemIndexEnt;
+
+/*
+ * prototypes for functions in shmqueue.c
+ */
+extern void SHMQueueInit(SHM_QUEUE *queue);
+extern void SHMQueueElemInit(SHM_QUEUE *queue);
+extern void SHMQueueDelete(SHM_QUEUE *queue);
+extern void SHMQueueInsertBefore(SHM_QUEUE *queue, SHM_QUEUE *elem);
+extern void SHMQueueInsertAfter(SHM_QUEUE *queue, SHM_QUEUE *elem);
+extern Pointer SHMQueueNext(const SHM_QUEUE *queue, const SHM_QUEUE *curElem,
+ Size linkOffset);
+extern Pointer SHMQueuePrev(const SHM_QUEUE *queue, const SHM_QUEUE *curElem,
+ Size linkOffset);
+extern bool SHMQueueEmpty(const SHM_QUEUE *queue);
+extern bool SHMQueueIsDetached(const SHM_QUEUE *queue);
+
+#endif /* SHMEM_H */
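The core pattern for the shmem.c functions above is "create or attach": every backend calls ShmemInitStruct() with the same name, and only the first caller sees found == false and initializes the contents. add_size()/mul_size() are used for the sizing arithmetic because they error out on overflow. MySharedState, MyItem, and max_items are placeholders.

Size		size;
bool		found;
MySharedState *state;

size = add_size(MAXALIGN(sizeof(MySharedState)),
				mul_size(sizeof(MyItem), max_items));

state = (MySharedState *) ShmemInitStruct("My Shared State", size, &found);
if (!found)
{
	/* first backend to get here; set up the shared contents */
	memset(state, 0, size);
}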
diff --git a/src/include/storage/sinval.h b/src/include/storage/sinval.h
new file mode 100644
index 0000000..e7cd456
--- /dev/null
+++ b/src/include/storage/sinval.h
@@ -0,0 +1,153 @@
+/*-------------------------------------------------------------------------
+ *
+ * sinval.h
+ * POSTGRES shared cache invalidation communication definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/sinval.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SINVAL_H
+#define SINVAL_H
+
+#include <signal.h>
+
+#include "storage/relfilenode.h"
+
+/*
+ * We support several types of shared-invalidation messages:
+ * * invalidate a specific tuple in a specific catcache
+ * * invalidate all catcache entries from a given system catalog
+ * * invalidate a relcache entry for a specific logical relation
+ * * invalidate all relcache entries
+ * * invalidate an smgr cache entry for a specific physical relation
+ * * invalidate the mapped-relation mapping for a given database
+ * * invalidate any saved snapshot that might be used to scan a given relation
+ * More types could be added if needed. The message type is identified by
+ * the first "int8" field of the message struct. Zero or positive means a
+ * specific-catcache inval message (and also serves as the catcache ID field).
+ * Negative values identify the other message types, as per codes below.
+ *
+ * Catcache inval events are initially driven by detecting tuple inserts,
+ * updates and deletions in system catalogs (see CacheInvalidateHeapTuple).
+ * An update can generate two inval events, one for the old tuple and one for
+ * the new, but this is reduced to one event if the tuple's hash key doesn't
+ * change. Note that the inval events themselves don't actually say whether
+ * the tuple is being inserted or deleted. Also, since we transmit only a
+ * hash key, there is a small risk of unnecessary invalidations due to chance
+ * matches of hash keys.
+ *
+ * Note that some system catalogs have multiple caches on them (with different
+ * indexes). On detecting a tuple invalidation in such a catalog, separate
+ * catcache inval messages must be generated for each of its caches, since
+ * the hash keys will generally be different.
+ *
+ * Catcache, relcache, and snapshot invalidations are transactional, and so
+ * are sent to other backends upon commit. Internally to the generating
+ * backend, they are also processed at CommandCounterIncrement so that later
+ * commands in the same transaction see the new state. The generating backend
+ * also has to process them at abort, to flush out any cache state it's loaded
+ * from no-longer-valid entries.
+ *
+ * smgr and relation mapping invalidations are non-transactional: they are
+ * sent immediately when the underlying file change is made.
+ */
+
+typedef struct
+{
+ int8 id; /* cache ID --- must be first */
+ Oid dbId; /* database ID, or 0 if a shared relation */
+ uint32 hashValue; /* hash value of key for this catcache */
+} SharedInvalCatcacheMsg;
+
+#define SHAREDINVALCATALOG_ID (-1)
+
+typedef struct
+{
+ int8 id; /* type field --- must be first */
+ Oid dbId; /* database ID, or 0 if a shared catalog */
+ Oid catId; /* ID of catalog whose contents are invalid */
+} SharedInvalCatalogMsg;
+
+#define SHAREDINVALRELCACHE_ID (-2)
+
+typedef struct
+{
+ int8 id; /* type field --- must be first */
+ Oid dbId; /* database ID, or 0 if a shared relation */
+ Oid relId; /* relation ID, or 0 if whole relcache */
+} SharedInvalRelcacheMsg;
+
+#define SHAREDINVALSMGR_ID (-3)
+
+typedef struct
+{
+ /* note: field layout chosen to pack into 16 bytes */
+ int8 id; /* type field --- must be first */
+ int8 backend_hi; /* high bits of backend ID, if temprel */
+ uint16 backend_lo; /* low bits of backend ID, if temprel */
+ RelFileNode rnode; /* spcNode, dbNode, relNode */
+} SharedInvalSmgrMsg;
+
+#define SHAREDINVALRELMAP_ID (-4)
+
+typedef struct
+{
+ int8 id; /* type field --- must be first */
+ Oid dbId; /* database ID, or 0 for shared catalogs */
+} SharedInvalRelmapMsg;
+
+#define SHAREDINVALSNAPSHOT_ID (-5)
+
+typedef struct
+{
+ int8 id; /* type field --- must be first */
+ Oid dbId; /* database ID, or 0 if a shared relation */
+ Oid relId; /* relation ID */
+} SharedInvalSnapshotMsg;
+
+typedef union
+{
+ int8 id; /* type field --- must be first */
+ SharedInvalCatcacheMsg cc;
+ SharedInvalCatalogMsg cat;
+ SharedInvalRelcacheMsg rc;
+ SharedInvalSmgrMsg sm;
+ SharedInvalRelmapMsg rm;
+ SharedInvalSnapshotMsg sn;
+} SharedInvalidationMessage;
+
+
+/* Counter of messages processed; don't worry about overflow. */
+extern PGDLLIMPORT uint64 SharedInvalidMessageCounter;
+
+extern PGDLLIMPORT volatile sig_atomic_t catchupInterruptPending;
+
+extern void SendSharedInvalidMessages(const SharedInvalidationMessage *msgs,
+ int n);
+extern void ReceiveSharedInvalidMessages(void (*invalFunction) (SharedInvalidationMessage *msg),
+ void (*resetFunction) (void));
+
+/* signal handler for catchup events (PROCSIG_CATCHUP_INTERRUPT) */
+extern void HandleCatchupInterrupt(void);
+
+/*
+ * enable/disable processing of catchup events directly from signal handler.
+ * The enable routine first performs processing of any catchup events that
+ * have occurred since the last disable.
+ */
+extern void ProcessCatchupInterrupt(void);
+
+extern int xactGetCommittedInvalidationMessages(SharedInvalidationMessage **msgs,
+ bool *RelcacheInitFileInval);
+extern void ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs,
+ int nmsgs, bool RelcacheInitFileInval,
+ Oid dbid, Oid tsid);
+
+extern void LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg);
+
+#endif /* SINVAL_H */
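A sketch of how a consumer distinguishes the message types described above: the shared 'id' field selects the union member, with non-negative values meaning a catcache message. The callback names are placeholders; they would be handed to ReceiveSharedInvalidMessages().

static void
my_inval_callback(SharedInvalidationMessage *msg)
{
	if (msg->id >= 0)
	{
		/* specific-catcache message; msg->cc.hashValue picks the entries */
	}
	else if (msg->id == SHAREDINVALRELCACHE_ID)
	{
		/* relcache message; msg->rc.relId == 0 means "whole relcache" */
	}
	else if (msg->id == SHAREDINVALSMGR_ID)
	{
		/* smgr message; msg->sm.rnode identifies the physical relation */
	}
	/* ... SHAREDINVALCATALOG_ID, SHAREDINVALRELMAP_ID, SHAREDINVALSNAPSHOT_ID ... */
}

static void
my_reset_callback(void)
{
	/* we fell too far behind; treat everything we have cached as invalid */
}

/* at a safe point in the calling code: */
ReceiveSharedInvalidMessages(my_inval_callback, my_reset_callback);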
diff --git a/src/include/storage/sinvaladt.h b/src/include/storage/sinvaladt.h
new file mode 100644
index 0000000..91e2418
--- /dev/null
+++ b/src/include/storage/sinvaladt.h
@@ -0,0 +1,43 @@
+/*-------------------------------------------------------------------------
+ *
+ * sinvaladt.h
+ * POSTGRES shared cache invalidation data manager.
+ *
+ * The shared cache invalidation manager is responsible for transmitting
+ * invalidation messages between backends. Any message sent by any backend
+ * must be delivered to all already-running backends before it can be
+ * forgotten. (If we run out of space, we instead deliver a "RESET"
+ * message to backends that have fallen too far behind.)
+ *
+ * The struct type SharedInvalidationMessage, defining the contents of
+ * a single message, is defined in sinval.h.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/sinvaladt.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SINVALADT_H
+#define SINVALADT_H
+
+#include "storage/lock.h"
+#include "storage/sinval.h"
+
+/*
+ * prototypes for functions in sinvaladt.c
+ */
+extern Size SInvalShmemSize(void);
+extern void CreateSharedInvalidationState(void);
+extern void SharedInvalBackendInit(bool sendOnly);
+extern PGPROC *BackendIdGetProc(int backendID);
+extern void BackendIdGetTransactionIds(int backendID, TransactionId *xid, TransactionId *xmin);
+
+extern void SIInsertDataEntries(const SharedInvalidationMessage *data, int n);
+extern int SIGetDataEntries(SharedInvalidationMessage *data, int datasize);
+extern void SICleanupQueue(bool callerHasWriteLock, int minFree);
+
+extern LocalTransactionId GetNextLocalTransactionId(void);
+
+#endif /* SINVALADT_H */
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
new file mode 100644
index 0000000..6b63c60
--- /dev/null
+++ b/src/include/storage/smgr.h
@@ -0,0 +1,111 @@
+/*-------------------------------------------------------------------------
+ *
+ * smgr.h
+ * storage manager switch public interface declarations.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/smgr.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SMGR_H
+#define SMGR_H
+
+#include "lib/ilist.h"
+#include "storage/block.h"
+#include "storage/relfilenode.h"
+
+/*
+ * smgr.c maintains a table of SMgrRelation objects, which are essentially
+ * cached file handles. An SMgrRelation is created (if not already present)
+ * by smgropen(), and destroyed by smgrclose(). Note that neither of these
+ * operations implies I/O; they just create or destroy a hashtable entry.
+ * (But smgrclose() may release associated resources, such as OS-level file
+ * descriptors.)
+ *
+ * An SMgrRelation may have an "owner", which is just a pointer to it from
+ * somewhere else; smgr.c will clear this pointer if the SMgrRelation is
+ * closed. We use this to avoid dangling pointers from relcache to smgr
+ * without having to make the smgr explicitly aware of relcache. There
+ * can't be more than one "owner" pointer per SMgrRelation, but that's
+ * all we need.
+ *
+ * SMgrRelations that do not have an "owner" are considered to be transient,
+ * and are deleted at end of transaction.
+ */
+typedef struct SMgrRelationData
+{
+ /* rnode is the hashtable lookup key, so it must be first! */
+ RelFileNodeBackend smgr_rnode; /* relation physical identifier */
+
+ /* pointer to owning pointer, or NULL if none */
+ struct SMgrRelationData **smgr_owner;
+
+ /*
+ * The following fields are reset to InvalidBlockNumber upon a cache flush
+ * event, and hold the last known size for each fork. This information is
+ * currently only reliable during recovery, since there is no cache
+ * invalidation for fork extension.
+ */
+ BlockNumber smgr_targblock; /* current insertion target block */
+ BlockNumber smgr_cached_nblocks[MAX_FORKNUM + 1]; /* last known size */
+
+ /* additional public fields may someday exist here */
+
+ /*
+ * Fields below here are intended to be private to smgr.c and its
+ * submodules. Do not touch them from elsewhere.
+ */
+ int smgr_which; /* storage manager selector */
+
+ /*
+ * for md.c; per-fork arrays of the number of open segments
+ * (md_num_open_segs) and the segments themselves (md_seg_fds).
+ */
+ int md_num_open_segs[MAX_FORKNUM + 1];
+ struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1];
+
+ /* if unowned, list link in list of all unowned SMgrRelations */
+ dlist_node node;
+} SMgrRelationData;
+
+typedef SMgrRelationData *SMgrRelation;
+
+#define SmgrIsTemp(smgr) \
+ RelFileNodeBackendIsTemp((smgr)->smgr_rnode)
+
+extern void smgrinit(void);
+extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend);
+extern bool smgrexists(SMgrRelation reln, ForkNumber forknum);
+extern void smgrsetowner(SMgrRelation *owner, SMgrRelation reln);
+extern void smgrclearowner(SMgrRelation *owner, SMgrRelation reln);
+extern void smgrclose(SMgrRelation reln);
+extern void smgrcloseall(void);
+extern void smgrclosenode(RelFileNodeBackend rnode);
+extern void smgrrelease(SMgrRelation reln);
+extern void smgrreleaseall(void);
+extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
+extern void smgrdosyncall(SMgrRelation *rels, int nrels);
+extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
+extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, char *buffer, bool skipFsync);
+extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum);
+extern void smgrread(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, char *buffer);
+extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, BlockNumber nblocks);
+extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
+extern BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum);
+extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum,
+ int nforks, BlockNumber *nblocks);
+extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
+extern void AtEOXact_SMgr(void);
+extern bool ProcessBarrierSmgrRelease(void);
+
+#endif /* SMGR_H */
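A low-level sketch of the open/inspect/read/close cycle declared above; ordinary backend code goes through the buffer manager rather than calling smgr directly, so treat this only as an illustration of the interface. The function name and the caller-supplied BLCKSZ-sized buffer are assumptions.

static void
read_first_block(RelFileNode rnode, char *buffer)	/* buffer: BLCKSZ bytes */
{
	SMgrRelation reln = smgropen(rnode, InvalidBackendId);

	if (smgrexists(reln, MAIN_FORKNUM) &&
		smgrnblocks(reln, MAIN_FORKNUM) > 0)
		smgrread(reln, MAIN_FORKNUM, 0, buffer);

	smgrclose(reln);	/* drops the cache entry and file descriptors, not the data */
}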
diff --git a/src/include/storage/spin.h b/src/include/storage/spin.h
new file mode 100644
index 0000000..7031f1d
--- /dev/null
+++ b/src/include/storage/spin.h
@@ -0,0 +1,77 @@
+/*-------------------------------------------------------------------------
+ *
+ * spin.h
+ * Hardware-independent implementation of spinlocks.
+ *
+ *
+ * The hardware-independent interface to spinlocks is defined by the
+ * typedef "slock_t" and these macros:
+ *
+ * void SpinLockInit(volatile slock_t *lock)
+ * Initialize a spinlock (to the unlocked state).
+ *
+ * void SpinLockAcquire(volatile slock_t *lock)
+ * Acquire a spinlock, waiting if necessary.
+ * Time out and abort() if unable to acquire the lock in a
+ * "reasonable" amount of time --- typically ~ 1 minute.
+ *
+ * void SpinLockRelease(volatile slock_t *lock)
+ * Unlock a previously acquired lock.
+ *
+ * bool SpinLockFree(slock_t *lock)
+ * Tests if the lock is free. Returns true if free, false if locked.
+ * This does *not* change the state of the lock.
+ *
+ * Callers must beware that the macro argument may be evaluated multiple
+ * times!
+ *
+ * Load and store operations in calling code are guaranteed not to be
+ * reordered with respect to these operations, because they include a
+ * compiler barrier. (Before PostgreSQL 9.5, callers needed to use a
+ * volatile qualifier to access data protected by spinlocks.)
+ *
+ * Keep in mind the coding rule that spinlocks must not be held for more
+ * than a few instructions. In particular, we assume it is not possible
+ * for a CHECK_FOR_INTERRUPTS() to occur while holding a spinlock, and so
+ * it is not necessary to do HOLD/RESUME_INTERRUPTS() in these macros.
+ *
+ * These macros are implemented in terms of hardware-dependent macros
+ * supplied by s_lock.h. There is not currently any extra functionality
+ * added by this header, but there has been in the past and may someday
+ * be again.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/spin.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SPIN_H
+#define SPIN_H
+
+#include "storage/s_lock.h"
+#ifndef HAVE_SPINLOCKS
+#include "storage/pg_sema.h"
+#endif
+
+
+#define SpinLockInit(lock) S_INIT_LOCK(lock)
+
+#define SpinLockAcquire(lock) S_LOCK(lock)
+
+#define SpinLockRelease(lock) S_UNLOCK(lock)
+
+#define SpinLockFree(lock) S_LOCK_FREE(lock)
+
+
+extern int SpinlockSemas(void);
+extern Size SpinlockSemaSize(void);
+
+#ifndef HAVE_SPINLOCKS
+extern void SpinlockSemaInit(void);
+extern PGDLLIMPORT PGSemaphore *SpinlockSemaArray;
+#endif
+
+#endif /* SPIN_H */
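
As a minimal, hypothetical illustration of the interface documented in the header comment above (not code from this patch; the struct and function names are invented), a spinlock protecting a small shared counter would be used like this. Per the coding rule in the comment, only a few instructions run while the lock is held, and since 9.5 no volatile qualifier is needed on the protected data.

#include "postgres.h"
#include "storage/spin.h"

/* Hypothetical structure assumed to live in shared memory. */
typedef struct SharedCounter
{
    slock_t     mutex;          /* protects nhits */
    uint64      nhits;
} SharedCounter;

static void
counter_init(SharedCounter *c)
{
    SpinLockInit(&c->mutex);
    c->nhits = 0;
}

static uint64
counter_bump(SharedCounter *c)
{
    uint64      result;

    SpinLockAcquire(&c->mutex);
    result = ++c->nhits;        /* keep the critical section tiny */
    SpinLockRelease(&c->mutex);

    return result;
}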
diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h
new file mode 100644
index 0000000..6a77632
--- /dev/null
+++ b/src/include/storage/standby.h
@@ -0,0 +1,98 @@
+/*-------------------------------------------------------------------------
+ *
+ * standby.h
+ * Definitions for hot standby mode.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/standby.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef STANDBY_H
+#define STANDBY_H
+
+#include "datatype/timestamp.h"
+#include "storage/lock.h"
+#include "storage/procsignal.h"
+#include "storage/relfilenode.h"
+#include "storage/standbydefs.h"
+
+/* User-settable GUC parameters */
+extern PGDLLIMPORT int vacuum_defer_cleanup_age;
+extern PGDLLIMPORT int max_standby_archive_delay;
+extern PGDLLIMPORT int max_standby_streaming_delay;
+extern PGDLLIMPORT bool log_recovery_conflict_waits;
+
+extern void InitRecoveryTransactionEnvironment(void);
+extern void ShutdownRecoveryTransactionEnvironment(void);
+
+extern void ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid,
+ RelFileNode node);
+extern void ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid,
+ RelFileNode node);
+extern void ResolveRecoveryConflictWithTablespace(Oid tsid);
+extern void ResolveRecoveryConflictWithDatabase(Oid dbid);
+
+extern void ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict);
+extern void ResolveRecoveryConflictWithBufferPin(void);
+extern void CheckRecoveryConflictDeadlock(void);
+extern void StandbyDeadLockHandler(void);
+extern void StandbyTimeoutHandler(void);
+extern void StandbyLockTimeoutHandler(void);
+extern void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
+ TimestampTz cur_ts, VirtualTransactionId *wait_list,
+ bool still_waiting);
+
+/*
+ * Standby Rmgr (RM_STANDBY_ID)
+ *
+ * Standby recovery manager exists to perform actions that are required
+ * to make hot standby work. That includes logging AccessExclusiveLocks taken
+ * by transactions and running-xacts snapshots.
+ */
+extern void StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid);
+extern void StandbyReleaseLockTree(TransactionId xid,
+ int nsubxids, TransactionId *subxids);
+extern void StandbyReleaseAllLocks(void);
+extern void StandbyReleaseOldLocks(TransactionId oldxid);
+
+#define MinSizeOfXactRunningXacts offsetof(xl_running_xacts, xids)
+
+
+/*
+ * Declarations for GetRunningTransactionData(). Similar to Snapshots, but
+ * not quite. This has nothing at all to do with visibility on this server,
+ * so this is completely separate from snapmgr.c and snapmgr.h.
+ * This data is important for creating the initial snapshot state on a
+ * standby server. We need lots more information than a normal snapshot,
+ * hence we use a specific data structure for our needs. This data
+ * is written to WAL as a separate record immediately after each
+ * checkpoint. That means that wherever we start a standby from we will
+ * almost immediately see the data we need to begin executing queries.
+ */
+
+typedef struct RunningTransactionsData
+{
+ int xcnt; /* # of xact ids in xids[] */
+ int subxcnt; /* # of subxact ids in xids[] */
+ bool subxid_overflow; /* snapshot overflowed, subxids missing */
+ TransactionId nextXid; /* xid from ShmemVariableCache->nextXid */
+ TransactionId oldestRunningXid; /* *not* oldestXmin */
+ TransactionId latestCompletedXid; /* so we can set xmax */
+
+ TransactionId *xids; /* array of (sub)xids still running */
+} RunningTransactionsData;
+
+typedef RunningTransactionsData *RunningTransactions;
+
+extern void LogAccessExclusiveLock(Oid dbOid, Oid relOid);
+extern void LogAccessExclusiveLockPrepare(void);
+
+extern XLogRecPtr LogStandbySnapshot(void);
+extern void LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
+ bool relcacheInitFileInval);
+
+#endif /* STANDBY_H */
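
To make the RunningTransactionsData layout above concrete, here is a hypothetical sketch (not part of this patch; the helper is invented) of walking the combined xids[] array, which holds xcnt top-level xids followed by subxcnt subtransaction ids unless subxid_overflow is set.

#include "postgres.h"
#include "access/transam.h"
#include "storage/standby.h"

/*
 * Hypothetical helper: report the oldest xid present in a
 * RunningTransactionsData result (illustration only).
 */
static TransactionId
oldest_listed_xid(RunningTransactions running)
{
    TransactionId oldest = running->nextXid;    /* newer than any entry */
    int         total = running->xcnt + running->subxcnt;
    int         i;

    for (i = 0; i < total; i++)
    {
        if (TransactionIdPrecedes(running->xids[i], oldest))
            oldest = running->xids[i];
    }

    /* Note: oldestRunningXid is maintained separately and is not oldestXmin. */
    return oldest;
}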
diff --git a/src/include/storage/standbydefs.h b/src/include/storage/standbydefs.h
new file mode 100644
index 0000000..c0234b6
--- /dev/null
+++ b/src/include/storage/standbydefs.h
@@ -0,0 +1,74 @@
+/*-------------------------------------------------------------------------
+ *
+ * standbydefs.h
+ * Frontend exposed definitions for hot standby mode.
+ *
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/standbydefs.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef STANDBYDEFS_H
+#define STANDBYDEFS_H
+
+#include "access/xlogreader.h"
+#include "lib/stringinfo.h"
+#include "storage/lockdefs.h"
+#include "storage/sinval.h"
+
+/* Recovery handlers for the Standby Rmgr (RM_STANDBY_ID) */
+extern void standby_redo(XLogReaderState *record);
+extern void standby_desc(StringInfo buf, XLogReaderState *record);
+extern const char *standby_identify(uint8 info);
+extern void standby_desc_invalidations(StringInfo buf,
+ int nmsgs, SharedInvalidationMessage *msgs,
+ Oid dbId, Oid tsId,
+ bool relcacheInitFileInval);
+
+/*
+ * XLOG message types
+ */
+#define XLOG_STANDBY_LOCK 0x00
+#define XLOG_RUNNING_XACTS 0x10
+#define XLOG_INVALIDATIONS 0x20
+
+typedef struct xl_standby_locks
+{
+ int nlocks; /* number of entries in locks array */
+ xl_standby_lock locks[FLEXIBLE_ARRAY_MEMBER];
+} xl_standby_locks;
+
+/*
+ * When we write running xact data to WAL, we use this structure.
+ */
+typedef struct xl_running_xacts
+{
+ int xcnt; /* # of xact ids in xids[] */
+ int subxcnt; /* # of subxact ids in xids[] */
+ bool subxid_overflow; /* snapshot overflowed, subxids missing */
+ TransactionId nextXid; /* xid from ShmemVariableCache->nextXid */
+ TransactionId oldestRunningXid; /* *not* oldestXmin */
+ TransactionId latestCompletedXid; /* so we can set xmax */
+
+ TransactionId xids[FLEXIBLE_ARRAY_MEMBER];
+} xl_running_xacts;
+
+/*
+ * Invalidations for standby, currently only when transactions without an
+ * assigned xid commit.
+ */
+typedef struct xl_invalidations
+{
+ Oid dbId; /* MyDatabaseId */
+ Oid tsId; /* MyDatabaseTableSpace */
+ bool relcacheInitFileInval; /* invalidate relcache init files */
+ int nmsgs; /* number of shared inval msgs */
+ SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER];
+} xl_invalidations;
+
+#define MinSizeOfInvalidations offsetof(xl_invalidations, msgs)
+
+#endif /* STANDBYDEFS_H */
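
Because xl_running_xacts ends with a flexible array member, the size of a record carrying the data above follows from the counts. The hypothetical sketch below (not from this patch) shows that arithmetic, mirroring MinSizeOfXactRunningXacts defined in standby.h.

#include "postgres.h"
#include "storage/standbydefs.h"

/*
 * Hypothetical helper: bytes needed for an xl_running_xacts record
 * carrying xcnt top-level xids plus subxcnt subtransaction ids
 * (illustration only).
 */
static Size
running_xacts_record_size(int xcnt, int subxcnt)
{
    return offsetof(xl_running_xacts, xids) +
        (xcnt + subxcnt) * sizeof(TransactionId);
}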
diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h
new file mode 100644
index 0000000..9737e1e
--- /dev/null
+++ b/src/include/storage/sync.h
@@ -0,0 +1,66 @@
+/*-------------------------------------------------------------------------
+ *
+ * sync.h
+ * File synchronization management code.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/sync.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SYNC_H
+#define SYNC_H
+
+#include "storage/relfilenode.h"
+
+/*
+ * Type of sync request. These are used to manage the set of pending
+ * requests to call a sync handler's sync or unlink functions at the next
+ * checkpoint.
+ */
+typedef enum SyncRequestType
+{
+ SYNC_REQUEST, /* schedule a call of sync function */
+ SYNC_UNLINK_REQUEST, /* schedule a call of unlink function */
+ SYNC_FORGET_REQUEST, /* forget all calls for a tag */
+ SYNC_FILTER_REQUEST /* forget all calls satisfying match fn */
+} SyncRequestType;
+
+/*
+ * Which set of functions to use to handle a given request. The values of
+ * the enumerators must match the indexes of the function table in sync.c.
+ */
+typedef enum SyncRequestHandler
+{
+ SYNC_HANDLER_MD = 0,
+ SYNC_HANDLER_CLOG,
+ SYNC_HANDLER_COMMIT_TS,
+ SYNC_HANDLER_MULTIXACT_OFFSET,
+ SYNC_HANDLER_MULTIXACT_MEMBER,
+ SYNC_HANDLER_NONE
+} SyncRequestHandler;
+
+/*
+ * A tag identifying a file. Currently it has the members required for md.c's
+ * usage, but sync.c has no knowledge of the internal structure, and it is
+ * liable to change as required by future handlers.
+ */
+typedef struct FileTag
+{
+ int16 handler; /* SyncRequestHandler value, saving space */
+ int16 forknum; /* ForkNumber, saving space */
+ RelFileNode rnode;
+ uint32 segno;
+} FileTag;
+
+extern void InitSync(void);
+extern void SyncPreCheckpoint(void);
+extern void SyncPostCheckpoint(void);
+extern void ProcessSyncRequests(void);
+extern void RememberSyncRequest(const FileTag *ftag, SyncRequestType type);
+extern bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
+ bool retryOnError);
+
+#endif /* SYNC_H */
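
To illustrate how a sync handler queues work for the checkpointer, here is a hypothetical sketch (not part of this patch; the helper name is invented) that fills a FileTag by hand and registers a SYNC_REQUEST for it. The fields are exactly those declared above; md.c fills the same fields internally for its own requests.

#include "postgres.h"
#include "storage/sync.h"

/*
 * Hypothetical helper: ask the checkpointer to fsync one segment of a
 * relation fork at the next checkpoint (illustration only).
 */
static void
request_segment_sync(RelFileNode rnode, ForkNumber forknum, uint32 segno)
{
    FileTag     tag;

    /* Zero first so padding bytes compare equal when tags are hashed. */
    memset(&tag, 0, sizeof(tag));
    tag.handler = (int16) SYNC_HANDLER_MD;
    tag.forknum = (int16) forknum;
    tag.rnode = rnode;
    tag.segno = segno;

    /*
     * retryOnError = true: keep retrying until the request is accepted
     * rather than returning false when the queue is full.
     */
    (void) RegisterSyncRequest(&tag, SYNC_REQUEST, true);
}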