Diffstat (limited to 'src/backend/storage/ipc/shm_toc.c')
-rw-r--r--  src/backend/storage/ipc/shm_toc.c  272
1 file changed, 272 insertions, 0 deletions
diff --git a/src/backend/storage/ipc/shm_toc.c b/src/backend/storage/ipc/shm_toc.c
new file mode 100644
index 0000000..0cd8244
--- /dev/null
+++ b/src/backend/storage/ipc/shm_toc.c
@@ -0,0 +1,272 @@
+/*-------------------------------------------------------------------------
+ *
+ * shm_toc.c
+ * shared memory segment table of contents
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/storage/ipc/shm_toc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "port/atomics.h"
+#include "storage/shm_toc.h"
+#include "storage/spin.h"
+
+typedef struct shm_toc_entry
+{
+ uint64 key; /* Arbitrary identifier */
+ Size offset; /* Offset, in bytes, from TOC start */
+} shm_toc_entry;
+
+struct shm_toc
+{
+ uint64 toc_magic; /* Magic number identifying this TOC */
+ slock_t toc_mutex; /* Spinlock for mutual exclusion */
+ Size toc_total_bytes; /* Bytes managed by this TOC */
+ Size toc_allocated_bytes; /* Bytes allocated of those managed */
+ uint32 toc_nentry; /* Number of entries in TOC */
+ shm_toc_entry toc_entry[FLEXIBLE_ARRAY_MEMBER];
+};
+
+/*
+ * Initialize a region of shared memory with a table of contents.
+ */
+shm_toc *
+shm_toc_create(uint64 magic, void *address, Size nbytes)
+{
+ shm_toc *toc = (shm_toc *) address;
+
+ Assert(nbytes > offsetof(shm_toc, toc_entry));
+ toc->toc_magic = magic;
+ SpinLockInit(&toc->toc_mutex);
+
+ /*
+ * The alignment code in shm_toc_allocate() assumes that the starting
+ * value is buffer-aligned.
+ */
+ toc->toc_total_bytes = BUFFERALIGN_DOWN(nbytes);
+ toc->toc_allocated_bytes = 0;
+ toc->toc_nentry = 0;
+
+ return toc;
+}
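+
+/*
+ * Usage sketch (illustrative only; MY_MAGIC, seg_base, and seg_size stand
+ * in for values the creating process would supply):
+ *
+ *		shm_toc    *toc = shm_toc_create(MY_MAGIC, seg_base, seg_size);
+ */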
+
+/*
+ * Attach to an existing table of contents. If the magic number found at
+ * the target address doesn't match our expectations, return NULL.
+ */
+shm_toc *
+shm_toc_attach(uint64 magic, void *address)
+{
+ shm_toc *toc = (shm_toc *) address;
+
+ if (toc->toc_magic != magic)
+ return NULL;
+
+ Assert(toc->toc_total_bytes >= toc->toc_allocated_bytes);
+ Assert(toc->toc_total_bytes > offsetof(shm_toc, toc_entry));
+
+ return toc;
+}
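+
+/*
+ * Usage sketch (illustrative; a cooperating process that has mapped the
+ * same segment, possibly at a different address, attaches with the
+ * agreed-upon magic number):
+ *
+ *		shm_toc    *toc = shm_toc_attach(MY_MAGIC, seg_base);
+ *
+ *		if (toc == NULL)
+ *			elog(ERROR, "unexpected magic number in shared memory segment");
+ */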
+
+/*
+ * Allocate shared memory from a segment managed by a table of contents.
+ *
+ * This is not a full-blown allocator; there's no way to free memory. It's
+ * just a way of dividing a single physical shared memory segment into logical
+ * chunks that may be used for different purposes.
+ *
+ * We allocate backwards from the end of the segment, so that the TOC entries
+ * can grow forward from the start of the segment.
+ */
+void *
+shm_toc_allocate(shm_toc *toc, Size nbytes)
+{
+ volatile shm_toc *vtoc = toc;
+ Size total_bytes;
+ Size allocated_bytes;
+ Size nentry;
+ Size toc_bytes;
+
+ /*
+ * Make sure request is well-aligned. XXX: MAXALIGN is not enough,
+ * because atomic ops might need a wider alignment. We don't have a
+ * proper definition for the minimum to make atomic ops safe, but
+ * BUFFERALIGN ought to be enough.
+ */
+ nbytes = BUFFERALIGN(nbytes);
+
+ SpinLockAcquire(&toc->toc_mutex);
+
+ total_bytes = vtoc->toc_total_bytes;
+ allocated_bytes = vtoc->toc_allocated_bytes;
+ nentry = vtoc->toc_nentry;
+ toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
+ + allocated_bytes;
+
+ /* Check for memory exhaustion and overflow. */
+ if (toc_bytes + nbytes > total_bytes || toc_bytes + nbytes < toc_bytes)
+ {
+ SpinLockRelease(&toc->toc_mutex);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory")));
+ }
+ vtoc->toc_allocated_bytes += nbytes;
+
+ SpinLockRelease(&toc->toc_mutex);
+
+ return ((char *) toc) + (total_bytes - allocated_bytes - nbytes);
+}
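+
+/*
+ * Usage sketch (illustrative; my_state_t is a hypothetical caller-defined
+ * struct).  Chunks are carved from the end of the segment, so successive
+ * allocations grow toward the TOC entries at the front:
+ *
+ *		my_state_t *state = shm_toc_allocate(toc, sizeof(my_state_t));
+ */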
+
+/*
+ * Return the number of bytes that can still be allocated.
+ */
+Size
+shm_toc_freespace(shm_toc *toc)
+{
+ volatile shm_toc *vtoc = toc;
+ Size total_bytes;
+ Size allocated_bytes;
+ Size nentry;
+ Size toc_bytes;
+
+ SpinLockAcquire(&toc->toc_mutex);
+ total_bytes = vtoc->toc_total_bytes;
+ allocated_bytes = vtoc->toc_allocated_bytes;
+ nentry = vtoc->toc_nentry;
+ SpinLockRelease(&toc->toc_mutex);
+
+ toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry);
+ Assert(allocated_bytes + BUFFERALIGN(toc_bytes) <= total_bytes);
+ return total_bytes - (allocated_bytes + BUFFERALIGN(toc_bytes));
+}
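+
+/*
+ * Usage sketch (illustrative): a caller can check the remaining space
+ * before attempting a large allocation:
+ *
+ *		if (shm_toc_freespace(toc) < request_size)
+ *			elog(ERROR, "shared memory segment is too small");
+ */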
+
+/*
+ * Insert a TOC entry.
+ *
+ * The idea here is that the process setting up the shared memory segment will
+ * register the addresses of data structures within the segment using this
+ * function. Each data structure will be identified using a 64-bit key, which
+ * is assumed to be a well-known or discoverable integer. Other processes
+ * accessing the shared memory segment can pass the same key to
+ * shm_toc_lookup() to discover the addresses of those data structures.
+ *
+ * Since the shared memory segment may be mapped at different addresses within
+ * different backends, we store relative rather than absolute pointers.
+ *
+ * This won't scale well to a large number of keys. Hopefully, that isn't
+ * necessary; if it proves to be, we might need to provide a more sophisticated
+ * data structure here.  But the real idea is just to give someone mapping a
+ * dynamic shared memory segment the ability to find the bare minimum number
+ * of pointers that they need to bootstrap.  If you're storing a lot of stuff
+ * in the TOC, you're doing it wrong.
+ */
+void
+shm_toc_insert(shm_toc *toc, uint64 key, void *address)
+{
+ volatile shm_toc *vtoc = toc;
+ Size total_bytes;
+ Size allocated_bytes;
+ Size nentry;
+ Size toc_bytes;
+ Size offset;
+
+ /* Relativize pointer. */
+ Assert(address > (void *) toc);
+ offset = ((char *) address) - (char *) toc;
+
+ SpinLockAcquire(&toc->toc_mutex);
+
+ total_bytes = vtoc->toc_total_bytes;
+ allocated_bytes = vtoc->toc_allocated_bytes;
+ nentry = vtoc->toc_nentry;
+ toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
+ + allocated_bytes;
+
+ /* Check for memory exhaustion and overflow. */
+ if (toc_bytes + sizeof(shm_toc_entry) > total_bytes ||
+ toc_bytes + sizeof(shm_toc_entry) < toc_bytes ||
+ nentry >= PG_UINT32_MAX)
+ {
+ SpinLockRelease(&toc->toc_mutex);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory")));
+ }
+
+ Assert(offset < total_bytes);
+ vtoc->toc_entry[nentry].key = key;
+ vtoc->toc_entry[nentry].offset = offset;
+
+ /*
+ * By placing a write barrier after filling in the entry and before
+ * updating the number of entries, we make it safe to read the TOC
+ * unlocked.
+ */
+ pg_write_barrier();
+
+ vtoc->toc_nentry++;
+
+ SpinLockRelease(&toc->toc_mutex);
+}
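+
+/*
+ * Usage sketch (illustrative; KEY_STATE is a hypothetical well-known key
+ * that attaching processes also know).  The typical pattern pairs an
+ * allocation with an insertion:
+ *
+ *		my_state_t *state = shm_toc_allocate(toc, sizeof(my_state_t));
+ *
+ *		... initialize *state ...
+ *		shm_toc_insert(toc, KEY_STATE, state);
+ */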
+
+/*
+ * Look up a TOC entry.
+ *
+ * If the key is not found, returns NULL if noError is true, otherwise
+ * throws elog(ERROR).
+ *
+ * Unlike the other functions in this file, this operation acquires no lock;
+ * it uses only barriers. It probably wouldn't hurt concurrency very much even
+ * if it did get a lock, but since it's reasonably likely that a group of
+ * worker processes could each read a series of entries from the same TOC
+ * right around the same time, there seems to be some value in avoiding it.
+ */
+void *
+shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
+{
+ uint32 nentry;
+ uint32 i;
+
+ /*
+ * Read the number of entries before we examine any entry. We assume that
+ * reading a uint32 is atomic.
+ */
+ nentry = toc->toc_nentry;
+ pg_read_barrier();
+
+ /* Now search for a matching entry. */
+ for (i = 0; i < nentry; ++i)
+ {
+ if (toc->toc_entry[i].key == key)
+ return ((char *) toc) + toc->toc_entry[i].offset;
+ }
+
+ /* No matching entry was found. */
+ if (!noError)
+ elog(ERROR, "could not find key " UINT64_FORMAT " in shm TOC at %p",
+ key, toc);
+ return NULL;
+}
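+
+/*
+ * Usage sketch (illustrative; KEY_STATE as in the shm_toc_insert() sketch
+ * above).  Passing noError = false treats a missing key as a can't-happen
+ * condition and raises ERROR:
+ *
+ *		my_state_t *state = shm_toc_lookup(toc, KEY_STATE, false);
+ */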
+
+/*
+ * Estimate how much shared memory will be required to store a TOC and its
+ * dependent data structures.
+ */
+Size
+shm_toc_estimate(shm_toc_estimator *e)
+{
+ Size sz;
+
+ sz = offsetof(shm_toc, toc_entry);
+ sz = add_size(sz, mul_size(e->number_of_keys, sizeof(shm_toc_entry)));
+ sz = add_size(sz, e->space_for_chunks);
+
+ return BUFFERALIGN(sz);
+}
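+
+/*
+ * Usage sketch (illustrative, using the estimator macros declared in
+ * shm_toc.h; my_state_t and the key count are hypothetical):
+ *
+ *		shm_toc_estimator e;
+ *		Size		segsize;
+ *
+ *		shm_toc_initialize_estimator(&e);
+ *		shm_toc_estimate_chunk(&e, sizeof(my_state_t));
+ *		shm_toc_estimate_keys(&e, 1);
+ *		segsize = shm_toc_estimate(&e);
+ */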