/*
 * Repository web-view header (navigation residue and line-number gutter
 * removed).
 *
 * path: root/src/backend/storage/ipc/shm_toc.c
 * blob: 0c9ef64428be4af3dce174071d8cc4cb88a1c1d6 (plain)
 */
/*-------------------------------------------------------------------------
 *
 * shm_toc.c
 *	  shared memory segment table of contents
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/backend/storage/ipc/shm_toc.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "port/atomics.h"
#include "storage/shm_toc.h"
#include "storage/spin.h"

/*
 * One table-of-contents entry: associates a caller-chosen key with the
 * location of a registered object.  The location is kept as a byte offset
 * from the start of the TOC rather than as a pointer; shm_toc_lookup() adds
 * it back to the local TOC address.
 */
typedef struct shm_toc_entry
{
	uint64		key;			/* Arbitrary identifier chosen by the caller */
	Size		offset;			/* Offset, in bytes, from TOC start */
} shm_toc_entry;

/*
 * Header placed at the start of a managed region.
 *
 * TOC entries grow forward from toc_entry[], while chunks handed out by
 * shm_toc_allocate() are carved backwards from offset toc_total_bytes.
 * toc_mutex is taken by shm_toc_allocate(), shm_toc_freespace() and
 * shm_toc_insert(); shm_toc_lookup() reads toc_nentry and the entries
 * without taking it.
 */
struct shm_toc
{
	uint64		toc_magic;		/* Magic number identifying this TOC */
	slock_t		toc_mutex;		/* Spinlock for mutual exclusion */
	Size		toc_total_bytes;	/* Bytes managed by this TOC (set from
									 * BUFFERALIGN_DOWN of the region size) */
	Size		toc_allocated_bytes;	/* Bytes allocated of those managed */
	uint32		toc_nentry;		/* Number of entries in TOC; incremented
								 * only after the new entry is filled in */
	shm_toc_entry toc_entry[FLEXIBLE_ARRAY_MEMBER];
};

/*
 * Set up a table of contents at the start of a shared memory region.
 *
 * 'magic' is recorded so that shm_toc_attach() can recognize the TOC later.
 * 'address' is the start of the region and 'nbytes' is its size.  The new
 * TOC starts out with no entries and nothing allocated.  Returns 'address'
 * viewed as a shm_toc.
 */
shm_toc *
shm_toc_create(uint64 magic, void *address, Size nbytes)
{
	shm_toc    *header;

	/* The region has to be larger than the fixed part of the header. */
	Assert(nbytes > offsetof(shm_toc, toc_entry));

	header = (shm_toc *) address;
	header->toc_magic = magic;
	SpinLockInit(&header->toc_mutex);

	/*
	 * Round the managed size down to a buffer-aligned value; the alignment
	 * logic in shm_toc_allocate() relies on starting from one.
	 */
	header->toc_total_bytes = BUFFERALIGN_DOWN(nbytes);

	/* Nothing has been allocated or registered yet. */
	header->toc_allocated_bytes = 0;
	header->toc_nentry = 0;

	return header;
}

/*
 * Attach to a table of contents that already exists at 'address'.
 *
 * Returns the TOC if the magic number stored there equals 'magic';
 * otherwise returns NULL.
 */
shm_toc *
shm_toc_attach(uint64 magic, void *address)
{
	shm_toc    *candidate = (shm_toc *) address;

	if (candidate->toc_magic == magic)
	{
		/* Sanity-check the bookkeeping before handing the TOC out. */
		Assert(candidate->toc_total_bytes >= candidate->toc_allocated_bytes);
		Assert(candidate->toc_total_bytes > offsetof(shm_toc, toc_entry));

		return candidate;
	}

	/* Magic number mismatch: not the TOC the caller expected. */
	return NULL;
}

/*
 * Allocate shared memory from a segment managed by a table of contents.
 *
 * This is not a full-blown allocator; there's no way to free memory.  It's
 * just a way of dividing a single physical shared memory segment into logical
 * chunks that may be used for different purposes.
 *
 * We allocate backwards from the end of the segment, so that the TOC entries
 * can grow forward from the start of the segment.
 *
 * 'nbytes' is rounded up with BUFFERALIGN before being charged against the
 * segment.  Returns a pointer to the start of the new chunk.  If the rounded
 * request does not fit in the space that remains, or the rounding itself
 * overflows, reports ERROR "out of shared memory"; the spinlock is never held
 * while the error is raised.
 */
void *
shm_toc_allocate(shm_toc *toc, Size nbytes)
{
	volatile shm_toc *vtoc = toc;
	Size		request;
	Size		total_bytes;
	Size		allocated_bytes;
	Size		nentry;
	Size		toc_bytes;

	/*
	 * Make sure request is well-aligned.  XXX: MAXALIGN is not enough,
	 * because atomic ops might need a wider alignment.  We don't have a
	 * proper definition for the minimum to make atomic ops safe, but
	 * BUFFERALIGN ought to be enough.
	 */
	request = BUFFERALIGN(nbytes);

	/*
	 * Rounding up is done in unsigned arithmetic, so a request within one
	 * alignment unit of the largest Size wraps around to a tiny value
	 * (possibly zero).  Such a request would sail through the space check
	 * below and the caller would receive a chunk far smaller than it asked
	 * for, so reject it here, before taking the lock.
	 */
	if (request < nbytes)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of shared memory")));

	SpinLockAcquire(&toc->toc_mutex);

	total_bytes = vtoc->toc_total_bytes;
	allocated_bytes = vtoc->toc_allocated_bytes;
	nentry = vtoc->toc_nentry;

	/* Space already spoken for: header, TOC entries, and earlier chunks. */
	toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
		+ allocated_bytes;

	/* Check for memory exhaustion and overflow. */
	if (toc_bytes + request > total_bytes || toc_bytes + request < toc_bytes)
	{
		/* Release the spinlock before raising the error. */
		SpinLockRelease(&toc->toc_mutex);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of shared memory")));
	}
	vtoc->toc_allocated_bytes += request;

	SpinLockRelease(&toc->toc_mutex);

	/* The new chunk sits immediately below the previously allocated ones. */
	return ((char *) toc) + (total_bytes - allocated_bytes - request);
}

/*
 * Report how many bytes are still available for allocation.
 *
 * The result is the managed size minus the bytes already allocated and minus
 * the buffer-aligned size of the header plus the current TOC entries.
 */
Size
shm_toc_freespace(shm_toc *toc)
{
	volatile shm_toc *vtoc = toc;
	Size		capacity;
	Size		chunk_bytes;
	Size		entry_count;
	Size		reserved;

	/* Snapshot the counters while holding the spinlock. */
	SpinLockAcquire(&toc->toc_mutex);
	capacity = vtoc->toc_total_bytes;
	chunk_bytes = vtoc->toc_allocated_bytes;
	entry_count = vtoc->toc_nentry;
	SpinLockRelease(&toc->toc_mutex);

	/* Header and entries, rounded up, plus everything handed out so far. */
	reserved = offsetof(shm_toc, toc_entry) +
		entry_count * sizeof(shm_toc_entry);
	reserved = chunk_bytes + BUFFERALIGN(reserved);

	Assert(reserved <= capacity);

	return capacity - reserved;
}

/*
 * Insert a TOC entry.
 *
 * The idea here is that the process setting up the shared memory segment will
 * register the addresses of data structures within the segment using this
 * function.  Each data structure will be identified using a 64-bit key, which
 * is assumed to be a well-known or discoverable integer.  Other processes
 * accessing the shared memory segment can pass the same key to
 * shm_toc_lookup() to discover the addresses of those data structures.
 *
 * Since the shared memory segment may be mapped at different addresses within
 * different backends, we store relative rather than absolute pointers.
 *
 * This won't scale well to a large number of keys.  Hopefully, that isn't
 * necessary; if it proves to be, we might need to provide a more sophisticated
 * data structure here.  But the real idea here is just to give someone mapping
 * a dynamic shared memory the ability to find the bare minimum number of
 * pointers that they need to bootstrap.  If you're storing a lot of stuff in
 * the TOC, you're doing it wrong.
 *
 * 'address' must lie above the start of the TOC (this is asserted).  If there
 * is no room left for another entry, reports ERROR "out of shared memory"
 * after releasing the spinlock.  Nothing here checks whether 'key' is already
 * present.
 */
void
shm_toc_insert(shm_toc *toc, uint64 key, void *address)
{
	volatile shm_toc *vtoc = toc;
	Size		total_bytes;
	Size		allocated_bytes;
	Size		nentry;
	Size		toc_bytes;
	Size		offset;

	/* Relativize pointer. */
	Assert(address > (void *) toc);
	offset = ((char *) address) - (char *) toc;

	SpinLockAcquire(&toc->toc_mutex);

	total_bytes = vtoc->toc_total_bytes;
	allocated_bytes = vtoc->toc_allocated_bytes;
	nentry = vtoc->toc_nentry;

	/* Space already in use: header, existing entries, allocated chunks. */
	toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
		+ allocated_bytes;

	/*
	 * Check for memory exhaustion and overflow.  The last test keeps the
	 * entry count representable in the uint32 toc_nentry field.
	 */
	if (toc_bytes + sizeof(shm_toc_entry) > total_bytes ||
		toc_bytes + sizeof(shm_toc_entry) < toc_bytes ||
		nentry >= PG_UINT32_MAX)
	{
		/* Release the spinlock before raising the error. */
		SpinLockRelease(&toc->toc_mutex);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of shared memory")));
	}

	/* Fill in the next unused slot; it is not yet visible to readers. */
	Assert(offset < total_bytes);
	vtoc->toc_entry[nentry].key = key;
	vtoc->toc_entry[nentry].offset = offset;

	/*
	 * By placing a write barrier after filling in the entry and before
	 * updating the number of entries, we make it safe to read the TOC
	 * unlocked.
	 */
	pg_write_barrier();

	/* Publish the entry by bumping the count. */
	vtoc->toc_nentry++;

	SpinLockRelease(&toc->toc_mutex);
}

/*
 * Find the address registered under 'key'.
 *
 * Returns the TOC's own address plus the stored offset of the first entry
 * whose key matches.  When no entry matches, returns NULL if noError is
 * true and otherwise raises elog(ERROR).
 *
 * Unlike the other functions in this file, no lock is taken here; only a
 * read barrier is used.  Taking a lock probably would not hurt concurrency
 * much, but a group of worker processes may well read a series of entries
 * from the same TOC at about the same time, so there seems to be some value
 * in avoiding it.
 */
void *
shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
{
	uint32		visible;
	uint32		idx = 0;

	/*
	 * Fetch the entry count first and only then look at entries.  We assume
	 * that reading a uint32 is atomic.
	 */
	visible = toc->toc_nentry;
	pg_read_barrier();

	/* Scan the entries that were counted above, in insertion order. */
	while (idx < visible)
	{
		if (toc->toc_entry[idx].key == key)
			return ((char *) toc) + toc->toc_entry[idx].offset;
		++idx;
	}

	/* Nothing matched. */
	if (!noError)
		elog(ERROR, "could not find key " UINT64_FORMAT " in shm TOC at %p",
			 key, toc);

	return NULL;
}

/*
 * Estimate the amount of shared memory needed for a TOC together with the
 * data it will describe.
 *
 * The estimate is the fixed header size, plus one entry per key recorded in
 * the estimator, plus the chunk space recorded in the estimator, rounded up
 * with BUFFERALIGN.  The sums and the product go through add_size() and
 * mul_size().
 */
Size
shm_toc_estimate(shm_toc_estimator *e)
{
	Size		entry_bytes;
	Size		estimate;

	/* Room for the entry array. */
	entry_bytes = mul_size(e->number_of_keys, sizeof(shm_toc_entry));

	/* Header plus entries, then the chunks themselves. */
	estimate = add_size(offsetof(shm_toc, toc_entry), entry_bytes);
	estimate = add_size(estimate, e->space_for_chunks);

	return BUFFERALIGN(estimate);
}