summaryrefslogtreecommitdiffstats
path: root/src/backend/utils/mmgr
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:15:05 +0000
commit46651ce6fe013220ed397add242004d764fc0153 (patch)
tree6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/utils/mmgr
parentInitial commit. (diff)
downloadpostgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz
postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/utils/mmgr')
-rw-r--r--src/backend/utils/mmgr/Makefile25
-rw-r--r--src/backend/utils/mmgr/README487
-rw-r--r--src/backend/utils/mmgr/aset.c1520
-rw-r--r--src/backend/utils/mmgr/dsa.c2287
-rw-r--r--src/backend/utils/mmgr/freepage.c1886
-rw-r--r--src/backend/utils/mmgr/generation.c832
-rw-r--r--src/backend/utils/mmgr/mcxt.c1341
-rw-r--r--src/backend/utils/mmgr/memdebug.c93
-rw-r--r--src/backend/utils/mmgr/portalmem.c1340
-rw-r--r--src/backend/utils/mmgr/slab.c794
10 files changed, 10605 insertions, 0 deletions
diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile
new file mode 100644
index 0000000..3b4cfdb
--- /dev/null
+++ b/src/backend/utils/mmgr/Makefile
@@ -0,0 +1,25 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for utils/mmgr
+#
+# IDENTIFICATION
+# src/backend/utils/mmgr/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/utils/mmgr
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ aset.o \
+ dsa.o \
+ freepage.o \
+ generation.o \
+ mcxt.o \
+ memdebug.o \
+ portalmem.o \
+ slab.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mmgr/README b/src/backend/utils/mmgr/README
new file mode 100644
index 0000000..221b4bd
--- /dev/null
+++ b/src/backend/utils/mmgr/README
@@ -0,0 +1,487 @@
+src/backend/utils/mmgr/README
+
+Memory Context System Design Overview
+=====================================
+
+Background
+----------
+
+We do most of our memory allocation in "memory contexts", which are usually
+AllocSets as implemented by src/backend/utils/mmgr/aset.c. The key to
+successful memory management without lots of overhead is to define a useful
+set of contexts with appropriate lifespans.
+
+The basic operations on a memory context are:
+
+* create a context
+
+* allocate a chunk of memory within a context (equivalent of standard
+ C library's malloc())
+
+* delete a context (including freeing all the memory allocated therein)
+
+* reset a context (free all memory allocated in the context, but not the
+ context object itself)
+
+* inquire about the total amount of memory allocated to the context
+ (the raw memory from which the context allocates chunks; not the
+ chunks themselves)
+
+Given a chunk of memory previously allocated from a context, one can
+free it or reallocate it larger or smaller (corresponding to standard C
+library's free() and realloc() routines). These operations return memory
+to or get more memory from the same context the chunk was originally
+allocated in.
+
+At all times there is a "current" context denoted by the
+CurrentMemoryContext global variable. palloc() implicitly allocates space
+in that context. The MemoryContextSwitchTo() operation selects a new current
+context (and returns the previous context, so that the caller can restore the
+previous context before exiting).
+
+The main advantage of memory contexts over plain use of malloc/free is
+that the entire contents of a memory context can be freed easily, without
+having to request freeing of each individual chunk within it. This is
+both faster and more reliable than per-chunk bookkeeping. We use this
+fact to clean up at transaction end: by resetting all the active contexts
+of transaction or shorter lifespan, we can reclaim all transient memory.
+Similarly, we can clean up at the end of each query, or after each tuple
+is processed during a query.
+
+
+Some Notes About the palloc API Versus Standard C Library
+---------------------------------------------------------
+
+The behavior of palloc and friends is similar to the standard C library's
+malloc and friends, but there are some deliberate differences too. Here
+are some notes to clarify the behavior.
+
+* If out of memory, palloc and repalloc exit via elog(ERROR). They
+never return NULL, and it is not necessary or useful to test for such
+a result. With palloc_extended() that behavior can be overridden
+using the MCXT_ALLOC_NO_OOM flag.
+
+* palloc(0) is explicitly a valid operation. It does not return a NULL
+pointer, but a valid chunk of which no bytes may be used. However, the
+chunk might later be repalloc'd larger; it can also be pfree'd without
+error. Similarly, repalloc allows realloc'ing to zero size.
+
+* pfree and repalloc do not accept a NULL pointer. This is intentional.
+
+
+The Current Memory Context
+--------------------------
+
+Because it would be too much notational overhead to always pass an
+appropriate memory context to called routines, there always exists the
+notion of the current memory context CurrentMemoryContext. Without it,
+for example, the copyObject routines would need to be passed a context, as
+would function execution routines that return a pass-by-reference
+datatype. Similarly for routines that temporarily allocate space
+internally, but don't return it to their caller? We certainly don't
+want to clutter every call in the system with "here is a context to
+use for any temporary memory allocation you might want to do".
+
+The upshot of that reasoning, though, is that CurrentMemoryContext should
+generally point at a short-lifespan context if at all possible. During
+query execution it usually points to a context that gets reset after each
+tuple. Only in *very* circumscribed code should it ever point at a
+context having greater than transaction lifespan, since doing so risks
+permanent memory leaks.
+
+
+pfree/repalloc Do Not Depend On CurrentMemoryContext
+----------------------------------------------------
+
+pfree() and repalloc() can be applied to any chunk whether it belongs
+to CurrentMemoryContext or not --- the chunk's owning context will be
+invoked to handle the operation, regardless.
+
+
+"Parent" and "Child" Contexts
+-----------------------------
+
+If all contexts were independent, it'd be hard to keep track of them,
+especially in error cases. That is solved by creating a tree of
+"parent" and "child" contexts. When creating a memory context, the
+new context can be specified to be a child of some existing context.
+A context can have many children, but only one parent. In this way
+the contexts form a forest (not necessarily a single tree, since there
+could be more than one top-level context; although in current practice
+there is only one top context, TopMemoryContext).
+
+Deleting a context deletes all its direct and indirect children as
+well. When resetting a context it's almost always more useful to
+delete child contexts, thus MemoryContextReset() means that, and if
+you really do want a tree of empty contexts you need to call
+MemoryContextResetOnly() plus MemoryContextResetChildren().
+
+These features allow us to manage a lot of contexts without fear that
+some will be leaked; we only need to keep track of one top-level
+context that we are going to delete at transaction end, and make sure
+that any shorter-lived contexts we create are descendants of that
+context. Since the tree can have multiple levels, we can deal easily
+with nested lifetimes of storage, such as per-transaction,
+per-statement, per-scan, per-tuple. Storage lifetimes that only
+partially overlap can be handled by allocating from different trees of
+the context forest (there are some examples in the next section).
+
+For convenience we also provide operations like "reset/delete all children
+of a given context, but don't reset or delete that context itself".
+
+
+Memory Context Reset/Delete Callbacks
+-------------------------------------
+
+A feature introduced in Postgres 9.5 allows memory contexts to be used
+for managing more resources than just plain palloc'd memory. This is
+done by registering a "reset callback function" for a memory context.
+Such a function will be called, once, just before the context is next
+reset or deleted. It can be used to give up resources that are in some
+sense associated with an object allocated within the context. Possible
+use-cases include
+* closing open files associated with a tuplesort object;
+* releasing reference counts on long-lived cache objects that are held
+ by some object within the context being reset;
+* freeing malloc-managed memory associated with some palloc'd object.
+That last case would just represent bad programming practice for pure
+Postgres code; better to have made all the allocations using palloc,
+in the target context or some child context. However, it could well
+come in handy for code that interfaces to non-Postgres libraries.
+
+Any number of reset callbacks can be established for a memory context;
+they are called in reverse order of registration. Also, callbacks
+attached to child contexts are called before callbacks attached to
+parent contexts, if a tree of contexts is being reset or deleted.
+
+The API for this requires the caller to provide a MemoryContextCallback
+memory chunk to hold the state for a callback. Typically this should be
+allocated in the same context it is logically attached to, so that it
+will be released automatically after use. The reason for asking the
+caller to provide this memory is that in most usage scenarios, the caller
+will be creating some larger struct within the target context, and the
+MemoryContextCallback struct can be made "for free" without a separate
+palloc() call by including it in this larger struct.
+
+
+Memory Contexts in Practice
+===========================
+
+Globally Known Contexts
+-----------------------
+
+There are a few widely-known contexts that are typically referenced
+through global variables. At any instant the system may contain many
+additional contexts, but all other contexts should be direct or indirect
+children of one of these contexts to ensure they are not leaked in event
+of an error.
+
+TopMemoryContext --- this is the actual top level of the context tree;
+every other context is a direct or indirect child of this one. Allocating
+here is essentially the same as "malloc", because this context will never
+be reset or deleted. This is for stuff that should live forever, or for
+stuff that the controlling module will take care of deleting at the
+appropriate time. An example is fd.c's tables of open files. Avoid
+allocating stuff here unless really necessary, and especially avoid
+running with CurrentMemoryContext pointing here.
+
+PostmasterContext --- this is the postmaster's normal working context.
+After a backend is spawned, it can delete PostmasterContext to free its
+copy of memory the postmaster was using that it doesn't need.
+Note that in non-EXEC_BACKEND builds, the postmaster's copy of pg_hba.conf
+and pg_ident.conf data is used directly during authentication in backend
+processes; so backends can't delete PostmasterContext until that's done.
+(The postmaster has only TopMemoryContext, PostmasterContext, and
+ErrorContext --- the remaining top-level contexts are set up in each
+backend during startup.)
+
+CacheMemoryContext --- permanent storage for relcache, catcache, and
+related modules. This will never be reset or deleted, either, so it's
+not truly necessary to distinguish it from TopMemoryContext. But it
+seems worthwhile to maintain the distinction for debugging purposes.
+(Note: CacheMemoryContext has child contexts with shorter lifespans.
+For example, a child context is the best place to keep the subsidiary
+storage associated with a relcache entry; that way we can free rule
+parsetrees and so forth easily, without having to depend on constructing
+a reliable version of freeObject().)
+
+MessageContext --- this context holds the current command message from the
+frontend, as well as any derived storage that need only live as long as
+the current message (for example, in simple-Query mode the parse and plan
+trees can live here). This context will be reset, and any children
+deleted, at the top of each cycle of the outer loop of PostgresMain. This
+is kept separate from per-transaction and per-portal contexts because a
+query string might need to live either a longer or shorter time than any
+single transaction or portal.
+
+TopTransactionContext --- this holds everything that lives until end of the
+top-level transaction. This context will be reset, and all its children
+deleted, at conclusion of each top-level transaction cycle. In most cases
+you don't want to allocate stuff directly here, but in CurTransactionContext;
+what does belong here is control information that exists explicitly to manage
+status across multiple subtransactions. Note: this context is NOT cleared
+immediately upon error; its contents will survive until the transaction block
+is exited by COMMIT/ROLLBACK.
+
+CurTransactionContext --- this holds data that has to survive until the end
+of the current transaction, and in particular will be needed at top-level
+transaction commit. When we are in a top-level transaction this is the same
+as TopTransactionContext, but in subtransactions it points to a child context.
+It is important to understand that if a subtransaction aborts, its
+CurTransactionContext is thrown away after finishing the abort processing;
+but a committed subtransaction's CurTransactionContext is kept until top-level
+commit (unless of course one of the intermediate levels of subtransaction
+aborts). This ensures that we do not keep data from a failed subtransaction
+longer than necessary. Because of this behavior, you must be careful to clean
+up properly during subtransaction abort --- the subtransaction's state must be
+delinked from any pointers or lists kept in upper transactions, or you will
+have dangling pointers leading to a crash at top-level commit. An example of
+data kept here is pending NOTIFY messages, which are sent at top-level commit,
+but only if the generating subtransaction did not abort.
+
+PortalContext --- this is not actually a separate context, but a
+global variable pointing to the per-portal context of the currently active
+execution portal. This can be used if it's necessary to allocate storage
+that will live just as long as the execution of the current portal requires.
+
+ErrorContext --- this permanent context is switched into for error
+recovery processing, and then reset on completion of recovery. We arrange
+to have a few KB of memory available in it at all times. In this way, we
+can ensure that some memory is available for error recovery even if the
+backend has run out of memory otherwise. This allows out-of-memory to be
+treated as a normal ERROR condition, not a FATAL error.
+
+
+Contexts For Prepared Statements And Portals
+--------------------------------------------
+
+A prepared-statement object has an associated private context, in which
+the parse and plan trees for its query are stored. Because these trees
+are read-only to the executor, the prepared statement can be re-used many
+times without further copying of these trees.
+
+An execution-portal object has a private context that is referenced by
+PortalContext when the portal is active. In the case of a portal created
+by DECLARE CURSOR, this private context contains the query parse and plan
+trees (there being no other object that can hold them). Portals created
+from prepared statements simply reference the prepared statements' trees,
+and don't actually need any storage allocated in their private contexts.
+
+
+Logical Replication Worker Contexts
+-----------------------------------
+
+ApplyContext --- permanent during whole lifetime of apply worker. It
+is possible to use TopMemoryContext here as well, but for simplicity
+of memory usage analysis we spin up different context.
+
+ApplyMessageContext --- short-lived context that is reset after each
+logical replication protocol message is processed.
+
+
+Transient Contexts During Execution
+-----------------------------------
+
+When creating a prepared statement, the parse and plan trees will be built
+in a temporary context that's a child of MessageContext (so that it will
+go away automatically upon error). On success, the finished plan is
+copied to the prepared statement's private context, and the temp context
+is released; this allows planner temporary space to be recovered before
+execution begins. (In simple-Query mode we don't bother with the extra
+copy step, so the planner temp space stays around till end of query.)
+
+The top-level executor routines, as well as most of the "plan node"
+execution code, will normally run in a context that is created by
+ExecutorStart and destroyed by ExecutorEnd; this context also holds the
+"plan state" tree built during ExecutorStart. Most of the memory
+allocated in these routines is intended to live until end of query,
+so this is appropriate for those purposes. The executor's top context
+is a child of PortalContext, that is, the per-portal context of the
+portal that represents the query's execution.
+
+The main memory-management consideration in the executor is that
+expression evaluation --- both for qual testing and for computation of
+targetlist entries --- needs to not leak memory. To do this, each
+ExprContext (expression-eval context) created in the executor has a
+private memory context associated with it, and we switch into that context
+when evaluating expressions in that ExprContext. The plan node that owns
+the ExprContext is responsible for resetting the private context to empty
+when it no longer needs the results of expression evaluations. Typically
+the reset is done at the start of each tuple-fetch cycle in the plan node.
+
+Note that this design gives each plan node its own expression-eval memory
+context. This appears necessary to handle nested joins properly, since
+an outer plan node might need to retain expression results it has computed
+while obtaining the next tuple from an inner node --- but the inner node
+might execute many tuple cycles and many expressions before returning a
+tuple. The inner node must be able to reset its own expression context
+more often than once per outer tuple cycle. Fortunately, memory contexts
+are cheap enough that giving one to each plan node doesn't seem like a
+problem.
+
+A problem with running index accesses and sorts in a query-lifespan context
+is that these operations invoke datatype-specific comparison functions,
+and if the comparators leak any memory then that memory won't be recovered
+till end of query. The comparator functions all return bool or int32,
+so there's no problem with their result data, but there can be a problem
+with leakage of internal temporary data. In particular, comparator
+functions that operate on TOAST-able data types need to be careful
+not to leak detoasted versions of their inputs. This is annoying, but
+it appeared a lot easier to make the comparators conform than to fix the
+index and sort routines, so that's what was done for 7.1. This remains
+the state of affairs in btree and hash indexes, so btree and hash support
+functions still need to not leak memory. Most of the other index AMs
+have been modified to run opclass support functions in short-lived
+contexts, so that leakage is not a problem; this is necessary in view
+of the fact that their support functions tend to be far more complex.
+
+There are some special cases, such as aggregate functions. nodeAgg.c
+needs to remember the results of evaluation of aggregate transition
+functions from one tuple cycle to the next, so it can't just discard
+all per-tuple state in each cycle. The easiest way to handle this seems
+to be to have two per-tuple contexts in an aggregate node, and to
+ping-pong between them, so that at each tuple one is the active allocation
+context and the other holds any results allocated by the prior cycle's
+transition function.
+
+Executor routines that switch the active CurrentMemoryContext may need
+to copy data into their caller's current memory context before returning.
+However, we have minimized the need for that, because of the convention
+of resetting the per-tuple context at the *start* of an execution cycle
+rather than at its end. With that rule, an execution node can return a
+tuple that is palloc'd in its per-tuple context, and the tuple will remain
+good until the node is called for another tuple or told to end execution.
+This parallels the situation with pass-by-reference values at the table
+scan level, since a scan node can return a direct pointer to a tuple in a
+disk buffer that is only guaranteed to remain good that long.
+
+A more common reason for copying data is to transfer a result from
+per-tuple context to per-query context; for example, a Unique node will
+save the last distinct tuple value in its per-query context, requiring a
+copy step.
+
+
+Mechanisms to Allow Multiple Types of Contexts
+----------------------------------------------
+
+To efficiently allow for different allocation patterns, and for
+experimentation, we allow for different types of memory contexts with
+different allocation policies but similar external behavior. To
+handle this, memory allocation functions are accessed via function
+pointers, and we require all context types to obey the conventions
+given here.
+
+A memory context is represented by struct MemoryContextData (see
+memnodes.h). This struct identifies the exact type of the context, and
+contains information common between the different types of
+MemoryContext like the parent and child contexts, and the name of the
+context.
+
+This is essentially an abstract superclass, and the behavior is
+determined by the "methods" pointer is its virtual function table
+(struct MemoryContextMethods). Specific memory context types will use
+derived structs having these fields as their first fields. All the
+contexts of a specific type will have methods pointers that point to
+the same static table of function pointers.
+
+While operations like allocating from and resetting a context take the
+relevant MemoryContext as a parameter, operations like free and
+realloc are trickier. To make those work, we require all memory
+context types to produce allocated chunks that are immediately,
+without any padding, preceded by a pointer to the corresponding
+MemoryContext.
+
+If a type of allocator needs additional information about its chunks,
+like e.g. the size of the allocation, that information can in turn
+precede the MemoryContext. This means the only overhead implied by
+the memory context mechanism is a pointer to its context, so we're not
+constraining context-type designers very much.
+
+Given this, routines like pfree determine their corresponding context
+with an operation like (although that is usually encapsulated in
+GetMemoryChunkContext())
+
+ MemoryContext context = *(MemoryContext*) (((char *) pointer) - sizeof(void *));
+
+and then invoke the corresponding method for the context
+
+ context->methods->free_p(pointer);
+
+
+More Control Over aset.c Behavior
+---------------------------------
+
+By default aset.c always allocates an 8K block upon the first
+allocation in a context, and doubles that size for each successive
+block request. That's good behavior for a context that might hold
+*lots* of data. But if there are dozens if not hundreds of smaller
+contexts in the system, we need to be able to fine-tune things a
+little better.
+
+The creator of a context is able to specify an initial block size and
+a maximum block size. Selecting smaller values can prevent wastage of
+space in contexts that aren't expected to hold very much (an example
+is the relcache's per-relation contexts).
+
+Also, it is possible to specify a minimum context size, in case for some
+reason that should be different from the initial size for additional
+blocks. An aset.c context will always contain at least one block,
+of size minContextSize if that is specified, otherwise initBlockSize.
+
+We expect that per-tuple contexts will be reset frequently and typically
+will not allocate very much space per tuple cycle. To make this usage
+pattern cheap, the first block allocated in a context is not given
+back to malloc() during reset, but just cleared. This avoids malloc
+thrashing.
+
+
+Alternative Memory Context Implementations
+------------------------------------------
+
+aset.c is our default general-purpose implementation, working fine
+in most situations. We also have two implementations optimized for
+special use cases, providing either better performance or lower memory
+usage compared to aset.c (or both).
+
+* slab.c (SlabContext) is designed for allocations of fixed-length
+ chunks, and does not allow allocations of chunks with different size.
+
+* generation.c (GenerationContext) is designed for cases when chunks
+ are allocated in groups with similar lifespan (generations), or
+ roughly in FIFO order.
+
+Both memory contexts aim to free memory back to the operating system
+(unlike aset.c, which keeps the freed chunks in a freelist, and only
+returns the memory when reset/deleted).
+
+These memory contexts were initially developed for ReorderBuffer, but
+may be useful elsewhere as long as the allocation patterns match.
+
+
+Memory Accounting
+-----------------
+
+One of the basic memory context operations is determining the amount of
+memory used in the context (and its children). We have multiple places
+that implement their own ad hoc memory accounting, and this is meant to
+provide a unified approach. Ad hoc accounting solutions work for places
+with tight control over the allocations or when it's easy to determine
+sizes of allocated chunks (e.g. places that only work with tuples).
+
+The accounting built into the memory contexts is transparent and works
+transparently for all allocations as long as they end up in the right
+memory context subtree.
+
+Consider for example aggregate functions - the aggregate state is often
+represented by an arbitrary structure, allocated from the transition
+function, so the ad hoc accounting is unlikely to work. The built-in
+accounting will however handle such cases just fine.
+
+To minimize overhead, the accounting is done at the block level, not for
+individual allocation chunks.
+
+The accounting is lazy - after a block is allocated (or freed), only the
+context owning that block is updated. This means that when inquiring
+about the memory usage in a given context, we have to walk all children
+contexts recursively. This means the memory accounting is not intended
+for cases with too many memory contexts (in the relevant subtree).
diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c
new file mode 100644
index 0000000..77872e7
--- /dev/null
+++ b/src/backend/utils/mmgr/aset.c
@@ -0,0 +1,1520 @@
+/*-------------------------------------------------------------------------
+ *
+ * aset.c
+ * Allocation set definitions.
+ *
+ * AllocSet is our standard implementation of the abstract MemoryContext
+ * type.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/utils/mmgr/aset.c
+ *
+ * NOTE:
+ * This is a new (Feb. 05, 1999) implementation of the allocation set
+ * routines. AllocSet...() does not use OrderedSet...() any more.
+ * Instead it manages allocations in a block pool by itself, combining
+ * many small allocations in a few bigger blocks. AllocSetFree() normally
+ * doesn't free() memory really. It just add's the free'd area to some
+ * list for later reuse by AllocSetAlloc(). All memory blocks are free()'d
+ * at once on AllocSetReset(), which happens when the memory context gets
+ * destroyed.
+ * Jan Wieck
+ *
+ * Performance improvement from Tom Lane, 8/99: for extremely large request
+ * sizes, we do want to be able to give the memory back to free() as soon
+ * as it is pfree()'d. Otherwise we risk tying up a lot of memory in
+ * freelist entries that might never be usable. This is specially needed
+ * when the caller is repeatedly repalloc()'ing a block bigger and bigger;
+ * the previous instances of the block were guaranteed to be wasted until
+ * AllocSetReset() under the old way.
+ *
+ * Further improvement 12/00: as the code stood, request sizes in the
+ * midrange between "small" and "large" were handled very inefficiently,
+ * because any sufficiently large free chunk would be used to satisfy a
+ * request, even if it was much larger than necessary. This led to more
+ * and more wasted space in allocated chunks over time. To fix, get rid
+ * of the midrange behavior: we now handle only "small" power-of-2-size
+ * chunks as chunks. Anything "large" is passed off to malloc(). Change
+ * the number of freelists to change the small/large boundary.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "port/pg_bitutils.h"
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+
+/*--------------------
+ * Chunk freelist k holds chunks of size 1 << (k + ALLOC_MINBITS),
+ * for k = 0 .. ALLOCSET_NUM_FREELISTS-1.
+ *
+ * Note that all chunks in the freelists have power-of-2 sizes. This
+ * improves recyclability: we may waste some space, but the wasted space
+ * should stay pretty constant as requests are made and released.
+ *
+ * A request too large for the last freelist is handled by allocating a
+ * dedicated block from malloc(). The block still has a block header and
+ * chunk header, but when the chunk is freed we'll return the whole block
+ * to malloc(), not put it on our freelists.
+ *
+ * CAUTION: ALLOC_MINBITS must be large enough so that
+ * 1<<ALLOC_MINBITS is at least MAXALIGN,
+ * or we may fail to align the smallest chunks adequately.
+ * 8-byte alignment is enough on all currently known machines.
+ *
+ * With the current parameters, request sizes up to 8K are treated as chunks,
+ * larger requests go into dedicated blocks. Change ALLOCSET_NUM_FREELISTS
+ * to adjust the boundary point; and adjust ALLOCSET_SEPARATE_THRESHOLD in
+ * memutils.h to agree. (Note: in contexts with small maxBlockSize, we may
+ * set the allocChunkLimit to less than 8K, so as to avoid space wastage.)
+ *--------------------
+ */
+
+#define ALLOC_MINBITS 3 /* smallest chunk size is 8 bytes */
+#define ALLOCSET_NUM_FREELISTS 11
+#define ALLOC_CHUNK_LIMIT (1 << (ALLOCSET_NUM_FREELISTS-1+ALLOC_MINBITS))
+/* Size of largest chunk that we use a fixed size for */
+#define ALLOC_CHUNK_FRACTION 4
+/* We allow chunks to be at most 1/4 of maxBlockSize (less overhead) */
+
+/*--------------------
+ * The first block allocated for an allocset has size initBlockSize.
+ * Each time we have to allocate another block, we double the block size
+ * (if possible, and without exceeding maxBlockSize), so as to reduce
+ * the bookkeeping load on malloc().
+ *
+ * Blocks allocated to hold oversize chunks do not follow this rule, however;
+ * they are just however big they need to be to hold that single chunk.
+ *
+ * Also, if a minContextSize is specified, the first block has that size,
+ * and then initBlockSize is used for the next one.
+ *--------------------
+ */
+
+#define ALLOC_BLOCKHDRSZ MAXALIGN(sizeof(AllocBlockData))
+#define ALLOC_CHUNKHDRSZ sizeof(struct AllocChunkData)
+
+typedef struct AllocBlockData *AllocBlock; /* forward reference */
+typedef struct AllocChunkData *AllocChunk;
+
+/*
+ * AllocPointer
+ * Aligned pointer which may be a member of an allocation set.
+ */
+typedef void *AllocPointer;
+
+/*
+ * AllocSetContext is our standard implementation of MemoryContext.
+ *
+ * Note: header.isReset means there is nothing for AllocSetReset to do.
+ * This is different from the aset being physically empty (empty blocks list)
+ * because we will still have a keeper block. It's also different from the set
+ * being logically empty, because we don't attempt to detect pfree'ing the
+ * last active chunk.
+ */
+typedef struct AllocSetContext
+{
+ MemoryContextData header; /* Standard memory-context fields */
+ /* Info about storage allocated in this context: */
+ AllocBlock blocks; /* head of list of blocks in this set */
+ AllocChunk freelist[ALLOCSET_NUM_FREELISTS]; /* free chunk lists */
+ /* Allocation parameters for this context: */
+ Size initBlockSize; /* initial block size */
+ Size maxBlockSize; /* maximum block size */
+ Size nextBlockSize; /* next block size to allocate */
+ Size allocChunkLimit; /* effective chunk size limit */
+ AllocBlock keeper; /* keep this block over resets */
+ /* freelist this context could be put in, or -1 if not a candidate: */
+ int freeListIndex; /* index in context_freelists[], or -1 */
+} AllocSetContext;
+
+typedef AllocSetContext *AllocSet;
+
+/*
+ * AllocBlock
+ * An AllocBlock is the unit of memory that is obtained by aset.c
+ * from malloc(). It contains one or more AllocChunks, which are
+ * the units requested by palloc() and freed by pfree(). AllocChunks
+ * cannot be returned to malloc() individually, instead they are put
+ * on freelists by pfree() and re-used by the next palloc() that has
+ * a matching request size.
+ *
+ * AllocBlockData is the header data for a block --- the usable space
+ * within the block begins at the next alignment boundary.
+ */
+typedef struct AllocBlockData
+{
+ AllocSet aset; /* aset that owns this block */
+ AllocBlock prev; /* prev block in aset's blocks list, if any */
+ AllocBlock next; /* next block in aset's blocks list, if any */
+ char *freeptr; /* start of free space in this block */
+ char *endptr; /* end of space in this block */
+} AllocBlockData;
+
+/*
+ * AllocChunk
+ * The prefix of each piece of memory in an AllocBlock
+ *
+ * Note: to meet the memory context APIs, the payload area of the chunk must
+ * be maxaligned, and the "aset" link must be immediately adjacent to the
+ * payload area (cf. GetMemoryChunkContext). We simplify matters for this
+ * module by requiring sizeof(AllocChunkData) to be maxaligned, and then
+ * we can ensure things work by adding any required alignment padding before
+ * the "aset" field. There is a static assertion below that the alignment
+ * is done correctly.
+ */
+typedef struct AllocChunkData
+{
+ /* size is always the size of the usable space in the chunk */
+ Size size;
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* when debugging memory usage, also store actual requested size */
+ /* this is zero in a free chunk */
+ Size requested_size;
+
+#define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T * 2 + SIZEOF_VOID_P)
+#else
+#define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T + SIZEOF_VOID_P)
+#endif /* MEMORY_CONTEXT_CHECKING */
+
+ /* ensure proper alignment by adding padding if needed */
+#if (ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF) != 0
+ char padding[MAXIMUM_ALIGNOF - ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF];
+#endif
+
+ /* aset is the owning aset if allocated, or the freelist link if free */
+ void *aset;
+ /* there must not be any padding to reach a MAXALIGN boundary here! */
+} AllocChunkData;
+
+/*
+ * Only the "aset" field should be accessed outside this module.
+ * We keep the rest of an allocated chunk's header marked NOACCESS when using
+ * valgrind. But note that chunk headers that are in a freelist are kept
+ * accessible, for simplicity.
+ */
+#define ALLOCCHUNK_PRIVATE_LEN offsetof(AllocChunkData, aset)
+
+/*
+ * AllocPointerIsValid
+ * True iff pointer is valid allocation pointer.
+ */
+#define AllocPointerIsValid(pointer) PointerIsValid(pointer)
+
+/*
+ * AllocSetIsValid
+ * True iff set is valid allocation set.
+ */
+#define AllocSetIsValid(set) PointerIsValid(set)
+
+#define AllocPointerGetChunk(ptr) \
+ ((AllocChunk)(((char *)(ptr)) - ALLOC_CHUNKHDRSZ))
+#define AllocChunkGetPointer(chk) \
+ ((AllocPointer)(((char *)(chk)) + ALLOC_CHUNKHDRSZ))
+
+/*
+ * Rather than repeatedly creating and deleting memory contexts, we keep some
+ * freed contexts in freelists so that we can hand them out again with little
+ * work. Before putting a context in a freelist, we reset it so that it has
+ * only its initial malloc chunk and no others. To be a candidate for a
+ * freelist, a context must have the same minContextSize/initBlockSize as
+ * other contexts in the list; but its maxBlockSize is irrelevant since that
+ * doesn't affect the size of the initial chunk.
+ *
+ * We currently provide one freelist for ALLOCSET_DEFAULT_SIZES contexts
+ * and one for ALLOCSET_SMALL_SIZES contexts; the latter works for
+ * ALLOCSET_START_SMALL_SIZES too, since only the maxBlockSize differs.
+ *
+ * Ordinarily, we re-use freelist contexts in last-in-first-out order, in
+ * hopes of improving locality of reference. But if there get to be too
+ * many contexts in the list, we'd prefer to drop the most-recently-created
+ * contexts in hopes of keeping the process memory map compact.
+ * We approximate that by simply deleting all existing entries when the list
+ * overflows, on the assumption that queries that allocate a lot of contexts
+ * will probably free them in more or less reverse order of allocation.
+ *
+ * Contexts in a freelist are chained via their nextchild pointers.
+ */
+#define MAX_FREE_CONTEXTS 100 /* arbitrary limit on freelist length */
+
+typedef struct AllocSetFreeList
+{
+ int num_free; /* current list length */
+ AllocSetContext *first_free; /* list header */
+} AllocSetFreeList;
+
+/* context_freelists[0] is for default params, [1] for small params */
+static AllocSetFreeList context_freelists[2] =
+{
+ {
+ 0, NULL
+ },
+ {
+ 0, NULL
+ }
+};
+
+/*
+ * These functions implement the MemoryContext API for AllocSet contexts.
+ */
+static void *AllocSetAlloc(MemoryContext context, Size size);
+static void AllocSetFree(MemoryContext context, void *pointer);
+static void *AllocSetRealloc(MemoryContext context, void *pointer, Size size);
+static void AllocSetReset(MemoryContext context);
+static void AllocSetDelete(MemoryContext context);
+static Size AllocSetGetChunkSpace(MemoryContext context, void *pointer);
+static bool AllocSetIsEmpty(MemoryContext context);
+static void AllocSetStats(MemoryContext context,
+ MemoryStatsPrintFunc printfunc, void *passthru,
+ MemoryContextCounters *totals,
+ bool print_to_stderr);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void AllocSetCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for AllocSet contexts.
+ */
+static const MemoryContextMethods AllocSetMethods = {
+ AllocSetAlloc,
+ AllocSetFree,
+ AllocSetRealloc,
+ AllocSetReset,
+ AllocSetDelete,
+ AllocSetGetChunkSpace,
+ AllocSetIsEmpty,
+ AllocSetStats
+#ifdef MEMORY_CONTEXT_CHECKING
+ ,AllocSetCheck
+#endif
+};
+
+
+/* ----------
+ * AllocSetFreeIndex -
+ *
+ * Depending on the size of an allocation compute which freechunk
+ * list of the alloc set it belongs to. Caller must have verified
+ * that size <= ALLOC_CHUNK_LIMIT.
+ * ----------
+ */
+static inline int
+AllocSetFreeIndex(Size size)
+{
+ int idx;
+
+ if (size > (1 << ALLOC_MINBITS))
+ {
+ /*----------
+ * At this point we must compute ceil(log2(size >> ALLOC_MINBITS)).
+ * This is the same as
+ * pg_leftmost_one_pos32((size - 1) >> ALLOC_MINBITS) + 1
+ * or equivalently
+ * pg_leftmost_one_pos32(size - 1) - ALLOC_MINBITS + 1
+ *
+ * However, rather than just calling that function, we duplicate the
+ * logic here, allowing an additional optimization. It's reasonable
+ * to assume that ALLOC_CHUNK_LIMIT fits in 16 bits, so we can unroll
+ * the byte-at-a-time loop in pg_leftmost_one_pos32 and just handle
+ * the last two bytes.
+ *
+ * Yes, this function is enough of a hot-spot to make it worth this
+ * much trouble.
+ *----------
+ */
+#ifdef HAVE__BUILTIN_CLZ
+ idx = 31 - __builtin_clz((uint32) size - 1) - ALLOC_MINBITS + 1;
+#else
+ uint32 t,
+ tsize;
+
+ /* Statically assert that we only have a 16-bit input value. */
+ StaticAssertStmt(ALLOC_CHUNK_LIMIT < (1 << 16),
+ "ALLOC_CHUNK_LIMIT must be less than 64kB");
+
+ tsize = size - 1;
+ t = tsize >> 8;
+ idx = t ? pg_leftmost_one_pos[t] + 8 : pg_leftmost_one_pos[tsize];
+ idx -= ALLOC_MINBITS - 1;
+#endif
+
+ Assert(idx < ALLOCSET_NUM_FREELISTS);
+ }
+ else
+ idx = 0;
+
+ return idx;
+}
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * AllocSetContextCreateInternal
+ * Create a new AllocSet context.
+ *
+ * parent: parent context, or NULL if top-level context
+ * name: name of context (must be statically allocated)
+ * minContextSize: minimum context size
+ * initBlockSize: initial allocation block size
+ * maxBlockSize: maximum allocation block size
+ *
+ * Most callers should abstract the context size parameters using a macro
+ * such as ALLOCSET_DEFAULT_SIZES.
+ *
+ * Note: don't call this directly; go through the wrapper macro
+ * AllocSetContextCreate.
+ */
+MemoryContext
+AllocSetContextCreateInternal(MemoryContext parent,
+ const char *name,
+ Size minContextSize,
+ Size initBlockSize,
+ Size maxBlockSize)
+{
+ int freeListIndex;
+ Size firstBlockSize;
+ AllocSet set;
+ AllocBlock block;
+
+ /* Assert we padded AllocChunkData properly */
+ StaticAssertStmt(ALLOC_CHUNKHDRSZ == MAXALIGN(ALLOC_CHUNKHDRSZ),
+ "sizeof(AllocChunkData) is not maxaligned");
+ StaticAssertStmt(offsetof(AllocChunkData, aset) + sizeof(MemoryContext) ==
+ ALLOC_CHUNKHDRSZ,
+ "padding calculation in AllocChunkData is wrong");
+
+ /*
+ * First, validate allocation parameters. Once these were regular runtime
+ * test and elog's, but in practice Asserts seem sufficient because nobody
+ * varies their parameters at runtime. We somewhat arbitrarily enforce a
+ * minimum 1K block size.
+ */
+ Assert(initBlockSize == MAXALIGN(initBlockSize) &&
+ initBlockSize >= 1024);
+ Assert(maxBlockSize == MAXALIGN(maxBlockSize) &&
+ maxBlockSize >= initBlockSize &&
+ AllocHugeSizeIsValid(maxBlockSize)); /* must be safe to double */
+ Assert(minContextSize == 0 ||
+ (minContextSize == MAXALIGN(minContextSize) &&
+ minContextSize >= 1024 &&
+ minContextSize <= maxBlockSize));
+
+ /*
+ * Check whether the parameters match either available freelist. We do
+ * not need to demand a match of maxBlockSize.
+ */
+ if (minContextSize == ALLOCSET_DEFAULT_MINSIZE &&
+ initBlockSize == ALLOCSET_DEFAULT_INITSIZE)
+ freeListIndex = 0;
+ else if (minContextSize == ALLOCSET_SMALL_MINSIZE &&
+ initBlockSize == ALLOCSET_SMALL_INITSIZE)
+ freeListIndex = 1;
+ else
+ freeListIndex = -1;
+
+ /*
+ * If a suitable freelist entry exists, just recycle that context.
+ */
+ if (freeListIndex >= 0)
+ {
+ AllocSetFreeList *freelist = &context_freelists[freeListIndex];
+
+ if (freelist->first_free != NULL)
+ {
+ /* Remove entry from freelist */
+ set = freelist->first_free;
+ freelist->first_free = (AllocSet) set->header.nextchild;
+ freelist->num_free--;
+
+ /* Update its maxBlockSize; everything else should be OK */
+ set->maxBlockSize = maxBlockSize;
+
+ /* Reinitialize its header, installing correct name and parent */
+ MemoryContextCreate((MemoryContext) set,
+ T_AllocSetContext,
+ &AllocSetMethods,
+ parent,
+ name);
+
+ ((MemoryContext) set)->mem_allocated =
+ set->keeper->endptr - ((char *) set);
+
+ return (MemoryContext) set;
+ }
+ }
+
+ /* Determine size of initial block */
+ firstBlockSize = MAXALIGN(sizeof(AllocSetContext)) +
+ ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
+ if (minContextSize != 0)
+ firstBlockSize = Max(firstBlockSize, minContextSize);
+ else
+ firstBlockSize = Max(firstBlockSize, initBlockSize);
+
+ /*
+ * Allocate the initial block. Unlike other aset.c blocks, it starts with
+ * the context header and its block header follows that.
+ */
+ set = (AllocSet) malloc(firstBlockSize);
+ if (set == NULL)
+ {
+ if (TopMemoryContext)
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed while creating memory context \"%s\".",
+ name)));
+ }
+
+ /*
+ * Avoid writing code that can fail between here and MemoryContextCreate;
+ * we'd leak the header/initial block if we ereport in this stretch.
+ */
+
+ /* Fill in the initial block's block header */
+ block = (AllocBlock) (((char *) set) + MAXALIGN(sizeof(AllocSetContext)));
+ block->aset = set;
+ block->freeptr = ((char *) block) + ALLOC_BLOCKHDRSZ;
+ block->endptr = ((char *) set) + firstBlockSize;
+ block->prev = NULL;
+ block->next = NULL;
+
+ /* Mark unallocated space NOACCESS; leave the block header alone. */
+ VALGRIND_MAKE_MEM_NOACCESS(block->freeptr, block->endptr - block->freeptr);
+
+ /* Remember block as part of block list */
+ set->blocks = block;
+ /* Mark block as not to be released at reset time */
+ set->keeper = block;
+
+ /* Finish filling in aset-specific parts of the context header */
+ MemSetAligned(set->freelist, 0, sizeof(set->freelist));
+
+ set->initBlockSize = initBlockSize;
+ set->maxBlockSize = maxBlockSize;
+ set->nextBlockSize = initBlockSize;
+ set->freeListIndex = freeListIndex;
+
+ /*
+ * Compute the allocation chunk size limit for this context. It can't be
+ * more than ALLOC_CHUNK_LIMIT because of the fixed number of freelists.
+ * If maxBlockSize is small then requests exceeding the maxBlockSize, or
+ * even a significant fraction of it, should be treated as large chunks
+ * too. For the typical case of maxBlockSize a power of 2, the chunk size
+ * limit will be at most 1/8th maxBlockSize, so that given a stream of
+ * requests that are all the maximum chunk size we will waste at most
+ * 1/8th of the allocated space.
+ *
+ * We have to have allocChunkLimit a power of two, because the requested
+ * and actually-allocated sizes of any chunk must be on the same side of
+ * the limit, else we get confused about whether the chunk is "big".
+ *
+ * Also, allocChunkLimit must not exceed ALLOCSET_SEPARATE_THRESHOLD.
+ */
+ StaticAssertStmt(ALLOC_CHUNK_LIMIT == ALLOCSET_SEPARATE_THRESHOLD,
+ "ALLOC_CHUNK_LIMIT != ALLOCSET_SEPARATE_THRESHOLD");
+
+ set->allocChunkLimit = ALLOC_CHUNK_LIMIT;
+ while ((Size) (set->allocChunkLimit + ALLOC_CHUNKHDRSZ) >
+ (Size) ((maxBlockSize - ALLOC_BLOCKHDRSZ) / ALLOC_CHUNK_FRACTION))
+ set->allocChunkLimit >>= 1;
+
+ /* Finally, do the type-independent part of context creation */
+ MemoryContextCreate((MemoryContext) set,
+ T_AllocSetContext,
+ &AllocSetMethods,
+ parent,
+ name);
+
+ ((MemoryContext) set)->mem_allocated = firstBlockSize;
+
+ return (MemoryContext) set;
+}
+
+/*
+ * AllocSetReset
+ * Frees all memory which is allocated in the given set.
+ *
+ * Actually, this routine has some discretion about what to do.
+ * It should mark all allocated chunks freed, but it need not necessarily
+ * give back all the resources the set owns. Our actual implementation is
+ * that we give back all but the "keeper" block (which we must keep, since
+ * it shares a malloc chunk with the context header). In this way, we don't
+ * thrash malloc() when a context is repeatedly reset after small allocations,
+ * which is typical behavior for per-tuple contexts.
+ */
+static void
+AllocSetReset(MemoryContext context)
+{
+ AllocSet set = (AllocSet) context;
+ AllocBlock block;
+ Size keepersize PG_USED_FOR_ASSERTS_ONLY
+ = set->keeper->endptr - ((char *) set);
+
+ AssertArg(AllocSetIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Check for corruption and leaks before freeing */
+ AllocSetCheck(context);
+#endif
+
+ /* Clear chunk freelists */
+ MemSetAligned(set->freelist, 0, sizeof(set->freelist));
+
+ block = set->blocks;
+
+ /* New blocks list will be just the keeper block */
+ set->blocks = set->keeper;
+
+ while (block != NULL)
+ {
+ AllocBlock next = block->next;
+
+ if (block == set->keeper)
+ {
+ /* Reset the block, but don't return it to malloc */
+ char *datastart = ((char *) block) + ALLOC_BLOCKHDRSZ;
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(datastart, block->freeptr - datastart);
+#else
+ /* wipe_mem() would have done this */
+ VALGRIND_MAKE_MEM_NOACCESS(datastart, block->freeptr - datastart);
+#endif
+ block->freeptr = datastart;
+ block->prev = NULL;
+ block->next = NULL;
+ }
+ else
+ {
+ /* Normal case, release the block */
+ context->mem_allocated -= block->endptr - ((char *) block);
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(block, block->freeptr - ((char *) block));
+#endif
+ free(block);
+ }
+ block = next;
+ }
+
+ Assert(context->mem_allocated == keepersize);
+
+ /* Reset block size allocation sequence, too */
+ set->nextBlockSize = set->initBlockSize;
+}
+
+/*
+ * AllocSetDelete
+ * Frees all memory which is allocated in the given set,
+ * in preparation for deletion of the set.
+ *
+ * Unlike AllocSetReset, this *must* free all resources of the set.
+ */
+static void
+AllocSetDelete(MemoryContext context)
+{
+ AllocSet set = (AllocSet) context;
+ AllocBlock block = set->blocks;
+ Size keepersize PG_USED_FOR_ASSERTS_ONLY
+ = set->keeper->endptr - ((char *) set);
+
+ AssertArg(AllocSetIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Check for corruption and leaks before freeing */
+ AllocSetCheck(context);
+#endif
+
+ /*
+ * If the context is a candidate for a freelist, put it into that freelist
+ * instead of destroying it.
+ */
+ if (set->freeListIndex >= 0)
+ {
+ AllocSetFreeList *freelist = &context_freelists[set->freeListIndex];
+
+ /*
+ * Reset the context, if it needs it, so that we aren't hanging on to
+ * more than the initial malloc chunk.
+ */
+ if (!context->isReset)
+ MemoryContextResetOnly(context);
+
+ /*
+ * If the freelist is full, just discard what's already in it. See
+ * comments with context_freelists[].
+ */
+ if (freelist->num_free >= MAX_FREE_CONTEXTS)
+ {
+ while (freelist->first_free != NULL)
+ {
+ AllocSetContext *oldset = freelist->first_free;
+
+ freelist->first_free = (AllocSetContext *) oldset->header.nextchild;
+ freelist->num_free--;
+
+ /* All that remains is to free the header/initial block */
+ free(oldset);
+ }
+ Assert(freelist->num_free == 0);
+ }
+
+ /* Now add the just-deleted context to the freelist. */
+ set->header.nextchild = (MemoryContext) freelist->first_free;
+ freelist->first_free = set;
+ freelist->num_free++;
+
+ return;
+ }
+
+ /* Free all blocks, except the keeper which is part of context header */
+ while (block != NULL)
+ {
+ AllocBlock next = block->next;
+
+ if (block != set->keeper)
+ context->mem_allocated -= block->endptr - ((char *) block);
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(block, block->freeptr - ((char *) block));
+#endif
+
+ if (block != set->keeper)
+ free(block);
+
+ block = next;
+ }
+
+ Assert(context->mem_allocated == keepersize);
+
+ /* Finally, free the context header, including the keeper block */
+ free(set);
+}
+
+/*
+ * AllocSetAlloc
+ * Returns pointer to allocated memory of given size or NULL if
+ * request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ * MAXALIGN_DOWN(SIZE_MAX) - ALLOC_BLOCKHDRSZ - ALLOC_CHUNKHDRSZ
+ * All callers use a much-lower limit.
+ *
+ * Note: when using valgrind, it doesn't matter how the returned allocation
+ * is marked, as mcxt.c will set it to UNDEFINED. In some paths we will
+ * return space that is marked NOACCESS - AllocSetRealloc has to beware!
+ */
+static void *
+AllocSetAlloc(MemoryContext context, Size size)
+{
+ AllocSet set = (AllocSet) context;
+ AllocBlock block;
+ AllocChunk chunk;
+ int fidx;
+ Size chunk_size;
+ Size blksize;
+
+ AssertArg(AllocSetIsValid(set));
+
+ /*
+ * If requested size exceeds maximum for chunks, allocate an entire block
+ * for this request.
+ */
+ if (size > set->allocChunkLimit)
+ {
+ chunk_size = MAXALIGN(size);
+ blksize = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
+ block = (AllocBlock) malloc(blksize);
+ if (block == NULL)
+ return NULL;
+
+ context->mem_allocated += blksize;
+
+ block->aset = set;
+ block->freeptr = block->endptr = ((char *) block) + blksize;
+
+ chunk = (AllocChunk) (((char *) block) + ALLOC_BLOCKHDRSZ);
+ chunk->aset = set;
+ chunk->size = chunk_size;
+#ifdef MEMORY_CONTEXT_CHECKING
+ chunk->requested_size = size;
+ /* set mark to catch clobber of "unused" space */
+ if (size < chunk_size)
+ set_sentinel(AllocChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* fill the allocated space with junk */
+ randomize_mem((char *) AllocChunkGetPointer(chunk), size);
+#endif
+
+ /*
+ * Stick the new block underneath the active allocation block, if any,
+ * so that we don't lose the use of the space remaining therein.
+ */
+ if (set->blocks != NULL)
+ {
+ block->prev = set->blocks;
+ block->next = set->blocks->next;
+ if (block->next)
+ block->next->prev = block;
+ set->blocks->next = block;
+ }
+ else
+ {
+ block->prev = NULL;
+ block->next = NULL;
+ set->blocks = block;
+ }
+
+ /* Ensure any padding bytes are marked NOACCESS. */
+ VALGRIND_MAKE_MEM_NOACCESS((char *) AllocChunkGetPointer(chunk) + size,
+ chunk_size - size);
+
+ /* Disallow external access to private part of chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN);
+
+ return AllocChunkGetPointer(chunk);
+ }
+
+ /*
+ * Request is small enough to be treated as a chunk. Look in the
+ * corresponding free list to see if there is a free chunk we could reuse.
+ * If one is found, remove it from the free list, make it again a member
+ * of the alloc set and return its data address.
+ */
+ fidx = AllocSetFreeIndex(size);
+ chunk = set->freelist[fidx];
+ if (chunk != NULL)
+ {
+ Assert(chunk->size >= size);
+
+ set->freelist[fidx] = (AllocChunk) chunk->aset;
+
+ chunk->aset = (void *) set;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ chunk->requested_size = size;
+ /* set mark to catch clobber of "unused" space */
+ if (size < chunk->size)
+ set_sentinel(AllocChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* fill the allocated space with junk */
+ randomize_mem((char *) AllocChunkGetPointer(chunk), size);
+#endif
+
+ /* Ensure any padding bytes are marked NOACCESS. */
+ VALGRIND_MAKE_MEM_NOACCESS((char *) AllocChunkGetPointer(chunk) + size,
+ chunk->size - size);
+
+ /* Disallow external access to private part of chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN);
+
+ return AllocChunkGetPointer(chunk);
+ }
+
+ /*
+ * Choose the actual chunk size to allocate.
+ */
+ chunk_size = (1 << ALLOC_MINBITS) << fidx;
+ Assert(chunk_size >= size);
+
+ /*
+ * If there is enough room in the active allocation block, we will put the
+ * chunk into that block. Else must start a new one.
+ */
+ if ((block = set->blocks) != NULL)
+ {
+ Size availspace = block->endptr - block->freeptr;
+
+ if (availspace < (chunk_size + ALLOC_CHUNKHDRSZ))
+ {
+ /*
+ * The existing active (top) block does not have enough room for
+ * the requested allocation, but it might still have a useful
+ * amount of space in it. Once we push it down in the block list,
+ * we'll never try to allocate more space from it. So, before we
+ * do that, carve up its free space into chunks that we can put on
+ * the set's freelists.
+ *
+ * Because we can only get here when there's less than
+ * ALLOC_CHUNK_LIMIT left in the block, this loop cannot iterate
+ * more than ALLOCSET_NUM_FREELISTS-1 times.
+ */
+ while (availspace >= ((1 << ALLOC_MINBITS) + ALLOC_CHUNKHDRSZ))
+ {
+ Size availchunk = availspace - ALLOC_CHUNKHDRSZ;
+ int a_fidx = AllocSetFreeIndex(availchunk);
+
+ /*
+ * In most cases, we'll get back the index of the next larger
+ * freelist than the one we need to put this chunk on. The
+ * exception is when availchunk is exactly a power of 2.
+ */
+ if (availchunk != ((Size) 1 << (a_fidx + ALLOC_MINBITS)))
+ {
+ a_fidx--;
+ Assert(a_fidx >= 0);
+ availchunk = ((Size) 1 << (a_fidx + ALLOC_MINBITS));
+ }
+
+ chunk = (AllocChunk) (block->freeptr);
+
+ /* Prepare to initialize the chunk header. */
+ VALGRIND_MAKE_MEM_UNDEFINED(chunk, ALLOC_CHUNKHDRSZ);
+
+ block->freeptr += (availchunk + ALLOC_CHUNKHDRSZ);
+ availspace -= (availchunk + ALLOC_CHUNKHDRSZ);
+
+ chunk->size = availchunk;
+#ifdef MEMORY_CONTEXT_CHECKING
+ chunk->requested_size = 0; /* mark it free */
+#endif
+ chunk->aset = (void *) set->freelist[a_fidx];
+ set->freelist[a_fidx] = chunk;
+ }
+
+ /* Mark that we need to create a new block */
+ block = NULL;
+ }
+ }
+
+ /*
+ * Time to create a new regular (multi-chunk) block?
+ */
+ if (block == NULL)
+ {
+ Size required_size;
+
+ /*
+ * The first such block has size initBlockSize, and we double the
+ * space in each succeeding block, but not more than maxBlockSize.
+ */
+ blksize = set->nextBlockSize;
+ set->nextBlockSize <<= 1;
+ if (set->nextBlockSize > set->maxBlockSize)
+ set->nextBlockSize = set->maxBlockSize;
+
+ /*
+ * If initBlockSize is less than ALLOC_CHUNK_LIMIT, we could need more
+ * space... but try to keep it a power of 2.
+ */
+ required_size = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
+ while (blksize < required_size)
+ blksize <<= 1;
+
+ /* Try to allocate it */
+ block = (AllocBlock) malloc(blksize);
+
+ /*
+ * We could be asking for pretty big blocks here, so cope if malloc
+ * fails. But give up if there's less than 1 MB or so available...
+ */
+ while (block == NULL && blksize > 1024 * 1024)
+ {
+ blksize >>= 1;
+ if (blksize < required_size)
+ break;
+ block = (AllocBlock) malloc(blksize);
+ }
+
+ if (block == NULL)
+ return NULL;
+
+ context->mem_allocated += blksize;
+
+ block->aset = set;
+ block->freeptr = ((char *) block) + ALLOC_BLOCKHDRSZ;
+ block->endptr = ((char *) block) + blksize;
+
+ /* Mark unallocated space NOACCESS. */
+ VALGRIND_MAKE_MEM_NOACCESS(block->freeptr,
+ blksize - ALLOC_BLOCKHDRSZ);
+
+ block->prev = NULL;
+ block->next = set->blocks;
+ if (block->next)
+ block->next->prev = block;
+ set->blocks = block;
+ }
+
+ /*
+ * OK, do the allocation
+ */
+ chunk = (AllocChunk) (block->freeptr);
+
+ /* Prepare to initialize the chunk header. */
+ VALGRIND_MAKE_MEM_UNDEFINED(chunk, ALLOC_CHUNKHDRSZ);
+
+ block->freeptr += (chunk_size + ALLOC_CHUNKHDRSZ);
+ Assert(block->freeptr <= block->endptr);
+
+ chunk->aset = (void *) set;
+ chunk->size = chunk_size;
+#ifdef MEMORY_CONTEXT_CHECKING
+ chunk->requested_size = size;
+ /* set mark to catch clobber of "unused" space */
+ if (size < chunk->size)
+ set_sentinel(AllocChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* fill the allocated space with junk */
+ randomize_mem((char *) AllocChunkGetPointer(chunk), size);
+#endif
+
+ /* Ensure any padding bytes are marked NOACCESS. */
+ VALGRIND_MAKE_MEM_NOACCESS((char *) AllocChunkGetPointer(chunk) + size,
+ chunk_size - size);
+
+ /* Disallow external access to private part of chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN);
+
+ return AllocChunkGetPointer(chunk);
+}
+
+/*
+ * AllocSetFree
+ * Frees allocated memory; memory is removed from the set.
+ */
+static void
+AllocSetFree(MemoryContext context, void *pointer)
+{
+ AllocSet set = (AllocSet) context;
+ AllocChunk chunk = AllocPointerGetChunk(pointer);
+
+ /* Allow access to private part of chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, ALLOCCHUNK_PRIVATE_LEN);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Test for someone scribbling on unused space in chunk */
+ if (chunk->requested_size < chunk->size)
+ if (!sentinel_ok(pointer, chunk->requested_size))
+ elog(WARNING, "detected write past chunk end in %s %p",
+ set->header.name, chunk);
+#endif
+
+ if (chunk->size > set->allocChunkLimit)
+ {
+ /*
+ * Big chunks are certain to have been allocated as single-chunk
+ * blocks. Just unlink that block and return it to malloc().
+ */
+ AllocBlock block = (AllocBlock) (((char *) chunk) - ALLOC_BLOCKHDRSZ);
+
+ /*
+ * Try to verify that we have a sane block pointer: it should
+ * reference the correct aset, and freeptr and endptr should point
+ * just past the chunk.
+ */
+ if (block->aset != set ||
+ block->freeptr != block->endptr ||
+ block->freeptr != ((char *) block) +
+ (chunk->size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ))
+ elog(ERROR, "could not find block containing chunk %p", chunk);
+
+ /* OK, remove block from aset's list and free it */
+ if (block->prev)
+ block->prev->next = block->next;
+ else
+ set->blocks = block->next;
+ if (block->next)
+ block->next->prev = block->prev;
+
+ context->mem_allocated -= block->endptr - ((char *) block);
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(block, block->freeptr - ((char *) block));
+#endif
+ free(block);
+ }
+ else
+ {
+ /* Normal case, put the chunk into appropriate freelist */
+ int fidx = AllocSetFreeIndex(chunk->size);
+
+ chunk->aset = (void *) set->freelist[fidx];
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(pointer, chunk->size);
+#endif
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Reset requested_size to 0 in chunks that are on freelist */
+ chunk->requested_size = 0;
+#endif
+ set->freelist[fidx] = chunk;
+ }
+}
+
+/*
+ * AllocSetRealloc
+ * Returns new pointer to allocated memory of given size or NULL if
+ * request could not be completed; this memory is added to the set.
+ * Memory associated with given pointer is copied into the new memory,
+ * and the old memory is freed.
+ *
+ * Without MEMORY_CONTEXT_CHECKING, we don't know the old request size. This
+ * makes our Valgrind client requests less-precise, hazarding false negatives.
+ * (In principle, we could use VALGRIND_GET_VBITS() to rediscover the old
+ * request size.)
+ */
+static void *
+AllocSetRealloc(MemoryContext context, void *pointer, Size size)
+{
+ AllocSet set = (AllocSet) context;
+ AllocChunk chunk = AllocPointerGetChunk(pointer);
+ Size oldsize;
+
+ /* Allow access to private part of chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, ALLOCCHUNK_PRIVATE_LEN);
+
+ oldsize = chunk->size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Test for someone scribbling on unused space in chunk */
+ if (chunk->requested_size < oldsize)
+ if (!sentinel_ok(pointer, chunk->requested_size))
+ elog(WARNING, "detected write past chunk end in %s %p",
+ set->header.name, chunk);
+#endif
+
+ if (oldsize > set->allocChunkLimit)
+ {
+ /*
+ * The chunk must have been allocated as a single-chunk block. Use
+ * realloc() to make the containing block bigger, or smaller, with
+ * minimum space wastage.
+ */
+ AllocBlock block = (AllocBlock) (((char *) chunk) - ALLOC_BLOCKHDRSZ);
+ Size chksize;
+ Size blksize;
+ Size oldblksize;
+
+ /*
+ * Try to verify that we have a sane block pointer: it should
+ * reference the correct aset, and freeptr and endptr should point
+ * just past the chunk.
+ */
+ if (block->aset != set ||
+ block->freeptr != block->endptr ||
+ block->freeptr != ((char *) block) +
+ (oldsize + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ))
+ elog(ERROR, "could not find block containing chunk %p", chunk);
+
+ /*
+ * Even if the new request is less than set->allocChunkLimit, we stick
+ * with the single-chunk block approach. Therefore we need
+ * chunk->size to be bigger than set->allocChunkLimit, so we don't get
+ * confused about the chunk's status in future calls.
+ */
+ chksize = Max(size, set->allocChunkLimit + 1);
+ chksize = MAXALIGN(chksize);
+
+ /* Do the realloc */
+ blksize = chksize + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ;
+ oldblksize = block->endptr - ((char *) block);
+
+ block = (AllocBlock) realloc(block, blksize);
+ if (block == NULL)
+ {
+ /* Disallow external access to private part of chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN);
+ return NULL;
+ }
+
+ /* updated separately, not to underflow when (oldblksize > blksize) */
+ context->mem_allocated -= oldblksize;
+ context->mem_allocated += blksize;
+
+ block->freeptr = block->endptr = ((char *) block) + blksize;
+
+ /* Update pointers since block has likely been moved */
+ chunk = (AllocChunk) (((char *) block) + ALLOC_BLOCKHDRSZ);
+ pointer = AllocChunkGetPointer(chunk);
+ if (block->prev)
+ block->prev->next = block;
+ else
+ set->blocks = block;
+ if (block->next)
+ block->next->prev = block;
+ chunk->size = chksize;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* We can only fill the extra space if we know the prior request */
+ if (size > chunk->requested_size)
+ randomize_mem((char *) pointer + chunk->requested_size,
+ size - chunk->requested_size);
+#endif
+
+ /*
+ * realloc() (or randomize_mem()) will have left any newly-allocated
+ * part UNDEFINED, but we may need to adjust trailing bytes from the
+ * old allocation.
+ */
+#ifdef USE_VALGRIND
+ if (oldsize > chunk->requested_size)
+ VALGRIND_MAKE_MEM_UNDEFINED((char *) pointer + chunk->requested_size,
+ oldsize - chunk->requested_size);
+#endif
+
+ chunk->requested_size = size;
+
+ /* set mark to catch clobber of "unused" space */
+ if (size < chunk->size)
+ set_sentinel(pointer, size);
+#else /* !MEMORY_CONTEXT_CHECKING */
+
+ /*
+ * We don't know how much of the old chunk size was the actual
+ * allocation; it could have been as small as one byte. We have to be
+ * conservative and just mark the entire old portion DEFINED.
+ */
+ VALGRIND_MAKE_MEM_DEFINED(pointer, oldsize);
+#endif
+
+ /* Ensure any padding bytes are marked NOACCESS. */
+ VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size, chksize - size);
+
+ /* Disallow external access to private part of chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN);
+
+ return pointer;
+ }
+
+ /*
+ * Chunk sizes are aligned to power of 2 in AllocSetAlloc(). Maybe the
+ * allocated area already is >= the new size. (In particular, we will
+ * fall out here if the requested size is a decrease.)
+ */
+ else if (oldsize >= size)
+ {
+#ifdef MEMORY_CONTEXT_CHECKING
+ Size oldrequest = chunk->requested_size;
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* We can only fill the extra space if we know the prior request */
+ if (size > oldrequest)
+ randomize_mem((char *) pointer + oldrequest,
+ size - oldrequest);
+#endif
+
+ chunk->requested_size = size;
+
+ /*
+ * If this is an increase, mark any newly-available part UNDEFINED.
+ * Otherwise, mark the obsolete part NOACCESS.
+ */
+ if (size > oldrequest)
+ VALGRIND_MAKE_MEM_UNDEFINED((char *) pointer + oldrequest,
+ size - oldrequest);
+ else
+ VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size,
+ oldsize - size);
+
+ /* set mark to catch clobber of "unused" space */
+ if (size < oldsize)
+ set_sentinel(pointer, size);
+#else /* !MEMORY_CONTEXT_CHECKING */
+
+ /*
+ * We don't have the information to determine whether we're growing
+ * the old request or shrinking it, so we conservatively mark the
+ * entire new allocation DEFINED.
+ */
+ VALGRIND_MAKE_MEM_NOACCESS(pointer, oldsize);
+ VALGRIND_MAKE_MEM_DEFINED(pointer, size);
+#endif
+
+ /* Disallow external access to private part of chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN);
+
+ return pointer;
+ }
+ else
+ {
+ /*
+ * Enlarge-a-small-chunk case. We just do this by brute force, ie,
+ * allocate a new chunk and copy the data. Since we know the existing
+ * data isn't huge, this won't involve any great memcpy expense, so
+ * it's not worth being smarter. (At one time we tried to avoid
+ * memcpy when it was possible to enlarge the chunk in-place, but that
+ * turns out to misbehave unpleasantly for repeated cycles of
+ * palloc/repalloc/pfree: the eventually freed chunks go into the
+ * wrong freelist for the next initial palloc request, and so we leak
+ * memory indefinitely. See pgsql-hackers archives for 2007-08-11.)
+ */
+ AllocPointer newPointer;
+
+ /* allocate new chunk */
+ newPointer = AllocSetAlloc((MemoryContext) set, size);
+
+ /* leave immediately if request was not completed */
+ if (newPointer == NULL)
+ {
+ /* Disallow external access to private part of chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN);
+ return NULL;
+ }
+
+ /*
+ * AllocSetAlloc() may have returned a region that is still NOACCESS.
+ * Change it to UNDEFINED for the moment; memcpy() will then transfer
+ * definedness from the old allocation to the new. If we know the old
+ * allocation, copy just that much. Otherwise, make the entire old
+ * chunk defined to avoid errors as we copy the currently-NOACCESS
+ * trailing bytes.
+ */
+ VALGRIND_MAKE_MEM_UNDEFINED(newPointer, size);
+#ifdef MEMORY_CONTEXT_CHECKING
+ oldsize = chunk->requested_size;
+#else
+ VALGRIND_MAKE_MEM_DEFINED(pointer, oldsize);
+#endif
+
+ /* transfer existing data (certain to fit) */
+ memcpy(newPointer, pointer, oldsize);
+
+ /* free old chunk */
+ AllocSetFree((MemoryContext) set, pointer);
+
+ return newPointer;
+ }
+}
+
+/*
+ * AllocSetGetChunkSpace
+ * Given a currently-allocated chunk, determine the total space
+ * it occupies (including all memory-allocation overhead).
+ */
+static Size
+AllocSetGetChunkSpace(MemoryContext context, void *pointer)
+{
+ AllocChunk chunk = AllocPointerGetChunk(pointer);
+ Size result;
+
+ VALGRIND_MAKE_MEM_DEFINED(chunk, ALLOCCHUNK_PRIVATE_LEN);
+ result = chunk->size + ALLOC_CHUNKHDRSZ;
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN);
+ return result;
+}
+
+/*
+ * AllocSetIsEmpty
+ * Is an allocset empty of any allocated space?
+ */
+static bool
+AllocSetIsEmpty(MemoryContext context)
+{
+ /*
+ * For now, we say "empty" only if the context is new or just reset. We
+ * could examine the freelists to determine if all space has been freed,
+ * but it's not really worth the trouble for present uses of this
+ * functionality.
+ */
+ if (context->isReset)
+ return true;
+ return false;
+}
+
+/*
+ * AllocSetStats
+ * Compute stats about memory consumption of an allocset.
+ *
+ * printfunc: if not NULL, pass a human-readable stats string to this.
+ * passthru: pass this pointer through to printfunc.
+ * totals: if not NULL, add stats about this context into *totals.
+ * print_to_stderr: print stats to stderr if true, elog otherwise.
+ */
+static void
+AllocSetStats(MemoryContext context,
+ MemoryStatsPrintFunc printfunc, void *passthru,
+ MemoryContextCounters *totals, bool print_to_stderr)
+{
+ AllocSet set = (AllocSet) context;
+ Size nblocks = 0;
+ Size freechunks = 0;
+ Size totalspace;
+ Size freespace = 0;
+ AllocBlock block;
+ int fidx;
+
+ /* Include context header in totalspace */
+ totalspace = MAXALIGN(sizeof(AllocSetContext));
+
+ for (block = set->blocks; block != NULL; block = block->next)
+ {
+ nblocks++;
+ totalspace += block->endptr - ((char *) block);
+ freespace += block->endptr - block->freeptr;
+ }
+ for (fidx = 0; fidx < ALLOCSET_NUM_FREELISTS; fidx++)
+ {
+ AllocChunk chunk;
+
+ for (chunk = set->freelist[fidx]; chunk != NULL;
+ chunk = (AllocChunk) chunk->aset)
+ {
+ freechunks++;
+ freespace += chunk->size + ALLOC_CHUNKHDRSZ;
+ }
+ }
+
+ if (printfunc)
+ {
+ char stats_string[200];
+
+ snprintf(stats_string, sizeof(stats_string),
+ "%zu total in %zd blocks; %zu free (%zd chunks); %zu used",
+ totalspace, nblocks, freespace, freechunks,
+ totalspace - freespace);
+ printfunc(context, passthru, stats_string, print_to_stderr);
+ }
+
+ if (totals)
+ {
+ totals->nblocks += nblocks;
+ totals->freechunks += freechunks;
+ totals->totalspace += totalspace;
+ totals->freespace += freespace;
+ }
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * AllocSetCheck
+ * Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL. Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+AllocSetCheck(MemoryContext context)
+{
+ AllocSet set = (AllocSet) context;
+ const char *name = set->header.name;
+ AllocBlock prevblock;
+ AllocBlock block;
+ Size total_allocated = 0;
+
+ for (prevblock = NULL, block = set->blocks;
+ block != NULL;
+ prevblock = block, block = block->next)
+ {
+ char *bpoz = ((char *) block) + ALLOC_BLOCKHDRSZ;
+ long blk_used = block->freeptr - bpoz;
+ long blk_data = 0;
+ long nchunks = 0;
+
+ if (set->keeper == block)
+ total_allocated += block->endptr - ((char *) set);
+ else
+ total_allocated += block->endptr - ((char *) block);
+
+ /*
+ * Empty block - empty can be keeper-block only
+ */
+ if (!blk_used)
+ {
+ if (set->keeper != block)
+ elog(WARNING, "problem in alloc set %s: empty block %p",
+ name, block);
+ }
+
+ /*
+ * Check block header fields
+ */
+ if (block->aset != set ||
+ block->prev != prevblock ||
+ block->freeptr < bpoz ||
+ block->freeptr > block->endptr)
+ elog(WARNING, "problem in alloc set %s: corrupt header in block %p",
+ name, block);
+
+ /*
+ * Chunk walker
+ */
+ while (bpoz < block->freeptr)
+ {
+ AllocChunk chunk = (AllocChunk) bpoz;
+ Size chsize,
+ dsize;
+
+ /* Allow access to private part of chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, ALLOCCHUNK_PRIVATE_LEN);
+
+ chsize = chunk->size; /* aligned chunk size */
+ dsize = chunk->requested_size; /* real data */
+
+ /*
+ * Check chunk size
+ */
+ if (dsize > chsize)
+ elog(WARNING, "problem in alloc set %s: req size > alloc size for chunk %p in block %p",
+ name, chunk, block);
+ if (chsize < (1 << ALLOC_MINBITS))
+ elog(WARNING, "problem in alloc set %s: bad size %zu for chunk %p in block %p",
+ name, chsize, chunk, block);
+
+ /* single-chunk block? */
+ if (chsize > set->allocChunkLimit &&
+ chsize + ALLOC_CHUNKHDRSZ != blk_used)
+ elog(WARNING, "problem in alloc set %s: bad single-chunk %p in block %p",
+ name, chunk, block);
+
+ /*
+ * If chunk is allocated, check for correct aset pointer. (If it's
+ * free, the aset is the freelist pointer, which we can't check as
+ * easily...) Note this is an incomplete test, since palloc(0)
+ * produces an allocated chunk with requested_size == 0.
+ */
+ if (dsize > 0 && chunk->aset != (void *) set)
+ elog(WARNING, "problem in alloc set %s: bogus aset link in block %p, chunk %p",
+ name, block, chunk);
+
+ /*
+ * Check for overwrite of padding space in an allocated chunk.
+ */
+ if (chunk->aset == (void *) set && dsize < chsize &&
+ !sentinel_ok(chunk, ALLOC_CHUNKHDRSZ + dsize))
+ elog(WARNING, "problem in alloc set %s: detected write past chunk end in block %p, chunk %p",
+ name, block, chunk);
+
+ /*
+ * If chunk is allocated, disallow external access to private part
+ * of chunk header.
+ */
+ if (chunk->aset == (void *) set)
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN);
+
+ blk_data += chsize;
+ nchunks++;
+
+ bpoz += ALLOC_CHUNKHDRSZ + chsize;
+ }
+
+ if ((blk_data + (nchunks * ALLOC_CHUNKHDRSZ)) != blk_used)
+ elog(WARNING, "problem in alloc set %s: found inconsistent memory block %p",
+ name, block);
+ }
+
+ Assert(total_allocated == context->mem_allocated);
+}
+
+#endif /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c
new file mode 100644
index 0000000..7e2a20b
--- /dev/null
+++ b/src/backend/utils/mmgr/dsa.c
@@ -0,0 +1,2287 @@
+/*-------------------------------------------------------------------------
+ *
+ * dsa.c
+ * Dynamic shared memory areas.
+ *
+ * This module provides dynamic shared memory areas which are built on top of
+ * DSM segments. While dsm.c allows segments of memory of shared memory to be
+ * created and shared between backends, it isn't designed to deal with small
+ * objects. A DSA area is a shared memory heap usually backed by one or more
+ * DSM segments which can allocate memory using dsa_allocate() and dsa_free().
+ * Alternatively, it can be created in pre-existing shared memory, including a
+ * DSM segment, and then create extra DSM segments as required. Unlike the
+ * regular system heap, it deals in pseudo-pointers which must be converted to
+ * backend-local pointers before they are dereferenced. These pseudo-pointers
+ * can however be shared with other backends, and can be used to construct
+ * shared data structures.
+ *
+ * Each DSA area manages a set of DSM segments, adding new segments as
+ * required and detaching them when they are no longer needed. Each segment
+ * contains a number of 4KB pages, a free page manager for tracking
+ * consecutive runs of free pages, and a page map for tracking the source of
+ * objects allocated on each page. Allocation requests above 8KB are handled
+ * by choosing a segment and finding consecutive free pages in its free page
+ * manager. Allocation requests for smaller sizes are handled using pools of
+ * objects of a selection of sizes. Each pool consists of a number of 16 page
+ * (64KB) superblocks allocated in the same way as large objects. Allocation
+ * of large objects and new superblocks is serialized by a single LWLock, but
+ * allocation of small objects from pre-existing superblocks uses one LWLock
+ * per pool. Currently there is one pool, and therefore one lock, per size
+ * class. Per-core pools to increase concurrency and strategies for reducing
+ * the resulting fragmentation are areas for future research. Each superblock
+ * is managed with a 'span', which tracks the superblock's freelist. Free
+ * requests are handled by looking in the page map to find which span an
+ * address was allocated from, so that small objects can be returned to the
+ * appropriate free list, and large object pages can be returned directly to
+ * the free page map. When allocating, simple heuristics for selecting
+ * segments and superblocks try to encourage occupied memory to be
+ * concentrated, increasing the likelihood that whole superblocks can become
+ * empty and be returned to the free page manager, and whole segments can
+ * become empty and be returned to the operating system.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/utils/mmgr/dsa.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "port/atomics.h"
+#include "storage/dsm.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "utils/dsa.h"
+#include "utils/freepage.h"
+#include "utils/memutils.h"
+
+/*
+ * The size of the initial DSM segment that backs a dsa_area created by
+ * dsa_create. After creating some number of segments of this size we'll
+ * double this size, and so on. Larger segments may be created if necessary
+ * to satisfy large requests.
+ */
+#define DSA_INITIAL_SEGMENT_SIZE ((size_t) (1 * 1024 * 1024))
+
+/*
+ * How many segments to create before we double the segment size. If this is
+ * low, then there is likely to be a lot of wasted space in the largest
+ * segment. If it is high, then we risk running out of segment slots (see
+ * dsm.c's limits on total number of segments), or limiting the total size
+ * an area can manage when using small pointers.
+ */
+#define DSA_NUM_SEGMENTS_AT_EACH_SIZE 2
+
+/*
+ * The number of bits used to represent the offset part of a dsa_pointer.
+ * This controls the maximum size of a segment, the maximum possible
+ * allocation size and also the maximum number of segments per area.
+ */
+#if SIZEOF_DSA_POINTER == 4
+#define DSA_OFFSET_WIDTH 27 /* 32 segments of size up to 128MB */
+#else
+#define DSA_OFFSET_WIDTH 40 /* 1024 segments of size up to 1TB */
+#endif
+
+/*
+ * The maximum number of DSM segments that an area can own, determined by
+ * the number of bits remaining (but capped at 1024).
+ */
+#define DSA_MAX_SEGMENTS \
+ Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))
+
+/* The bitmask for extracting the offset from a dsa_pointer. */
+#define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)
+
+/* The maximum size of a DSM segment. */
+#define DSA_MAX_SEGMENT_SIZE ((size_t) 1 << DSA_OFFSET_WIDTH)
+
+/* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
+#define DSA_PAGES_PER_SUPERBLOCK 16
+
+/*
+ * A magic number used as a sanity check for following DSM segments belonging
+ * to a DSA area (this number will be XORed with the area handle and
+ * the segment index).
+ */
+#define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
+
+/* Build a dsa_pointer given a segment number and offset. */
+#define DSA_MAKE_POINTER(segment_number, offset) \
+ (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))
+
+/* Extract the segment number from a dsa_pointer. */
+#define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
+
+/* Extract the offset from a dsa_pointer. */
+#define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
+
+/* The type used for index segment indexes (zero based). */
+typedef size_t dsa_segment_index;
+
+/* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
+#define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)
+
+/*
+ * How many bins of segments do we have? The bins are used to categorize
+ * segments by their largest contiguous run of free pages.
+ */
+#define DSA_NUM_SEGMENT_BINS 16
+
+/*
+ * What is the lowest bin that holds segments that *might* have n contiguous
+ * free pages? There is no point in looking in segments in lower bins; they
+ * definitely can't service a request for n free pages.
+ */
+#define contiguous_pages_to_segment_bin(n) Min(fls(n), DSA_NUM_SEGMENT_BINS - 1)
+
+/* Macros for access to locks. */
+#define DSA_AREA_LOCK(area) (&area->control->lock)
+#define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)
+
+/*
+ * The header for an individual segment. This lives at the start of each DSM
+ * segment owned by a DSA area including the first segment (where it appears
+ * as part of the dsa_area_control struct).
+ */
+typedef struct
+{
+ /* Sanity check magic value. */
+ uint32 magic;
+ /* Total number of pages in this segment (excluding metadata area). */
+ size_t usable_pages;
+ /* Total size of this segment in bytes. */
+ size_t size;
+
+ /*
+ * Index of the segment that precedes this one in the same segment bin, or
+ * DSA_SEGMENT_INDEX_NONE if this is the first one.
+ */
+ dsa_segment_index prev;
+
+ /*
+ * Index of the segment that follows this one in the same segment bin, or
+ * DSA_SEGMENT_INDEX_NONE if this is the last one.
+ */
+ dsa_segment_index next;
+ /* The index of the bin that contains this segment. */
+ size_t bin;
+
+ /*
+ * A flag raised to indicate that this segment is being returned to the
+ * operating system and has been unpinned.
+ */
+ bool freed;
+} dsa_segment_header;
+
+/*
+ * Metadata for one superblock.
+ *
+ * For most blocks, span objects are stored out-of-line; that is, the span
+ * object is not stored within the block itself. But, as an exception, for a
+ * "span of spans", the span object is stored "inline". The allocation is
+ * always exactly one page, and the dsa_area_span object is located at
+ * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS,
+ * and the remaining fields are used just as they would be in an ordinary
+ * block. We can't allocate spans out of ordinary superblocks because
+ * creating an ordinary superblock requires us to be able to allocate a span
+ * *first*. Doing it this way avoids that circularity.
+ */
+typedef struct
+{
+ dsa_pointer pool; /* Containing pool. */
+ dsa_pointer prevspan; /* Previous span. */
+ dsa_pointer nextspan; /* Next span. */
+ dsa_pointer start; /* Starting address. */
+ size_t npages; /* Length of span in pages. */
+ uint16 size_class; /* Size class. */
+ uint16 ninitialized; /* Maximum number of objects ever allocated. */
+ uint16 nallocatable; /* Number of objects currently allocatable. */
+ uint16 firstfree; /* First object on free list. */
+ uint16 nmax; /* Maximum number of objects ever possible. */
+ uint16 fclass; /* Current fullness class. */
+} dsa_area_span;
+
+/*
+ * Given a pointer to an object in a span, access the index of the next free
+ * object in the same span (ie in the span's freelist) as an L-value.
+ */
+#define NextFreeObjectIndex(object) (* (uint16 *) (object))
+
+/*
+ * Small allocations are handled by dividing a single block of memory into
+ * many small objects of equal size. The possible allocation sizes are
+ * defined by the following array. Larger size classes are spaced more widely
+ * than smaller size classes. We fudge the spacing for size classes >1kB to
+ * avoid space wastage: based on the knowledge that we plan to allocate 64kB
+ * blocks, we bump the maximum object size up to the largest multiple of
+ * 8 bytes that still lets us fit the same number of objects into one block.
+ *
+ * NB: Because of this fudging, if we were ever to use differently-sized blocks
+ * for small allocations, these size classes would need to be reworked to be
+ * optimal for the new size.
+ *
+ * NB: The optimal spacing for size classes, as well as the size of the blocks
+ * out of which small objects are allocated, is not a question that has one
+ * right answer. Some allocators (such as tcmalloc) use more closely-spaced
+ * size classes than we do here, while others (like aset.c) use more
+ * widely-spaced classes. Spacing the classes more closely avoids wasting
+ * memory within individual chunks, but also means a larger number of
+ * potentially-unfilled blocks.
+ */
+static const uint16 dsa_size_classes[] = {
+ sizeof(dsa_area_span), 0, /* special size classes */
+ 8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */
+ 80, 96, 112, 128, /* 4 classes separated by 16 bytes */
+ 160, 192, 224, 256, /* 4 classes separated by 32 bytes */
+ 320, 384, 448, 512, /* 4 classes separated by 64 bytes */
+ 640, 768, 896, 1024, /* 4 classes separated by 128 bytes */
+ 1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */
+ 2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */
+ 5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */
+};
+#define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes)
+
+/* Special size classes. */
+#define DSA_SCLASS_BLOCK_OF_SPANS 0
+#define DSA_SCLASS_SPAN_LARGE 1
+
+/*
+ * The following lookup table is used to map the size of small objects
+ * (less than 1kB) onto the corresponding size class. To use this table,
+ * round the size of the object up to the next multiple of 8 bytes, and then
+ * index into this array.
+ */
+static const uint8 dsa_size_class_map[] = {
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
+ 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
+ 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
+};
+#define DSA_SIZE_CLASS_MAP_QUANTUM 8
+
+/*
+ * Superblocks are binned by how full they are. Generally, each fullness
+ * class corresponds to one quartile, but the block being used for
+ * allocations is always at the head of the list for fullness class 1,
+ * regardless of how full it really is.
+ */
+#define DSA_FULLNESS_CLASSES 4
+
+/*
+ * A dsa_area_pool represents a set of objects of a given size class.
+ *
+ * Perhaps there should be multiple pools for the same size class for
+ * contention avoidance, but for now there is just one!
+ */
+typedef struct
+{
+ /* A lock protecting access to this pool. */
+ LWLock lock;
+ /* A set of linked lists of spans, arranged by fullness. */
+ dsa_pointer spans[DSA_FULLNESS_CLASSES];
+ /* Should we pad this out to a cacheline boundary? */
+} dsa_area_pool;
+
+/*
+ * The control block for an area. This lives in shared memory, at the start of
+ * the first DSM segment controlled by this area.
+ */
+typedef struct
+{
+ /* The segment header for the first segment. */
+ dsa_segment_header segment_header;
+ /* The handle for this area. */
+ dsa_handle handle;
+ /* The handles of the segments owned by this area. */
+ dsm_handle segment_handles[DSA_MAX_SEGMENTS];
+ /* Lists of segments, binned by maximum contiguous run of free pages. */
+ dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS];
+ /* The object pools for each size class. */
+ dsa_area_pool pools[DSA_NUM_SIZE_CLASSES];
+ /* The total size of all active segments. */
+ size_t total_segment_size;
+ /* The maximum total size of backing storage we are allowed. */
+ size_t max_total_segment_size;
+ /* Highest used segment index in the history of this area. */
+ dsa_segment_index high_segment_index;
+ /* The reference count for this area. */
+ int refcnt;
+ /* A flag indicating that this area has been pinned. */
+ bool pinned;
+ /* The number of times that segments have been freed. */
+ size_t freed_segment_counter;
+ /* The LWLock tranche ID. */
+ int lwlock_tranche_id;
+ /* The general lock (protects everything except object pools). */
+ LWLock lock;
+} dsa_area_control;
+
+/* Given a pointer to a pool, find a dsa_pointer. */
+#define DsaAreaPoolToDsaPointer(area, p) \
+ DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)
+
+/*
+ * A dsa_segment_map is stored within the backend-private memory of each
+ * individual backend. It holds the base address of the segment within that
+ * backend, plus the addresses of key objects within the segment. Those
+ * could instead be derived from the base address but it's handy to have them
+ * around.
+ */
+typedef struct
+{
+ dsm_segment *segment; /* DSM segment */
+ char *mapped_address; /* Address at which segment is mapped */
+ dsa_segment_header *header; /* Header (same as mapped_address) */
+ FreePageManager *fpm; /* Free page manager within segment. */
+ dsa_pointer *pagemap; /* Page map within segment. */
+} dsa_segment_map;
+
+/*
+ * Per-backend state for a storage area. Backends obtain one of these by
+ * creating an area or attaching to an existing one using a handle. Each
+ * process that needs to use an area uses its own object to track where the
+ * segments are mapped.
+ */
+struct dsa_area
+{
+ /* Pointer to the control object in shared memory. */
+ dsa_area_control *control;
+
+ /* Has the mapping been pinned? */
+ bool mapping_pinned;
+
+ /*
+ * This backend's array of segment maps, ordered by segment index
+ * corresponding to control->segment_handles. Some of the area's segments
+ * may not be mapped in this backend yet, and some slots may have been
+ * freed and need to be detached; these operations happen on demand.
+ */
+ dsa_segment_map segment_maps[DSA_MAX_SEGMENTS];
+
+ /* The highest segment index this backend has ever mapped. */
+ dsa_segment_index high_segment_index;
+
+ /* The last observed freed_segment_counter. */
+ size_t freed_segment_counter;
+};
+
+#define DSA_SPAN_NOTHING_FREE ((uint16) -1)
+#define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
+
+/* Given a pointer to a segment_map, obtain a segment index number. */
+#define get_segment_index(area, segment_map_ptr) \
+ (segment_map_ptr - &area->segment_maps[0])
+
+static void init_span(dsa_area *area, dsa_pointer span_pointer,
+ dsa_area_pool *pool, dsa_pointer start, size_t npages,
+ uint16 size_class);
+static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
+ int fromclass, int toclass);
+static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
+static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
+ int size_class);
+static dsa_segment_map *get_segment_by_index(dsa_area *area,
+ dsa_segment_index index);
+static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
+static void unlink_span(dsa_area *area, dsa_area_span *span);
+static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
+ dsa_pointer span_pointer, int fclass);
+static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
+static dsa_segment_map *get_best_segment(dsa_area *area, size_t npages);
+static dsa_segment_map *make_new_segment(dsa_area *area, size_t requested_pages);
+static dsa_area *create_internal(void *place, size_t size,
+ int tranche_id,
+ dsm_handle control_handle,
+ dsm_segment *control_segment);
+static dsa_area *attach_internal(void *place, dsm_segment *segment,
+ dsa_handle handle);
+static void check_for_freed_segments(dsa_area *area);
+static void check_for_freed_segments_locked(dsa_area *area);
+
+/*
+ * Create a new shared area in a new DSM segment. Further DSM segments will
+ * be allocated as required to extend the available space.
+ *
+ * We can't allocate a LWLock tranche_id within this function, because tranche
+ * IDs are a scarce resource; there are only 64k available, using low numbers
+ * when possible matters, and we have no provision for recycling them. So,
+ * we require the caller to provide one.
+ */
+dsa_area *
+dsa_create(int tranche_id)
+{
+ dsm_segment *segment;
+ dsa_area *area;
+
+ /*
+ * Create the DSM segment that will hold the shared control object and the
+ * first segment of usable space.
+ */
+ segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE, 0);
+
+ /*
+ * All segments backing this area are pinned, so that DSA can explicitly
+ * control their lifetime (otherwise a newly created segment belonging to
+ * this area might be freed when the only backend that happens to have it
+ * mapped in ends, corrupting the area).
+ */
+ dsm_pin_segment(segment);
+
+ /* Create a new DSA area with the control object in this segment. */
+ area = create_internal(dsm_segment_address(segment),
+ DSA_INITIAL_SEGMENT_SIZE,
+ tranche_id,
+ dsm_segment_handle(segment), segment);
+
+ /* Clean up when the control segment detaches. */
+ on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
+ PointerGetDatum(dsm_segment_address(segment)));
+
+ return area;
+}
+
+/*
+ * Create a new shared area in an existing shared memory space, which may be
+ * either DSM or Postmaster-initialized memory. DSM segments will be
+ * allocated as required to extend the available space, though that can be
+ * prevented with dsa_set_size_limit(area, size) using the same size provided
+ * to dsa_create_in_place.
+ *
+ * Areas created in-place must eventually be released by the backend that
+ * created them and all backends that attach to them. This can be done
+ * explicitly with dsa_release_in_place, or, in the special case that 'place'
+ * happens to be in a pre-existing DSM segment, by passing in a pointer to the
+ * segment so that a detach hook can be registered with the containing DSM
+ * segment.
+ *
+ * See dsa_create() for a note about the tranche arguments.
+ */
+dsa_area *
+dsa_create_in_place(void *place, size_t size,
+ int tranche_id, dsm_segment *segment)
+{
+ dsa_area *area;
+
+ area = create_internal(place, size, tranche_id,
+ DSM_HANDLE_INVALID, NULL);
+
+ /*
+ * Clean up when the control segment detaches, if a containing DSM segment
+ * was provided.
+ */
+ if (segment != NULL)
+ on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
+ PointerGetDatum(place));
+
+ return area;
+}
+
+/*
+ * Obtain a handle that can be passed to other processes so that they can
+ * attach to the given area. Cannot be called for areas created with
+ * dsa_create_in_place.
+ */
+dsa_handle
+dsa_get_handle(dsa_area *area)
+{
+ Assert(area->control->handle != DSM_HANDLE_INVALID);
+ return area->control->handle;
+}
+
+/*
+ * Attach to an area given a handle generated (possibly in another process) by
+ * dsa_get_handle. The area must have been created with dsa_create (not
+ * dsa_create_in_place).
+ */
+dsa_area *
+dsa_attach(dsa_handle handle)
+{
+ dsm_segment *segment;
+ dsa_area *area;
+
+ /*
+ * An area handle is really a DSM segment handle for the first segment, so
+ * we go ahead and attach to that.
+ */
+ segment = dsm_attach(handle);
+ if (segment == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("could not attach to dynamic shared area")));
+
+ area = attach_internal(dsm_segment_address(segment), segment, handle);
+
+ /* Clean up when the control segment detaches. */
+ on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
+ PointerGetDatum(dsm_segment_address(segment)));
+
+ return area;
+}
+
+/*
+ * Attach to an area that was created with dsa_create_in_place. The caller
+ * must somehow know the location in memory that was used when the area was
+ * created, though it may be mapped at a different virtual address in this
+ * process.
+ *
+ * See dsa_create_in_place for note about releasing in-place areas, and the
+ * optional 'segment' argument which can be provided to allow automatic
+ * release if the containing memory happens to be a DSM segment.
+ */
+dsa_area *
+dsa_attach_in_place(void *place, dsm_segment *segment)
+{
+ dsa_area *area;
+
+ area = attach_internal(place, NULL, DSM_HANDLE_INVALID);
+
+ /*
+ * Clean up when the control segment detaches, if a containing DSM segment
+ * was provided.
+ */
+ if (segment != NULL)
+ on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
+ PointerGetDatum(place));
+
+ return area;
+}
+
+/*
+ * Release a DSA area that was produced by dsa_create_in_place or
+ * dsa_attach_in_place. The 'segment' argument is ignored but provides an
+ * interface suitable for on_dsm_detach, for the convenience of users who want
+ * to create a DSA segment inside an existing DSM segment and have it
+ * automatically released when the containing DSM segment is detached.
+ * 'place' should be the address of the place where the area was created.
+ *
+ * This callback is automatically registered for the DSM segment containing
+ * the control object of in-place areas when a segment is provided to
+ * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
+ * with dsa_create.
+ */
+void
+dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
+{
+ dsa_release_in_place(DatumGetPointer(place));
+}
+
+/*
+ * Release a DSA area that was produced by dsa_create_in_place or
+ * dsa_attach_in_place. The 'code' argument is ignored but provides an
+ * interface suitable for on_shmem_exit or before_shmem_exit, for the
+ * convenience of users who want to create a DSA segment inside shared memory
+ * other than a DSM segment and have it automatically release at backend exit.
+ * 'place' should be the address of the place where the area was created.
+ */
+void
+dsa_on_shmem_exit_release_in_place(int code, Datum place)
+{
+ dsa_release_in_place(DatumGetPointer(place));
+}
+
+/*
+ * Release a DSA area that was produced by dsa_create_in_place or
+ * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX'
+ * callbacks so that this is managed automatically, because failure to release
+ * an area created in-place leaks its segments permanently.
+ *
+ * This is also called automatically for areas produced by dsa_create or
+ * dsa_attach as an implementation detail.
+ */
+void
+dsa_release_in_place(void *place)
+{
+ dsa_area_control *control = (dsa_area_control *) place;
+ int i;
+
+ LWLockAcquire(&control->lock, LW_EXCLUSIVE);
+ Assert(control->segment_header.magic ==
+ (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
+ Assert(control->refcnt > 0);
+ if (--control->refcnt == 0)
+ {
+ for (i = 0; i <= control->high_segment_index; ++i)
+ {
+ dsm_handle handle;
+
+ handle = control->segment_handles[i];
+ if (handle != DSM_HANDLE_INVALID)
+ dsm_unpin_segment(handle);
+ }
+ }
+ LWLockRelease(&control->lock);
+}
+
+/*
+ * Keep a DSA area attached until end of session or explicit detach.
+ *
+ * By default, areas are owned by the current resource owner, which means they
+ * are detached automatically when that scope ends.
+ */
+void
+dsa_pin_mapping(dsa_area *area)
+{
+ int i;
+
+ Assert(!area->mapping_pinned);
+ area->mapping_pinned = true;
+
+ for (i = 0; i <= area->high_segment_index; ++i)
+ if (area->segment_maps[i].segment != NULL)
+ dsm_pin_mapping(area->segment_maps[i].segment);
+}
+
+/*
+ * Allocate memory in this storage area. The return value is a dsa_pointer
+ * that can be passed to other processes, and converted to a local pointer
+ * with dsa_get_address. 'flags' is a bitmap which should be constructed
+ * from the following values:
+ *
+ * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations
+ * will result in an ERROR.
+ *
+ * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
+ * no memory is available or a size limit established by dsa_set_size_limit
+ * would be exceeded. Otherwise, such allocations will result in an ERROR.
+ *
+ * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the
+ * contents of newly-allocated memory are indeterminate.
+ *
+ * These flags correspond to similarly named flags used by
+ * MemoryContextAllocExtended(). See also the macros dsa_allocate and
+ * dsa_allocate0 which expand to a call to this function with commonly used
+ * flags.
+ */
+dsa_pointer
+dsa_allocate_extended(dsa_area *area, size_t size, int flags)
+{
+ uint16 size_class;
+ dsa_pointer start_pointer;
+ dsa_segment_map *segment_map;
+ dsa_pointer result;
+
+ Assert(size > 0);
+
+ /* Sanity check on huge individual allocation size. */
+ if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
+ ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
+ elog(ERROR, "invalid DSA memory alloc request size %zu", size);
+
+ /*
+ * If bigger than the largest size class, just grab a run of pages from
+ * the free page manager, instead of allocating an object from a pool.
+ * There will still be a span, but it's a special class of span that
+ * manages this whole allocation and simply gives all pages back to the
+ * free page manager when dsa_free is called.
+ */
+ if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
+ {
+ size_t npages = fpm_size_to_pages(size);
+ size_t first_page;
+ dsa_pointer span_pointer;
+ dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE];
+
+ /* Obtain a span object. */
+ span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
+ if (!DsaPointerIsValid(span_pointer))
+ {
+ /* Raise error unless asked not to. */
+ if ((flags & DSA_ALLOC_NO_OOM) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on DSA request of size %zu.",
+ size)));
+ return InvalidDsaPointer;
+ }
+
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+
+ /* Find a segment from which to allocate. */
+ segment_map = get_best_segment(area, npages);
+ if (segment_map == NULL)
+ segment_map = make_new_segment(area, npages);
+ if (segment_map == NULL)
+ {
+ /* Can't make any more segments: game over. */
+ LWLockRelease(DSA_AREA_LOCK(area));
+ dsa_free(area, span_pointer);
+
+ /* Raise error unless asked not to. */
+ if ((flags & DSA_ALLOC_NO_OOM) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on DSA request of size %zu.",
+ size)));
+ return InvalidDsaPointer;
+ }
+
+ /*
+ * Ask the free page manager for a run of pages. This should always
+ * succeed, since both get_best_segment and make_new_segment should
+ * only return a non-NULL pointer if it actually contains enough
+ * contiguous freespace. If it does fail, something in our backend
+ * private state is out of whack, so use FATAL to kill the process.
+ */
+ if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
+ elog(FATAL,
+ "dsa_allocate could not find %zu free pages", npages);
+ LWLockRelease(DSA_AREA_LOCK(area));
+
+ start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
+ first_page * FPM_PAGE_SIZE);
+
+ /* Initialize span and pagemap. */
+ LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
+ LW_EXCLUSIVE);
+ init_span(area, span_pointer, pool, start_pointer, npages,
+ DSA_SCLASS_SPAN_LARGE);
+ segment_map->pagemap[first_page] = span_pointer;
+ LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
+
+ /* Zero-initialize the memory if requested. */
+ if ((flags & DSA_ALLOC_ZERO) != 0)
+ memset(dsa_get_address(area, start_pointer), 0, size);
+
+ return start_pointer;
+ }
+
+ /* Map allocation to a size class. */
+ if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
+ {
+ int mapidx;
+
+ /* For smaller sizes we have a lookup table... */
+ mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
+ DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
+ size_class = dsa_size_class_map[mapidx];
+ }
+ else
+ {
+ uint16 min;
+ uint16 max;
+
+ /* ... and for the rest we search by binary chop. */
+ min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
+ max = lengthof(dsa_size_classes) - 1;
+
+ while (min < max)
+ {
+ uint16 mid = (min + max) / 2;
+ uint16 class_size = dsa_size_classes[mid];
+
+ if (class_size < size)
+ min = mid + 1;
+ else
+ max = mid;
+ }
+
+ size_class = min;
+ }
+ Assert(size <= dsa_size_classes[size_class]);
+ Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);
+
+ /* Attempt to allocate an object from the appropriate pool. */
+ result = alloc_object(area, size_class);
+
+ /* Check for failure to allocate. */
+ if (!DsaPointerIsValid(result))
+ {
+ /* Raise error unless asked not to. */
+ if ((flags & DSA_ALLOC_NO_OOM) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on DSA request of size %zu.", size)));
+ return InvalidDsaPointer;
+ }
+
+ /* Zero-initialize the memory if requested. */
+ if ((flags & DSA_ALLOC_ZERO) != 0)
+ memset(dsa_get_address(area, result), 0, size);
+
+ return result;
+}
+
+/*
+ * Free memory obtained with dsa_allocate.
+ */
+void
+dsa_free(dsa_area *area, dsa_pointer dp)
+{
+ dsa_segment_map *segment_map;
+ int pageno;
+ dsa_pointer span_pointer;
+ dsa_area_span *span;
+ char *superblock;
+ char *object;
+ size_t size;
+ int size_class;
+
+ /* Make sure we don't have a stale segment in the slot 'dp' refers to. */
+ check_for_freed_segments(area);
+
+ /* Locate the object, span and pool. */
+ segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
+ pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
+ span_pointer = segment_map->pagemap[pageno];
+ span = dsa_get_address(area, span_pointer);
+ superblock = dsa_get_address(area, span->start);
+ object = dsa_get_address(area, dp);
+ size_class = span->size_class;
+ size = dsa_size_classes[size_class];
+
+ /*
+ * Special case for large objects that live in a special span: we return
+ * those pages directly to the free page manager and free the span.
+ */
+ if (span->size_class == DSA_SCLASS_SPAN_LARGE)
+ {
+
+#ifdef CLOBBER_FREED_MEMORY
+ memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
+#endif
+
+ /* Give pages back to free page manager. */
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ FreePageManagerPut(segment_map->fpm,
+ DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
+ span->npages);
+ LWLockRelease(DSA_AREA_LOCK(area));
+ /* Unlink span. */
+ LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
+ LW_EXCLUSIVE);
+ unlink_span(area, span);
+ LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
+ /* Free the span object so it can be reused. */
+ dsa_free(area, span_pointer);
+ return;
+ }
+
+#ifdef CLOBBER_FREED_MEMORY
+ memset(object, 0x7f, size);
+#endif
+
+ LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
+
+ /* Put the object on the span's freelist. */
+ Assert(object >= superblock);
+ Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
+ Assert((object - superblock) % size == 0);
+ NextFreeObjectIndex(object) = span->firstfree;
+ span->firstfree = (object - superblock) / size;
+ ++span->nallocatable;
+
+ /*
+ * See if the span needs to moved to a different fullness class, or be
+ * freed so its pages can be given back to the segment.
+ */
+ if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
+ {
+ /*
+ * The block was completely full and is located in the
+ * highest-numbered fullness class, which is never scanned for free
+ * chunks. We must move it to the next-lower fullness class.
+ */
+ unlink_span(area, span);
+ add_span_to_fullness_class(area, span, span_pointer,
+ DSA_FULLNESS_CLASSES - 2);
+
+ /*
+ * If this is the only span, and there is no active span, then we
+ * should probably move this span to fullness class 1. (Otherwise if
+ * you allocate exactly all the objects in the only span, it moves to
+ * class 3, then you free them all, it moves to 2, and then is given
+ * back, leaving no active span).
+ */
+ }
+ else if (span->nallocatable == span->nmax &&
+ (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
+ {
+ /*
+ * This entire block is free, and it's not the active block for this
+ * size class. Return the memory to the free page manager. We don't
+ * do this for the active block to prevent hysteresis: if we
+ * repeatedly allocate and free the only chunk in the active block, it
+ * will be very inefficient if we deallocate and reallocate the block
+ * every time.
+ */
+ destroy_superblock(area, span_pointer);
+ }
+
+ LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
+}
+
+/*
+ * Obtain a backend-local address for a dsa_pointer. 'dp' must point to
+ * memory allocated by the given area (possibly in another process) that
+ * hasn't yet been freed. This may cause a segment to be mapped into the
+ * current process if required, and may cause freed segments to be unmapped.
+ */
+void *
+dsa_get_address(dsa_area *area, dsa_pointer dp)
+{
+ dsa_segment_index index;
+ size_t offset;
+
+ /* Convert InvalidDsaPointer to NULL. */
+ if (!DsaPointerIsValid(dp))
+ return NULL;
+
+ /* Process any requests to detach from freed segments. */
+ check_for_freed_segments(area);
+
+ /* Break the dsa_pointer into its components. */
+ index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
+ offset = DSA_EXTRACT_OFFSET(dp);
+ Assert(index < DSA_MAX_SEGMENTS);
+
+ /* Check if we need to cause this segment to be mapped in. */
+ if (unlikely(area->segment_maps[index].mapped_address == NULL))
+ {
+ /* Call for effect (we don't need the result). */
+ get_segment_by_index(area, index);
+ }
+
+ return area->segment_maps[index].mapped_address + offset;
+}
+
+/*
+ * Pin this area, so that it will continue to exist even if all backends
+ * detach from it. In that case, the area can still be reattached to if a
+ * handle has been recorded somewhere.
+ */
+void
+dsa_pin(dsa_area *area)
+{
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ if (area->control->pinned)
+ {
+ LWLockRelease(DSA_AREA_LOCK(area));
+ elog(ERROR, "dsa_area already pinned");
+ }
+ area->control->pinned = true;
+ ++area->control->refcnt;
+ LWLockRelease(DSA_AREA_LOCK(area));
+}
+
+/*
+ * Undo the effects of dsa_pin, so that the given area can be freed when no
+ * backends are attached to it. May be called only if dsa_pin has been
+ * called.
+ */
+void
+dsa_unpin(dsa_area *area)
+{
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ Assert(area->control->refcnt > 1);
+ if (!area->control->pinned)
+ {
+ LWLockRelease(DSA_AREA_LOCK(area));
+ elog(ERROR, "dsa_area not pinned");
+ }
+ area->control->pinned = false;
+ --area->control->refcnt;
+ LWLockRelease(DSA_AREA_LOCK(area));
+}
+
+/*
+ * Set the total size limit for this area. This limit is checked whenever new
+ * segments need to be allocated from the operating system. If the new size
+ * limit is already exceeded, this has no immediate effect.
+ *
+ * Note that the total virtual memory usage may be temporarily larger than
+ * this limit when segments have been freed, but not yet detached by all
+ * backends that have attached to them.
+ */
+void
+dsa_set_size_limit(dsa_area *area, size_t limit)
+{
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ area->control->max_total_segment_size = limit;
+ LWLockRelease(DSA_AREA_LOCK(area));
+}
+
+/*
+ * Aggressively free all spare memory in the hope of returning DSM segments to
+ * the operating system.
+ */
+void
+dsa_trim(dsa_area *area)
+{
+ int size_class;
+
+ /*
+ * Trim in reverse pool order so we get to the spans-of-spans last, just
+ * in case any become entirely free while processing all the other pools.
+ */
+ for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
+ {
+ dsa_area_pool *pool = &area->control->pools[size_class];
+ dsa_pointer span_pointer;
+
+ if (size_class == DSA_SCLASS_SPAN_LARGE)
+ {
+ /* Large object frees give back segments aggressively already. */
+ continue;
+ }
+
+ /*
+ * Search fullness class 1 only. That is where we expect to find an
+ * entirely empty superblock (entirely empty superblocks in other
+ * fullness classes are returned to the free page map by dsa_free).
+ */
+ LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
+ span_pointer = pool->spans[1];
+ while (DsaPointerIsValid(span_pointer))
+ {
+ dsa_area_span *span = dsa_get_address(area, span_pointer);
+ dsa_pointer next = span->nextspan;
+
+ if (span->nallocatable == span->nmax)
+ destroy_superblock(area, span_pointer);
+
+ span_pointer = next;
+ }
+ LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
+ }
+}
+
+/*
+ * Print out debugging information about the internal state of the shared
+ * memory area.
+ */
+void
+dsa_dump(dsa_area *area)
+{
+ size_t i,
+ j;
+
+ /*
+ * Note: This gives an inconsistent snapshot as it acquires and releases
+ * individual locks as it goes...
+ */
+
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ check_for_freed_segments_locked(area);
+ fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
+ fprintf(stderr, " max_total_segment_size: %zu\n",
+ area->control->max_total_segment_size);
+ fprintf(stderr, " total_segment_size: %zu\n",
+ area->control->total_segment_size);
+ fprintf(stderr, " refcnt: %d\n", area->control->refcnt);
+ fprintf(stderr, " pinned: %c\n", area->control->pinned ? 't' : 'f');
+ fprintf(stderr, " segment bins:\n");
+ for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
+ {
+ if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE)
+ {
+ dsa_segment_index segment_index;
+
+ fprintf(stderr,
+ " segment bin %zu (at least %d contiguous pages free):\n",
+ i, 1 << (i - 1));
+ segment_index = area->control->segment_bins[i];
+ while (segment_index != DSA_SEGMENT_INDEX_NONE)
+ {
+ dsa_segment_map *segment_map;
+
+ segment_map =
+ get_segment_by_index(area, segment_index);
+
+ fprintf(stderr,
+ " segment index %zu, usable_pages = %zu, "
+ "contiguous_pages = %zu, mapped at %p\n",
+ segment_index,
+ segment_map->header->usable_pages,
+ fpm_largest(segment_map->fpm),
+ segment_map->mapped_address);
+ segment_index = segment_map->header->next;
+ }
+ }
+ }
+ LWLockRelease(DSA_AREA_LOCK(area));
+
+ fprintf(stderr, " pools:\n");
+ for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
+ {
+ bool found = false;
+
+ LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE);
+ for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
+ if (DsaPointerIsValid(area->control->pools[i].spans[j]))
+ found = true;
+ if (found)
+ {
+ if (i == DSA_SCLASS_BLOCK_OF_SPANS)
+ fprintf(stderr, " pool for blocks of span objects:\n");
+ else if (i == DSA_SCLASS_SPAN_LARGE)
+ fprintf(stderr, " pool for large object spans:\n");
+ else
+ fprintf(stderr,
+ " pool for size class %zu (object size %hu bytes):\n",
+ i, dsa_size_classes[i]);
+ for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
+ {
+ if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
+ fprintf(stderr, " fullness class %zu is empty\n", j);
+ else
+ {
+ dsa_pointer span_pointer = area->control->pools[i].spans[j];
+
+ fprintf(stderr, " fullness class %zu:\n", j);
+ while (DsaPointerIsValid(span_pointer))
+ {
+ dsa_area_span *span;
+
+ span = dsa_get_address(area, span_pointer);
+ fprintf(stderr,
+ " span descriptor at "
+ DSA_POINTER_FORMAT ", superblock at "
+ DSA_POINTER_FORMAT
+ ", pages = %zu, objects free = %hu/%hu\n",
+ span_pointer, span->start, span->npages,
+ span->nallocatable, span->nmax);
+ span_pointer = span->nextspan;
+ }
+ }
+ }
+ }
+ LWLockRelease(DSA_SCLASS_LOCK(area, i));
+ }
+}
+
+/*
+ * Return the smallest size that you can successfully provide to
+ * dsa_create_in_place.
+ */
+size_t
+dsa_minimum_size(void)
+{
+ size_t size;
+ int pages = 0;
+
+ size = MAXALIGN(sizeof(dsa_area_control)) +
+ MAXALIGN(sizeof(FreePageManager));
+
+ /* Figure out how many pages we need, including the page map... */
+ while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
+ {
+ ++pages;
+ size += sizeof(dsa_pointer);
+ }
+
+ return pages * FPM_PAGE_SIZE;
+}
+
+/*
+ * Workhorse function for dsa_create and dsa_create_in_place.
+ */
+static dsa_area *
+create_internal(void *place, size_t size,
+ int tranche_id,
+ dsm_handle control_handle,
+ dsm_segment *control_segment)
+{
+ dsa_area_control *control;
+ dsa_area *area;
+ dsa_segment_map *segment_map;
+ size_t usable_pages;
+ size_t total_pages;
+ size_t metadata_bytes;
+ int i;
+
+ /* Sanity check on the space we have to work in. */
+ if (size < dsa_minimum_size())
+ elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
+ dsa_minimum_size(), size);
+
+ /* Now figure out how much space is usable */
+ total_pages = size / FPM_PAGE_SIZE;
+ metadata_bytes =
+ MAXALIGN(sizeof(dsa_area_control)) +
+ MAXALIGN(sizeof(FreePageManager)) +
+ total_pages * sizeof(dsa_pointer);
+ /* Add padding up to next page boundary. */
+ if (metadata_bytes % FPM_PAGE_SIZE != 0)
+ metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
+ Assert(metadata_bytes <= size);
+ usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;
+
+ /*
+ * Initialize the dsa_area_control object located at the start of the
+ * space.
+ */
+ control = (dsa_area_control *) place;
+ memset(place, 0, sizeof(*control));
+ control->segment_header.magic =
+ DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
+ control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
+ control->segment_header.prev = DSA_SEGMENT_INDEX_NONE;
+ control->segment_header.usable_pages = usable_pages;
+ control->segment_header.freed = false;
+ control->segment_header.size = DSA_INITIAL_SEGMENT_SIZE;
+ control->handle = control_handle;
+ control->max_total_segment_size = (size_t) -1;
+ control->total_segment_size = size;
+ control->segment_handles[0] = control_handle;
+ for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
+ control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
+ control->refcnt = 1;
+ control->lwlock_tranche_id = tranche_id;
+
+ /*
+ * Create the dsa_area object that this backend will use to access the
+ * area. Other backends will need to obtain their own dsa_area object by
+ * attaching.
+ */
+ area = palloc(sizeof(dsa_area));
+ area->control = control;
+ area->mapping_pinned = false;
+ memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
+ area->high_segment_index = 0;
+ area->freed_segment_counter = 0;
+ LWLockInitialize(&control->lock, control->lwlock_tranche_id);
+ for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
+ LWLockInitialize(DSA_SCLASS_LOCK(area, i),
+ control->lwlock_tranche_id);
+
+ /* Set up the segment map for this process's mapping. */
+ segment_map = &area->segment_maps[0];
+ segment_map->segment = control_segment;
+ segment_map->mapped_address = place;
+ segment_map->header = (dsa_segment_header *) place;
+ segment_map->fpm = (FreePageManager *)
+ (segment_map->mapped_address +
+ MAXALIGN(sizeof(dsa_area_control)));
+ segment_map->pagemap = (dsa_pointer *)
+ (segment_map->mapped_address +
+ MAXALIGN(sizeof(dsa_area_control)) +
+ MAXALIGN(sizeof(FreePageManager)));
+
+ /* Set up the free page map. */
+ FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
+ /* There can be 0 usable pages if size is dsa_minimum_size(). */
+
+ if (usable_pages > 0)
+ FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
+ usable_pages);
+
+ /* Put this segment into the appropriate bin. */
+ control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
+ segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
+
+ return area;
+}
+
+/*
+ * Workhorse function for dsa_attach and dsa_attach_in_place.
+ */
+static dsa_area *
+attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
+{
+ dsa_area_control *control;
+ dsa_area *area;
+ dsa_segment_map *segment_map;
+
+ control = (dsa_area_control *) place;
+ Assert(control->handle == handle);
+ Assert(control->segment_handles[0] == handle);
+ Assert(control->segment_header.magic ==
+ (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));
+
+ /* Build the backend-local area object. */
+ area = palloc(sizeof(dsa_area));
+ area->control = control;
+ area->mapping_pinned = false;
+ memset(&area->segment_maps[0], 0,
+ sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
+ area->high_segment_index = 0;
+
+ /* Set up the segment map for this process's mapping. */
+ segment_map = &area->segment_maps[0];
+ segment_map->segment = segment; /* NULL for in-place */
+ segment_map->mapped_address = place;
+ segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
+ segment_map->fpm = (FreePageManager *)
+ (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
+ segment_map->pagemap = (dsa_pointer *)
+ (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
+ MAXALIGN(sizeof(FreePageManager)));
+
+ /* Bump the reference count. */
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ if (control->refcnt == 0)
+ {
+ /* We can't attach to a DSA area that has already been destroyed. */
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("could not attach to dynamic shared area")));
+ }
+ ++control->refcnt;
+ area->freed_segment_counter = area->control->freed_segment_counter;
+ LWLockRelease(DSA_AREA_LOCK(area));
+
+ return area;
+}
+
+/*
+ * Add a new span to fullness class 1 of the indicated pool.
+ */
+static void
+init_span(dsa_area *area,
+ dsa_pointer span_pointer,
+ dsa_area_pool *pool, dsa_pointer start, size_t npages,
+ uint16 size_class)
+{
+ dsa_area_span *span = dsa_get_address(area, span_pointer);
+ size_t obsize = dsa_size_classes[size_class];
+
+ /*
+ * The per-pool lock must be held because we manipulate the span list for
+ * this pool.
+ */
+ Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
+
+ /* Push this span onto the front of the span list for fullness class 1. */
+ if (DsaPointerIsValid(pool->spans[1]))
+ {
+ dsa_area_span *head = (dsa_area_span *)
+ dsa_get_address(area, pool->spans[1]);
+
+ head->prevspan = span_pointer;
+ }
+ span->pool = DsaAreaPoolToDsaPointer(area, pool);
+ span->nextspan = pool->spans[1];
+ span->prevspan = InvalidDsaPointer;
+ pool->spans[1] = span_pointer;
+
+ span->start = start;
+ span->npages = npages;
+ span->size_class = size_class;
+ span->ninitialized = 0;
+ if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
+ {
+ /*
+ * A block-of-spans contains its own descriptor, so mark one object as
+ * initialized and reduce the count of allocatable objects by one.
+ * Doing this here has the side effect of also reducing nmax by one,
+ * which is important to make sure we free this object at the correct
+ * time.
+ */
+ span->ninitialized = 1;
+ span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
+ }
+ else if (size_class != DSA_SCLASS_SPAN_LARGE)
+ span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
+ span->firstfree = DSA_SPAN_NOTHING_FREE;
+ span->nmax = span->nallocatable;
+ span->fclass = 1;
+}
+
+/*
+ * Transfer the first span in one fullness class to the head of another
+ * fullness class.
+ */
+static bool
+transfer_first_span(dsa_area *area,
+ dsa_area_pool *pool, int fromclass, int toclass)
+{
+ dsa_pointer span_pointer;
+ dsa_area_span *span;
+ dsa_area_span *nextspan;
+
+ /* Can't do it if source list is empty. */
+ span_pointer = pool->spans[fromclass];
+ if (!DsaPointerIsValid(span_pointer))
+ return false;
+
+ /* Remove span from head of source list. */
+ span = dsa_get_address(area, span_pointer);
+ pool->spans[fromclass] = span->nextspan;
+ if (DsaPointerIsValid(span->nextspan))
+ {
+ nextspan = (dsa_area_span *)
+ dsa_get_address(area, span->nextspan);
+ nextspan->prevspan = InvalidDsaPointer;
+ }
+
+ /* Add span to head of target list. */
+ span->nextspan = pool->spans[toclass];
+ pool->spans[toclass] = span_pointer;
+ if (DsaPointerIsValid(span->nextspan))
+ {
+ nextspan = (dsa_area_span *)
+ dsa_get_address(area, span->nextspan);
+ nextspan->prevspan = span_pointer;
+ }
+ span->fclass = toclass;
+
+ return true;
+}
+
+/*
+ * Allocate one object of the requested size class from the given area.
+ */
+static inline dsa_pointer
+alloc_object(dsa_area *area, int size_class)
+{
+ dsa_area_pool *pool = &area->control->pools[size_class];
+ dsa_area_span *span;
+ dsa_pointer block;
+ dsa_pointer result;
+ char *object;
+ size_t size;
+
+ /*
+ * Even though ensure_active_superblock can in turn call alloc_object if
+ * it needs to allocate a new span, that's always from a different pool,
+ * and the order of lock acquisition is always the same, so it's OK that
+ * we hold this lock for the duration of this function.
+ */
+ Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
+ LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
+
+ /*
+ * If there's no active superblock, we must successfully obtain one or
+ * fail the request.
+ */
+ if (!DsaPointerIsValid(pool->spans[1]) &&
+ !ensure_active_superblock(area, pool, size_class))
+ {
+ result = InvalidDsaPointer;
+ }
+ else
+ {
+ /*
+ * There should be a block in fullness class 1 at this point, and it
+ * should never be completely full. Thus we can either pop an object
+ * from the free list or, failing that, initialize a new object.
+ */
+ Assert(DsaPointerIsValid(pool->spans[1]));
+ span = (dsa_area_span *)
+ dsa_get_address(area, pool->spans[1]);
+ Assert(span->nallocatable > 0);
+ block = span->start;
+ Assert(size_class < DSA_NUM_SIZE_CLASSES);
+ size = dsa_size_classes[size_class];
+ if (span->firstfree != DSA_SPAN_NOTHING_FREE)
+ {
+ result = block + span->firstfree * size;
+ object = dsa_get_address(area, result);
+ span->firstfree = NextFreeObjectIndex(object);
+ }
+ else
+ {
+ result = block + span->ninitialized * size;
+ ++span->ninitialized;
+ }
+ --span->nallocatable;
+
+ /* If it's now full, move it to the highest-numbered fullness class. */
+ if (span->nallocatable == 0)
+ transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
+ }
+
+ Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
+ LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
+
+ return result;
+}
+
+/*
+ * Ensure an active (i.e. fullness class 1) superblock, unless all existing
+ * superblocks are completely full and no more can be allocated.
+ *
+ * Fullness classes K of 0..N are loosely intended to represent blocks whose
+ * utilization percentage is at least K/N, but we only enforce this rigorously
+ * for the highest-numbered fullness class, which always contains exactly
+ * those blocks that are completely full. It's otherwise acceptable for a
+ * block to be in a higher-numbered fullness class than the one to which it
+ * logically belongs. In addition, the active block, which is always the
+ * first block in fullness class 1, is permitted to have a higher allocation
+ * percentage than would normally be allowable for that fullness class; we
+ * don't move it until it's completely full, and then it goes to the
+ * highest-numbered fullness class.
+ *
+ * It might seem odd that the active block is the head of fullness class 1
+ * rather than fullness class 0, but experience with other allocators has
+ * shown that it's usually better to allocate from a block that's moderately
+ * full rather than one that's nearly empty. Insofar as is reasonably
+ * possible, we want to avoid performing new allocations in a block that would
+ * otherwise become empty soon.
+ */
+static bool
+ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
+ int size_class)
+{
+ dsa_pointer span_pointer;
+ dsa_pointer start_pointer;
+ size_t obsize = dsa_size_classes[size_class];
+ size_t nmax;
+ int fclass;
+ size_t npages = 1;
+ size_t first_page;
+ size_t i;
+ dsa_segment_map *segment_map;
+
+ Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
+
+ /*
+ * Compute the number of objects that will fit in a block of this size
+ * class. Span-of-spans blocks are just a single page, and the first
+ * object isn't available for use because it describes the block-of-spans
+ * itself.
+ */
+ if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
+ nmax = FPM_PAGE_SIZE / obsize - 1;
+ else
+ nmax = DSA_SUPERBLOCK_SIZE / obsize;
+
+ /*
+ * If fullness class 1 is empty, try to find a span to put in it by
+ * scanning higher-numbered fullness classes (excluding the last one,
+ * whose blocks are certain to all be completely full).
+ */
+ for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
+ {
+ span_pointer = pool->spans[fclass];
+
+ while (DsaPointerIsValid(span_pointer))
+ {
+ int tfclass;
+ dsa_area_span *span;
+ dsa_area_span *nextspan;
+ dsa_area_span *prevspan;
+ dsa_pointer next_span_pointer;
+
+ span = (dsa_area_span *)
+ dsa_get_address(area, span_pointer);
+ next_span_pointer = span->nextspan;
+
+ /* Figure out what fullness class should contain this span. */
+ tfclass = (nmax - span->nallocatable)
+ * (DSA_FULLNESS_CLASSES - 1) / nmax;
+
+ /* Look up next span. */
+ if (DsaPointerIsValid(span->nextspan))
+ nextspan = (dsa_area_span *)
+ dsa_get_address(area, span->nextspan);
+ else
+ nextspan = NULL;
+
+ /*
+ * If utilization has dropped enough that this now belongs in some
+ * other fullness class, move it there.
+ */
+ if (tfclass < fclass)
+ {
+ /* Remove from the current fullness class list. */
+ if (pool->spans[fclass] == span_pointer)
+ {
+ /* It was the head; remove it. */
+ Assert(!DsaPointerIsValid(span->prevspan));
+ pool->spans[fclass] = span->nextspan;
+ if (nextspan != NULL)
+ nextspan->prevspan = InvalidDsaPointer;
+ }
+ else
+ {
+ /* It was not the head. */
+ Assert(DsaPointerIsValid(span->prevspan));
+ prevspan = (dsa_area_span *)
+ dsa_get_address(area, span->prevspan);
+ prevspan->nextspan = span->nextspan;
+ }
+ if (nextspan != NULL)
+ nextspan->prevspan = span->prevspan;
+
+ /* Push onto the head of the new fullness class list. */
+ span->nextspan = pool->spans[tfclass];
+ pool->spans[tfclass] = span_pointer;
+ span->prevspan = InvalidDsaPointer;
+ if (DsaPointerIsValid(span->nextspan))
+ {
+ nextspan = (dsa_area_span *)
+ dsa_get_address(area, span->nextspan);
+ nextspan->prevspan = span_pointer;
+ }
+ span->fclass = tfclass;
+ }
+
+ /* Advance to next span on list. */
+ span_pointer = next_span_pointer;
+ }
+
+ /* Stop now if we found a suitable block. */
+ if (DsaPointerIsValid(pool->spans[1]))
+ return true;
+ }
+
+ /*
+ * If there are no blocks that properly belong in fullness class 1, pick
+ * one from some other fullness class and move it there anyway, so that we
+ * have an allocation target. Our last choice is to transfer a block
+ * that's almost empty (and might become completely empty soon if left
+ * alone), but even that is better than failing, which is what we must do
+ * if there are no blocks at all with freespace.
+ */
+ Assert(!DsaPointerIsValid(pool->spans[1]));
+ for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
+ if (transfer_first_span(area, pool, fclass, 1))
+ return true;
+ if (!DsaPointerIsValid(pool->spans[1]) &&
+ transfer_first_span(area, pool, 0, 1))
+ return true;
+
+ /*
+ * We failed to find an existing span with free objects, so we need to
+ * allocate a new superblock and construct a new span to manage it.
+ *
+ * First, get a dsa_area_span object to describe the new superblock block
+ * ... unless this allocation is for a dsa_area_span object, in which case
+ * that's surely not going to work. We handle that case by storing the
+ * span describing a block-of-spans inline.
+ */
+ if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
+ {
+ span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
+ if (!DsaPointerIsValid(span_pointer))
+ return false;
+ npages = DSA_PAGES_PER_SUPERBLOCK;
+ }
+
+ /* Find or create a segment and allocate the superblock. */
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ segment_map = get_best_segment(area, npages);
+ if (segment_map == NULL)
+ {
+ segment_map = make_new_segment(area, npages);
+ if (segment_map == NULL)
+ {
+ LWLockRelease(DSA_AREA_LOCK(area));
+ return false;
+ }
+ }
+
+ /*
+ * This shouldn't happen: get_best_segment() or make_new_segment()
+ * promised that we can successfully allocate npages.
+ */
+ if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
+ elog(FATAL,
+ "dsa_allocate could not find %zu free pages for superblock",
+ npages);
+ LWLockRelease(DSA_AREA_LOCK(area));
+
+ /* Compute the start of the superblock. */
+ start_pointer =
+ DSA_MAKE_POINTER(get_segment_index(area, segment_map),
+ first_page * FPM_PAGE_SIZE);
+
+ /*
+ * If this is a block-of-spans, carve the descriptor right out of the
+ * allocated space.
+ */
+ if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
+ {
+ /*
+ * We have a pointer into the segment. We need to build a dsa_pointer
+ * from the segment index and offset into the segment.
+ */
+ span_pointer = start_pointer;
+ }
+
+ /* Initialize span and pagemap. */
+ init_span(area, span_pointer, pool, start_pointer, npages, size_class);
+ for (i = 0; i < npages; ++i)
+ segment_map->pagemap[first_page + i] = span_pointer;
+
+ return true;
+}
+
+/*
+ * Return the segment map corresponding to a given segment index, mapping the
+ * segment in if necessary. For internal segment book-keeping, this is called
+ * with the area lock held. It is also called by dsa_free and dsa_get_address
+ * without any locking, relying on the fact they have a known live segment
+ * index and they always call check_for_freed_segments to ensures that any
+ * freed segment occupying the same slot is detached first.
+ */
+static dsa_segment_map *
+get_segment_by_index(dsa_area *area, dsa_segment_index index)
+{
+ if (unlikely(area->segment_maps[index].mapped_address == NULL))
+ {
+ dsm_handle handle;
+ dsm_segment *segment;
+ dsa_segment_map *segment_map;
+
+ /*
+ * If we are reached by dsa_free or dsa_get_address, there must be at
+ * least one object allocated in the referenced segment. Otherwise,
+ * their caller has a double-free or access-after-free bug, which we
+ * have no hope of detecting. So we know it's safe to access this
+ * array slot without holding a lock; it won't change underneath us.
+ * Furthermore, we know that we can see the latest contents of the
+ * slot, as explained in check_for_freed_segments, which those
+ * functions call before arriving here.
+ */
+ handle = area->control->segment_handles[index];
+
+ /* It's an error to try to access an unused slot. */
+ if (handle == DSM_HANDLE_INVALID)
+ elog(ERROR,
+ "dsa_area could not attach to a segment that has been freed");
+
+ segment = dsm_attach(handle);
+ if (segment == NULL)
+ elog(ERROR, "dsa_area could not attach to segment");
+ if (area->mapping_pinned)
+ dsm_pin_mapping(segment);
+ segment_map = &area->segment_maps[index];
+ segment_map->segment = segment;
+ segment_map->mapped_address = dsm_segment_address(segment);
+ segment_map->header =
+ (dsa_segment_header *) segment_map->mapped_address;
+ segment_map->fpm = (FreePageManager *)
+ (segment_map->mapped_address +
+ MAXALIGN(sizeof(dsa_segment_header)));
+ segment_map->pagemap = (dsa_pointer *)
+ (segment_map->mapped_address +
+ MAXALIGN(sizeof(dsa_segment_header)) +
+ MAXALIGN(sizeof(FreePageManager)));
+
+ /* Remember the highest index this backend has ever mapped. */
+ if (area->high_segment_index < index)
+ area->high_segment_index = index;
+
+ Assert(segment_map->header->magic ==
+ (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index));
+ }
+
+ /*
+ * Callers of dsa_get_address() and dsa_free() don't hold the area lock,
+ * but it's a bug in the calling code and undefined behavior if the
+ * address is not live (ie if the segment might possibly have been freed,
+ * they're trying to use a dangling pointer).
+ *
+ * For dsa.c code that holds the area lock to manipulate segment_bins
+ * lists, it would be a bug if we ever reach a freed segment here. After
+ * it's marked as freed, the only thing any backend should do with it is
+ * unmap it, and it should always have done that in
+ * check_for_freed_segments_locked() before arriving here to resolve an
+ * index to a segment_map.
+ *
+ * Either way we can assert that we aren't returning a freed segment.
+ */
+ Assert(!area->segment_maps[index].header->freed);
+
+ return &area->segment_maps[index];
+}
+
+/*
+ * Return a superblock to the free page manager. If the underlying segment
+ * has become entirely free, then return it to the operating system.
+ *
+ * The appropriate pool lock must be held.
+ */
+static void
+destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
+{
+ dsa_area_span *span = dsa_get_address(area, span_pointer);
+ int size_class = span->size_class;
+ dsa_segment_map *segment_map;
+
+
+ /* Remove it from its fullness class list. */
+ unlink_span(area, span);
+
+ /*
+ * Note: Here we acquire the area lock while we already hold a per-pool
+ * lock. We never hold the area lock and then take a pool lock, or we
+ * could deadlock.
+ */
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ check_for_freed_segments_locked(area);
+ segment_map =
+ get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
+ FreePageManagerPut(segment_map->fpm,
+ DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
+ span->npages);
+ /* Check if the segment is now entirely free. */
+ if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
+ {
+ dsa_segment_index index = get_segment_index(area, segment_map);
+
+ /* If it's not the segment with extra control data, free it. */
+ if (index != 0)
+ {
+ /*
+ * Give it back to the OS, and allow other backends to detect that
+ * they need to detach.
+ */
+ unlink_segment(area, segment_map);
+ segment_map->header->freed = true;
+ Assert(area->control->total_segment_size >=
+ segment_map->header->size);
+ area->control->total_segment_size -=
+ segment_map->header->size;
+ dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
+ dsm_detach(segment_map->segment);
+ area->control->segment_handles[index] = DSM_HANDLE_INVALID;
+ ++area->control->freed_segment_counter;
+ segment_map->segment = NULL;
+ segment_map->header = NULL;
+ segment_map->mapped_address = NULL;
+ }
+ }
+ LWLockRelease(DSA_AREA_LOCK(area));
+
+ /*
+ * Span-of-spans blocks store the span which describes them within the
+ * block itself, so freeing the storage implicitly frees the descriptor
+ * also. If this is a block of any other type, we need to separately free
+ * the span object also. This recursive call to dsa_free will acquire the
+ * span pool's lock. We can't deadlock because the acquisition order is
+ * always some other pool and then the span pool.
+ */
+ if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
+ dsa_free(area, span_pointer);
+}
+
+static void
+unlink_span(dsa_area *area, dsa_area_span *span)
+{
+ if (DsaPointerIsValid(span->nextspan))
+ {
+ dsa_area_span *next = dsa_get_address(area, span->nextspan);
+
+ next->prevspan = span->prevspan;
+ }
+ if (DsaPointerIsValid(span->prevspan))
+ {
+ dsa_area_span *prev = dsa_get_address(area, span->prevspan);
+
+ prev->nextspan = span->nextspan;
+ }
+ else
+ {
+ dsa_area_pool *pool = dsa_get_address(area, span->pool);
+
+ pool->spans[span->fclass] = span->nextspan;
+ }
+}
+
+static void
+add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
+ dsa_pointer span_pointer,
+ int fclass)
+{
+ dsa_area_pool *pool = dsa_get_address(area, span->pool);
+
+ if (DsaPointerIsValid(pool->spans[fclass]))
+ {
+ dsa_area_span *head = dsa_get_address(area,
+ pool->spans[fclass]);
+
+ head->prevspan = span_pointer;
+ }
+ span->prevspan = InvalidDsaPointer;
+ span->nextspan = pool->spans[fclass];
+ pool->spans[fclass] = span_pointer;
+ span->fclass = fclass;
+}
+
+/*
+ * Detach from an area that was either created or attached to by this process.
+ */
+void
+dsa_detach(dsa_area *area)
+{
+ int i;
+
+ /* Detach from all segments. */
+ for (i = 0; i <= area->high_segment_index; ++i)
+ if (area->segment_maps[i].segment != NULL)
+ dsm_detach(area->segment_maps[i].segment);
+
+ /*
+ * Note that 'detaching' (= detaching from DSM segments) doesn't include
+ * 'releasing' (= adjusting the reference count). It would be nice to
+ * combine these operations, but client code might never get around to
+ * calling dsa_detach because of an error path, and a detach hook on any
+ * particular segment is too late to detach other segments in the area
+ * without risking a 'leak' warning in the non-error path.
+ */
+
+ /* Free the backend-local area object. */
+ pfree(area);
+}
+
+/*
+ * Unlink a segment from the bin that contains it.
+ */
+static void
+unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
+{
+ if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
+ {
+ dsa_segment_map *prev;
+
+ prev = get_segment_by_index(area, segment_map->header->prev);
+ prev->header->next = segment_map->header->next;
+ }
+ else
+ {
+ Assert(area->control->segment_bins[segment_map->header->bin] ==
+ get_segment_index(area, segment_map));
+ area->control->segment_bins[segment_map->header->bin] =
+ segment_map->header->next;
+ }
+ if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
+ {
+ dsa_segment_map *next;
+
+ next = get_segment_by_index(area, segment_map->header->next);
+ next->header->prev = segment_map->header->prev;
+ }
+}
+
+/*
+ * Find a segment that could satisfy a request for 'npages' of contiguous
+ * memory, or return NULL if none can be found. This may involve attaching to
+ * segments that weren't previously attached so that we can query their free
+ * pages map.
+ */
+static dsa_segment_map *
+get_best_segment(dsa_area *area, size_t npages)
+{
+ size_t bin;
+
+ Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
+ check_for_freed_segments_locked(area);
+
+ /*
+ * Start searching from the first bin that *might* have enough contiguous
+ * pages.
+ */
+ for (bin = contiguous_pages_to_segment_bin(npages);
+ bin < DSA_NUM_SEGMENT_BINS;
+ ++bin)
+ {
+ /*
+ * The minimum contiguous size that any segment in this bin should
+ * have. We'll re-bin if we see segments with fewer.
+ */
+ size_t threshold = (size_t) 1 << (bin - 1);
+ dsa_segment_index segment_index;
+
+ /* Search this bin for a segment with enough contiguous space. */
+ segment_index = area->control->segment_bins[bin];
+ while (segment_index != DSA_SEGMENT_INDEX_NONE)
+ {
+ dsa_segment_map *segment_map;
+ dsa_segment_index next_segment_index;
+ size_t contiguous_pages;
+
+ segment_map = get_segment_by_index(area, segment_index);
+ next_segment_index = segment_map->header->next;
+ contiguous_pages = fpm_largest(segment_map->fpm);
+
+ /* Not enough for the request, still enough for this bin. */
+ if (contiguous_pages >= threshold && contiguous_pages < npages)
+ {
+ segment_index = next_segment_index;
+ continue;
+ }
+
+ /* Re-bin it if it's no longer in the appropriate bin. */
+ if (contiguous_pages < threshold)
+ {
+ size_t new_bin;
+
+ new_bin = contiguous_pages_to_segment_bin(contiguous_pages);
+
+ /* Remove it from its current bin. */
+ unlink_segment(area, segment_map);
+
+ /* Push it onto the front of its new bin. */
+ segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
+ segment_map->header->next =
+ area->control->segment_bins[new_bin];
+ segment_map->header->bin = new_bin;
+ area->control->segment_bins[new_bin] = segment_index;
+ if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
+ {
+ dsa_segment_map *next;
+
+ next = get_segment_by_index(area,
+ segment_map->header->next);
+ Assert(next->header->bin == new_bin);
+ next->header->prev = segment_index;
+ }
+
+ /*
+ * But fall through to see if it's enough to satisfy this
+ * request anyway....
+ */
+ }
+
+ /* Check if we are done. */
+ if (contiguous_pages >= npages)
+ return segment_map;
+
+ /* Continue searching the same bin. */
+ segment_index = next_segment_index;
+ }
+ }
+
+ /* Not found. */
+ return NULL;
+}
+
+/*
+ * Create a new segment that can handle at least requested_pages. Returns
+ * NULL if the requested total size limit or maximum allowed number of
+ * segments would be exceeded.
+ */
+static dsa_segment_map *
+make_new_segment(dsa_area *area, size_t requested_pages)
+{
+ dsa_segment_index new_index;
+ size_t metadata_bytes;
+ size_t total_size;
+ size_t total_pages;
+ size_t usable_pages;
+ dsa_segment_map *segment_map;
+ dsm_segment *segment;
+
+ Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
+
+ /* Find a segment slot that is not in use (linearly for now). */
+ for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
+ {
+ if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
+ break;
+ }
+ if (new_index == DSA_MAX_SEGMENTS)
+ return NULL;
+
+ /*
+ * If the total size limit is already exceeded, then we exit early and
+ * avoid arithmetic wraparound in the unsigned expressions below.
+ */
+ if (area->control->total_segment_size >=
+ area->control->max_total_segment_size)
+ return NULL;
+
+ /*
+ * The size should be at least as big as requested, and at least big
+ * enough to follow a geometric series that approximately doubles the
+ * total storage each time we create a new segment. We use geometric
+ * growth because the underlying DSM system isn't designed for large
+ * numbers of segments (otherwise we might even consider just using one
+ * DSM segment for each large allocation and for each superblock, and then
+ * we wouldn't need to use FreePageManager).
+ *
+ * We decide on a total segment size first, so that we produce tidy
+ * power-of-two sized segments. This is a good property to have if we
+ * move to huge pages in the future. Then we work back to the number of
+ * pages we can fit.
+ */
+ total_size = DSA_INITIAL_SEGMENT_SIZE *
+ ((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
+ total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE);
+ total_size = Min(total_size,
+ area->control->max_total_segment_size -
+ area->control->total_segment_size);
+
+ total_pages = total_size / FPM_PAGE_SIZE;
+ metadata_bytes =
+ MAXALIGN(sizeof(dsa_segment_header)) +
+ MAXALIGN(sizeof(FreePageManager)) +
+ sizeof(dsa_pointer) * total_pages;
+
+ /* Add padding up to next page boundary. */
+ if (metadata_bytes % FPM_PAGE_SIZE != 0)
+ metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
+ if (total_size <= metadata_bytes)
+ return NULL;
+ usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
+ Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
+
+ /* See if that is enough... */
+ if (requested_pages > usable_pages)
+ {
+ /*
+ * We'll make an odd-sized segment, working forward from the requested
+ * number of pages.
+ */
+ usable_pages = requested_pages;
+ metadata_bytes =
+ MAXALIGN(sizeof(dsa_segment_header)) +
+ MAXALIGN(sizeof(FreePageManager)) +
+ usable_pages * sizeof(dsa_pointer);
+
+ /* Add padding up to next page boundary. */
+ if (metadata_bytes % FPM_PAGE_SIZE != 0)
+ metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
+ total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
+
+ /* Is that too large for dsa_pointer's addressing scheme? */
+ if (total_size > DSA_MAX_SEGMENT_SIZE)
+ return NULL;
+
+ /* Would that exceed the limit? */
+ if (total_size > area->control->max_total_segment_size -
+ area->control->total_segment_size)
+ return NULL;
+ }
+
+ /* Create the segment. */
+ segment = dsm_create(total_size, 0);
+ if (segment == NULL)
+ return NULL;
+ dsm_pin_segment(segment);
+ if (area->mapping_pinned)
+ dsm_pin_mapping(segment);
+
+ /* Store the handle in shared memory to be found by index. */
+ area->control->segment_handles[new_index] =
+ dsm_segment_handle(segment);
+ /* Track the highest segment index in the history of the area. */
+ if (area->control->high_segment_index < new_index)
+ area->control->high_segment_index = new_index;
+ /* Track the highest segment index this backend has ever mapped. */
+ if (area->high_segment_index < new_index)
+ area->high_segment_index = new_index;
+ /* Track total size of all segments. */
+ area->control->total_segment_size += total_size;
+ Assert(area->control->total_segment_size <=
+ area->control->max_total_segment_size);
+
+ /* Build a segment map for this segment in this backend. */
+ segment_map = &area->segment_maps[new_index];
+ segment_map->segment = segment;
+ segment_map->mapped_address = dsm_segment_address(segment);
+ segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
+ segment_map->fpm = (FreePageManager *)
+ (segment_map->mapped_address +
+ MAXALIGN(sizeof(dsa_segment_header)));
+ segment_map->pagemap = (dsa_pointer *)
+ (segment_map->mapped_address +
+ MAXALIGN(sizeof(dsa_segment_header)) +
+ MAXALIGN(sizeof(FreePageManager)));
+
+ /* Set up the free page map. */
+ FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
+ FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
+ usable_pages);
+
+ /* Set up the segment header and put it in the appropriate bin. */
+ segment_map->header->magic =
+ DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
+ segment_map->header->usable_pages = usable_pages;
+ segment_map->header->size = total_size;
+ segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
+ segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
+ segment_map->header->next =
+ area->control->segment_bins[segment_map->header->bin];
+ segment_map->header->freed = false;
+ area->control->segment_bins[segment_map->header->bin] = new_index;
+ if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
+ {
+ dsa_segment_map *next =
+ get_segment_by_index(area, segment_map->header->next);
+
+ Assert(next->header->bin == segment_map->header->bin);
+ next->header->prev = new_index;
+ }
+
+ return segment_map;
+}
+
+/*
+ * Check if any segments have been freed by destroy_superblock, so we can
+ * detach from them in this backend. This function is called by
+ * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
+ * received can be resolved to the correct segment.
+ *
+ * The danger we want to defend against is that there could be an old segment
+ * mapped into a given slot in this backend, and the dsa_pointer they have
+ * might refer to some new segment in the same slot. So those functions must
+ * be sure to process all instructions to detach from a freed segment that had
+ * been generated by the time this process received the dsa_pointer, before
+ * they call get_segment_by_index.
+ */
+static void
+check_for_freed_segments(dsa_area *area)
+{
+ size_t freed_segment_counter;
+
+ /*
+ * Any other process that has freed a segment has incremented
+ * freed_segment_counter while holding an LWLock, and that must precede
+ * any backend creating a new segment in the same slot while holding an
+ * LWLock, and that must precede the creation of any dsa_pointer pointing
+ * into the new segment which might reach us here, and the caller must
+ * have sent the dsa_pointer to this process using appropriate memory
+ * synchronization (some kind of locking or atomic primitive or system
+ * call). So all we need to do on the reading side is ask for the load of
+ * freed_segment_counter to follow the caller's load of the dsa_pointer it
+ * has, and we can be sure to detect any segments that had been freed as
+ * of the time that the dsa_pointer reached this process.
+ */
+ pg_read_barrier();
+ freed_segment_counter = area->control->freed_segment_counter;
+ if (unlikely(area->freed_segment_counter != freed_segment_counter))
+ {
+ /* Check all currently mapped segments to find what's been freed. */
+ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
+ check_for_freed_segments_locked(area);
+ LWLockRelease(DSA_AREA_LOCK(area));
+ }
+}
+
+/*
+ * Workhorse for check_for_freed_segments(), and also used directly in path
+ * where the area lock is already held. This should be called after acquiring
+ * the lock but before looking up any segment by index number, to make sure we
+ * unmap any stale segments that might have previously had the same index as a
+ * current segment.
+ */
+static void
+check_for_freed_segments_locked(dsa_area *area)
+{
+ size_t freed_segment_counter;
+ int i;
+
+ Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
+ freed_segment_counter = area->control->freed_segment_counter;
+ if (unlikely(area->freed_segment_counter != freed_segment_counter))
+ {
+ for (i = 0; i <= area->high_segment_index; ++i)
+ {
+ if (area->segment_maps[i].header != NULL &&
+ area->segment_maps[i].header->freed)
+ {
+ dsm_detach(area->segment_maps[i].segment);
+ area->segment_maps[i].segment = NULL;
+ area->segment_maps[i].header = NULL;
+ area->segment_maps[i].mapped_address = NULL;
+ }
+ }
+ area->freed_segment_counter = freed_segment_counter;
+ }
+}
diff --git a/src/backend/utils/mmgr/freepage.c b/src/backend/utils/mmgr/freepage.c
new file mode 100644
index 0000000..4208317
--- /dev/null
+++ b/src/backend/utils/mmgr/freepage.c
@@ -0,0 +1,1886 @@
+/*-------------------------------------------------------------------------
+ *
+ * freepage.c
+ * Management of free memory pages.
+ *
+ * The intention of this code is to provide infrastructure for memory
+ * allocators written specifically for PostgreSQL. At least in the case
+ * of dynamic shared memory, we can't simply use malloc() or even
+ * relatively thin wrappers like palloc() which sit on top of it, because
+ * no allocator built into the operating system will deal with relative
+ * pointers. In the future, we may find other cases in which greater
+ * control over our own memory management seems desirable.
+ *
+ * A FreePageManager keeps track of which 4kB pages of memory are currently
+ * unused from the point of view of some higher-level memory allocator.
+ * Unlike a user-facing allocator such as palloc(), a FreePageManager can
+ * only allocate and free in units of whole pages, and freeing an
+ * allocation can only be done given knowledge of its length in pages.
+ *
+ * Since a free page manager has only a fixed amount of dedicated memory,
+ * and since there is no underlying allocator, it uses the free pages
+ * it is given to manage to store its bookkeeping data. It keeps multiple
+ * freelists of runs of pages, sorted by the size of the run; the head of
+ * each freelist is stored in the FreePageManager itself, and the first
+ * page of each run contains a relative pointer to the next run. See
+ * FreePageManagerGetInternal for more details on how the freelists are
+ * managed.
+ *
+ * To avoid memory fragmentation, it's important to consolidate adjacent
+ * spans of pages whenever possible; otherwise, large allocation requests
+ * might not be satisfied even when sufficient contiguous space is
+ * available. Therefore, in addition to the freelists, we maintain an
+ * in-memory btree of free page ranges ordered by page number. If a
+ * range being freed precedes or follows a range that is already free,
+ * the existing range is extended; if it exactly bridges the gap between
+ * free ranges, then the two existing ranges are consolidated with the
+ * newly-freed range to form one great big range of free pages.
+ *
+ * When there is only one range of free pages, the btree is trivial and
+ * is stored within the FreePageManager proper; otherwise, pages are
+ * allocated from the area under management as needed. Even in cases
+ * where memory fragmentation is very severe, only a tiny fraction of
+ * the pages under management are consumed by this btree.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/utils/mmgr/freepage.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "lib/stringinfo.h"
+#include "miscadmin.h"
+
+#include "utils/freepage.h"
+#include "utils/relptr.h"
+
+
+/* Magic numbers to identify various page types */
+#define FREE_PAGE_SPAN_LEADER_MAGIC 0xea4020f0
+#define FREE_PAGE_LEAF_MAGIC 0x98eae728
+#define FREE_PAGE_INTERNAL_MAGIC 0x19aa32c9
+
+/* Doubly linked list of spans of free pages; stored in first page of span. */
+struct FreePageSpanLeader
+{
+ int magic; /* always FREE_PAGE_SPAN_LEADER_MAGIC */
+ Size npages; /* number of pages in span */
+ RelptrFreePageSpanLeader prev;
+ RelptrFreePageSpanLeader next;
+};
+
+/* Common header for btree leaf and internal pages. */
+typedef struct FreePageBtreeHeader
+{
+ int magic; /* FREE_PAGE_LEAF_MAGIC or
+ * FREE_PAGE_INTERNAL_MAGIC */
+ Size nused; /* number of items used */
+ RelptrFreePageBtree parent; /* uplink */
+} FreePageBtreeHeader;
+
+/* Internal key; points to next level of btree. */
+typedef struct FreePageBtreeInternalKey
+{
+ Size first_page; /* low bound for keys on child page */
+ RelptrFreePageBtree child; /* downlink */
+} FreePageBtreeInternalKey;
+
+/* Leaf key; no payload data. */
+typedef struct FreePageBtreeLeafKey
+{
+ Size first_page; /* first page in span */
+ Size npages; /* number of pages in span */
+} FreePageBtreeLeafKey;
+
+/* Work out how many keys will fit on a page. */
+#define FPM_ITEMS_PER_INTERNAL_PAGE \
+ ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
+ sizeof(FreePageBtreeInternalKey))
+#define FPM_ITEMS_PER_LEAF_PAGE \
+ ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \
+ sizeof(FreePageBtreeLeafKey))
+
+/* A btree page of either sort */
+struct FreePageBtree
+{
+ FreePageBtreeHeader hdr;
+ union
+ {
+ FreePageBtreeInternalKey internal_key[FPM_ITEMS_PER_INTERNAL_PAGE];
+ FreePageBtreeLeafKey leaf_key[FPM_ITEMS_PER_LEAF_PAGE];
+ } u;
+};
+
+/* Results of a btree search */
+typedef struct FreePageBtreeSearchResult
+{
+ FreePageBtree *page;
+ Size index;
+ bool found;
+ unsigned split_pages;
+} FreePageBtreeSearchResult;
+
+/* Helper functions */
+static void FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm,
+ FreePageBtree *btp);
+static Size FreePageBtreeCleanup(FreePageManager *fpm);
+static FreePageBtree *FreePageBtreeFindLeftSibling(char *base,
+ FreePageBtree *btp);
+static FreePageBtree *FreePageBtreeFindRightSibling(char *base,
+ FreePageBtree *btp);
+static Size FreePageBtreeFirstKey(FreePageBtree *btp);
+static FreePageBtree *FreePageBtreeGetRecycled(FreePageManager *fpm);
+static void FreePageBtreeInsertInternal(char *base, FreePageBtree *btp,
+ Size index, Size first_page, FreePageBtree *child);
+static void FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index,
+ Size first_page, Size npages);
+static void FreePageBtreeRecycle(FreePageManager *fpm, Size pageno);
+static void FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp,
+ Size index);
+static void FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp);
+static void FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
+ FreePageBtreeSearchResult *result);
+static Size FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page);
+static Size FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page);
+static FreePageBtree *FreePageBtreeSplitPage(FreePageManager *fpm,
+ FreePageBtree *btp);
+static void FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp);
+static void FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
+ FreePageBtree *parent, int level, StringInfo buf);
+static void FreePageManagerDumpSpans(FreePageManager *fpm,
+ FreePageSpanLeader *span, Size expected_pages,
+ StringInfo buf);
+static bool FreePageManagerGetInternal(FreePageManager *fpm, Size npages,
+ Size *first_page);
+static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page,
+ Size npages, bool soft);
+static void FreePagePopSpanLeader(FreePageManager *fpm, Size pageno);
+static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page,
+ Size npages);
+static Size FreePageManagerLargestContiguous(FreePageManager *fpm);
+static void FreePageManagerUpdateLargest(FreePageManager *fpm);
+
+#ifdef FPM_EXTRA_ASSERTS
+static Size sum_free_pages(FreePageManager *fpm);
+#endif
+
+/*
+ * Initialize a new, empty free page manager.
+ *
+ * 'fpm' should reference caller-provided memory large enough to contain a
+ * FreePageManager. We'll initialize it here.
+ *
+ * 'base' is the address to which all pointers are relative. When managing
+ * a dynamic shared memory segment, it should normally be the base of the
+ * segment. When managing backend-private memory, it can be either NULL or,
+ * if managing a single contiguous extent of memory, the start of that extent.
+ */
+void
+FreePageManagerInitialize(FreePageManager *fpm, char *base)
+{
+ Size f;
+
+ relptr_store(base, fpm->self, fpm);
+ relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
+ relptr_store(base, fpm->btree_recycle, (FreePageSpanLeader *) NULL);
+ fpm->btree_depth = 0;
+ fpm->btree_recycle_count = 0;
+ fpm->singleton_first_page = 0;
+ fpm->singleton_npages = 0;
+ fpm->contiguous_pages = 0;
+ fpm->contiguous_pages_dirty = true;
+#ifdef FPM_EXTRA_ASSERTS
+ fpm->free_pages = 0;
+#endif
+
+ for (f = 0; f < FPM_NUM_FREELISTS; f++)
+ relptr_store(base, fpm->freelist[f], (FreePageSpanLeader *) NULL);
+}
+
+/*
+ * Allocate a run of pages of the given length from the free page manager.
+ * The return value indicates whether we were able to satisfy the request;
+ * if true, the first page of the allocation is stored in *first_page.
+ */
+bool
+FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page)
+{
+ bool result;
+ Size contiguous_pages;
+
+ result = FreePageManagerGetInternal(fpm, npages, first_page);
+
+ /*
+ * It's a bit counterintuitive, but allocating pages can actually create
+ * opportunities for cleanup that create larger ranges. We might pull a
+ * key out of the btree that enables the item at the head of the btree
+ * recycle list to be inserted; and then if there are more items behind it
+ * one of those might cause two currently-separated ranges to merge,
+ * creating a single range of contiguous pages larger than any that
+ * existed previously. It might be worth trying to improve the cleanup
+ * algorithm to avoid such corner cases, but for now we just notice the
+ * condition and do the appropriate reporting.
+ */
+ contiguous_pages = FreePageBtreeCleanup(fpm);
+ if (fpm->contiguous_pages < contiguous_pages)
+ fpm->contiguous_pages = contiguous_pages;
+
+ /*
+ * FreePageManagerGetInternal may have set contiguous_pages_dirty.
+ * Recompute contiguous_pages if so.
+ */
+ FreePageManagerUpdateLargest(fpm);
+
+#ifdef FPM_EXTRA_ASSERTS
+ if (result)
+ {
+ Assert(fpm->free_pages >= npages);
+ fpm->free_pages -= npages;
+ }
+ Assert(fpm->free_pages == sum_free_pages(fpm));
+ Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
+#endif
+ return result;
+}
+
+#ifdef FPM_EXTRA_ASSERTS
+static void
+sum_free_pages_recurse(FreePageManager *fpm, FreePageBtree *btp, Size *sum)
+{
+ char *base = fpm_segment_base(fpm);
+
+ Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ||
+ btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
+ ++*sum;
+ if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
+ {
+ Size index;
+
+
+ for (index = 0; index < btp->hdr.nused; ++index)
+ {
+ FreePageBtree *child;
+
+ child = relptr_access(base, btp->u.internal_key[index].child);
+ sum_free_pages_recurse(fpm, child, sum);
+ }
+ }
+}
+static Size
+sum_free_pages(FreePageManager *fpm)
+{
+ FreePageSpanLeader *recycle;
+ char *base = fpm_segment_base(fpm);
+ Size sum = 0;
+ int list;
+
+ /* Count the spans by scanning the freelists. */
+ for (list = 0; list < FPM_NUM_FREELISTS; ++list)
+ {
+
+ if (!relptr_is_null(fpm->freelist[list]))
+ {
+ FreePageSpanLeader *candidate =
+ relptr_access(base, fpm->freelist[list]);
+
+ do
+ {
+ sum += candidate->npages;
+ candidate = relptr_access(base, candidate->next);
+ } while (candidate != NULL);
+ }
+ }
+
+ /* Count btree internal pages. */
+ if (fpm->btree_depth > 0)
+ {
+ FreePageBtree *root = relptr_access(base, fpm->btree_root);
+
+ sum_free_pages_recurse(fpm, root, &sum);
+ }
+
+ /* Count the recycle list. */
+ for (recycle = relptr_access(base, fpm->btree_recycle);
+ recycle != NULL;
+ recycle = relptr_access(base, recycle->next))
+ {
+ Assert(recycle->npages == 1);
+ ++sum;
+ }
+
+ return sum;
+}
+#endif
+
+/*
+ * Compute the size of the largest run of pages that the user could
+ * successfully get.
+ */
+static Size
+FreePageManagerLargestContiguous(FreePageManager *fpm)
+{
+ char *base;
+ Size largest;
+
+ base = fpm_segment_base(fpm);
+ largest = 0;
+ if (!relptr_is_null(fpm->freelist[FPM_NUM_FREELISTS - 1]))
+ {
+ FreePageSpanLeader *candidate;
+
+ candidate = relptr_access(base, fpm->freelist[FPM_NUM_FREELISTS - 1]);
+ do
+ {
+ if (candidate->npages > largest)
+ largest = candidate->npages;
+ candidate = relptr_access(base, candidate->next);
+ } while (candidate != NULL);
+ }
+ else
+ {
+ Size f = FPM_NUM_FREELISTS - 1;
+
+ do
+ {
+ --f;
+ if (!relptr_is_null(fpm->freelist[f]))
+ {
+ largest = f + 1;
+ break;
+ }
+ } while (f > 0);
+ }
+
+ return largest;
+}
+
+/*
+ * Recompute the size of the largest run of pages that the user could
+ * successfully get, if it has been marked dirty.
+ */
+static void
+FreePageManagerUpdateLargest(FreePageManager *fpm)
+{
+ if (!fpm->contiguous_pages_dirty)
+ return;
+
+ fpm->contiguous_pages = FreePageManagerLargestContiguous(fpm);
+ fpm->contiguous_pages_dirty = false;
+}
+
+/*
+ * Transfer a run of pages to the free page manager.
+ */
+void
+FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages)
+{
+ Size contiguous_pages;
+
+ Assert(npages > 0);
+
+ /* Record the new pages. */
+ contiguous_pages =
+ FreePageManagerPutInternal(fpm, first_page, npages, false);
+
+ /*
+ * If the new range we inserted into the page manager was contiguous with
+ * an existing range, it may have opened up cleanup opportunities.
+ */
+ if (contiguous_pages > npages)
+ {
+ Size cleanup_contiguous_pages;
+
+ cleanup_contiguous_pages = FreePageBtreeCleanup(fpm);
+ if (cleanup_contiguous_pages > contiguous_pages)
+ contiguous_pages = cleanup_contiguous_pages;
+ }
+
+ /* See if we now have a new largest chunk. */
+ if (fpm->contiguous_pages < contiguous_pages)
+ fpm->contiguous_pages = contiguous_pages;
+
+ /*
+ * The earlier call to FreePageManagerPutInternal may have set
+ * contiguous_pages_dirty if it needed to allocate internal pages, so
+ * recompute contiguous_pages if necessary.
+ */
+ FreePageManagerUpdateLargest(fpm);
+
+#ifdef FPM_EXTRA_ASSERTS
+ fpm->free_pages += npages;
+ Assert(fpm->free_pages == sum_free_pages(fpm));
+ Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm));
+#endif
+}
+
+/*
+ * Produce a debugging dump of the state of a free page manager.
+ */
+char *
+FreePageManagerDump(FreePageManager *fpm)
+{
+ char *base = fpm_segment_base(fpm);
+ StringInfoData buf;
+ FreePageSpanLeader *recycle;
+ bool dumped_any_freelist = false;
+ Size f;
+
+ /* Initialize output buffer. */
+ initStringInfo(&buf);
+
+ /* Dump general stuff. */
+ appendStringInfo(&buf, "metadata: self %zu max contiguous pages = %zu\n",
+ relptr_offset(fpm->self), fpm->contiguous_pages);
+
+ /* Dump btree. */
+ if (fpm->btree_depth > 0)
+ {
+ FreePageBtree *root;
+
+ appendStringInfo(&buf, "btree depth %u:\n", fpm->btree_depth);
+ root = relptr_access(base, fpm->btree_root);
+ FreePageManagerDumpBtree(fpm, root, NULL, 0, &buf);
+ }
+ else if (fpm->singleton_npages > 0)
+ {
+ appendStringInfo(&buf, "singleton: %zu(%zu)\n",
+ fpm->singleton_first_page, fpm->singleton_npages);
+ }
+
+ /* Dump btree recycle list. */
+ recycle = relptr_access(base, fpm->btree_recycle);
+ if (recycle != NULL)
+ {
+ appendStringInfoString(&buf, "btree recycle:");
+ FreePageManagerDumpSpans(fpm, recycle, 1, &buf);
+ }
+
+ /* Dump free lists. */
+ for (f = 0; f < FPM_NUM_FREELISTS; ++f)
+ {
+ FreePageSpanLeader *span;
+
+ if (relptr_is_null(fpm->freelist[f]))
+ continue;
+ if (!dumped_any_freelist)
+ {
+ appendStringInfoString(&buf, "freelists:\n");
+ dumped_any_freelist = true;
+ }
+ appendStringInfo(&buf, " %zu:", f + 1);
+ span = relptr_access(base, fpm->freelist[f]);
+ FreePageManagerDumpSpans(fpm, span, f + 1, &buf);
+ }
+
+ /* And return result to caller. */
+ return buf.data;
+}
+
+
+/*
+ * The first_page value stored at index zero in any non-root page must match
+ * the first_page value stored in its parent at the index which points to that
+ * page. So when the value stored at index zero in a btree page changes, we've
+ * got to walk up the tree adjusting ancestor keys until we reach an ancestor
+ * where that key isn't index zero. This function should be called after
+ * updating the first key on the target page; it will propagate the change
+ * upward as far as needed.
+ *
+ * We assume here that the first key on the page has not changed enough to
+ * require changes in the ordering of keys on its ancestor pages. Thus,
+ * if we search the parent page for the first key greater than or equal to
+ * the first key on the current page, the downlink to this page will be either
+ * the exact index returned by the search (if the first key decreased)
+ * or one less (if the first key increased).
+ */
+static void
+FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, FreePageBtree *btp)
+{
+ char *base = fpm_segment_base(fpm);
+ Size first_page;
+ FreePageBtree *parent;
+ FreePageBtree *child;
+
+ /* This might be either a leaf or an internal page. */
+ Assert(btp->hdr.nused > 0);
+ if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ {
+ Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
+ first_page = btp->u.leaf_key[0].first_page;
+ }
+ else
+ {
+ Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
+ first_page = btp->u.internal_key[0].first_page;
+ }
+ child = btp;
+
+ /* Loop until we find an ancestor that does not require adjustment. */
+ for (;;)
+ {
+ Size s;
+
+ parent = relptr_access(base, child->hdr.parent);
+ if (parent == NULL)
+ break;
+ s = FreePageBtreeSearchInternal(parent, first_page);
+
+ /* Key is either at index s or index s-1; figure out which. */
+ if (s >= parent->hdr.nused)
+ {
+ Assert(s == parent->hdr.nused);
+ --s;
+ }
+ else
+ {
+ FreePageBtree *check;
+
+ check = relptr_access(base, parent->u.internal_key[s].child);
+ if (check != child)
+ {
+ Assert(s > 0);
+ --s;
+ }
+ }
+
+#ifdef USE_ASSERT_CHECKING
+ /* Debugging double-check. */
+ {
+ FreePageBtree *check;
+
+ check = relptr_access(base, parent->u.internal_key[s].child);
+ Assert(s < parent->hdr.nused);
+ Assert(child == check);
+ }
+#endif
+
+ /* Update the parent key. */
+ parent->u.internal_key[s].first_page = first_page;
+
+ /*
+ * If this is the first key in the parent, go up another level; else
+ * done.
+ */
+ if (s > 0)
+ break;
+ child = parent;
+ }
+}
+
+/*
+ * Attempt to reclaim space from the free-page btree. The return value is
+ * the largest range of contiguous pages created by the cleanup operation.
+ */
+static Size
+FreePageBtreeCleanup(FreePageManager *fpm)
+{
+ char *base = fpm_segment_base(fpm);
+ Size max_contiguous_pages = 0;
+
+ /* Attempt to shrink the depth of the btree. */
+ while (!relptr_is_null(fpm->btree_root))
+ {
+ FreePageBtree *root = relptr_access(base, fpm->btree_root);
+
+ /* If the root contains only one key, reduce depth by one. */
+ if (root->hdr.nused == 1)
+ {
+ /* Shrink depth of tree by one. */
+ Assert(fpm->btree_depth > 0);
+ --fpm->btree_depth;
+ if (root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ {
+ /* If root is a leaf, convert only entry to singleton range. */
+ relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
+ fpm->singleton_first_page = root->u.leaf_key[0].first_page;
+ fpm->singleton_npages = root->u.leaf_key[0].npages;
+ }
+ else
+ {
+ FreePageBtree *newroot;
+
+ /* If root is an internal page, make only child the root. */
+ Assert(root->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ relptr_copy(fpm->btree_root, root->u.internal_key[0].child);
+ newroot = relptr_access(base, fpm->btree_root);
+ relptr_store(base, newroot->hdr.parent, (FreePageBtree *) NULL);
+ }
+ FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, root));
+ }
+ else if (root->hdr.nused == 2 &&
+ root->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ {
+ Size end_of_first;
+ Size start_of_second;
+
+ end_of_first = root->u.leaf_key[0].first_page +
+ root->u.leaf_key[0].npages;
+ start_of_second = root->u.leaf_key[1].first_page;
+
+ if (end_of_first + 1 == start_of_second)
+ {
+ Size root_page = fpm_pointer_to_page(base, root);
+
+ if (end_of_first == root_page)
+ {
+ FreePagePopSpanLeader(fpm, root->u.leaf_key[0].first_page);
+ FreePagePopSpanLeader(fpm, root->u.leaf_key[1].first_page);
+ fpm->singleton_first_page = root->u.leaf_key[0].first_page;
+ fpm->singleton_npages = root->u.leaf_key[0].npages +
+ root->u.leaf_key[1].npages + 1;
+ fpm->btree_depth = 0;
+ relptr_store(base, fpm->btree_root,
+ (FreePageBtree *) NULL);
+ FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
+ fpm->singleton_npages);
+ Assert(max_contiguous_pages == 0);
+ max_contiguous_pages = fpm->singleton_npages;
+ }
+ }
+
+ /* Whether it worked or not, it's time to stop. */
+ break;
+ }
+ else
+ {
+ /* Nothing more to do. Stop. */
+ break;
+ }
+ }
+
+ /*
+ * Attempt to free recycled btree pages. We skip this if releasing the
+ * recycled page would require a btree page split, because the page we're
+ * trying to recycle would be consumed by the split, which would be
+ * counterproductive.
+ *
+ * We also currently only ever attempt to recycle the first page on the
+ * list; that could be made more aggressive, but it's not clear that the
+ * complexity would be worthwhile.
+ */
+ while (fpm->btree_recycle_count > 0)
+ {
+ FreePageBtree *btp;
+ Size first_page;
+ Size contiguous_pages;
+
+ btp = FreePageBtreeGetRecycled(fpm);
+ first_page = fpm_pointer_to_page(base, btp);
+ contiguous_pages = FreePageManagerPutInternal(fpm, first_page, 1, true);
+ if (contiguous_pages == 0)
+ {
+ FreePageBtreeRecycle(fpm, first_page);
+ break;
+ }
+ else
+ {
+ if (contiguous_pages > max_contiguous_pages)
+ max_contiguous_pages = contiguous_pages;
+ }
+ }
+
+ return max_contiguous_pages;
+}
+
+/*
+ * Consider consolidating the given page with its left or right sibling,
+ * if it's fairly empty.
+ */
+static void
+FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp)
+{
+ char *base = fpm_segment_base(fpm);
+ FreePageBtree *np;
+ Size max;
+
+ /*
+ * We only try to consolidate pages that are less than a third full. We
+ * could be more aggressive about this, but that might risk performing
+ * consolidation only to end up splitting again shortly thereafter. Since
+ * the btree should be very small compared to the space under management,
+ * our goal isn't so much to ensure that it always occupies the absolutely
+ * smallest possible number of pages as to reclaim pages before things get
+ * too egregiously out of hand.
+ */
+ if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ max = FPM_ITEMS_PER_LEAF_PAGE;
+ else
+ {
+ Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ max = FPM_ITEMS_PER_INTERNAL_PAGE;
+ }
+ if (btp->hdr.nused >= max / 3)
+ return;
+
+ /*
+ * If we can fit our right sibling's keys onto this page, consolidate.
+ */
+ np = FreePageBtreeFindRightSibling(base, btp);
+ if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
+ {
+ if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ {
+ memcpy(&btp->u.leaf_key[btp->hdr.nused], &np->u.leaf_key[0],
+ sizeof(FreePageBtreeLeafKey) * np->hdr.nused);
+ btp->hdr.nused += np->hdr.nused;
+ }
+ else
+ {
+ memcpy(&btp->u.internal_key[btp->hdr.nused], &np->u.internal_key[0],
+ sizeof(FreePageBtreeInternalKey) * np->hdr.nused);
+ btp->hdr.nused += np->hdr.nused;
+ FreePageBtreeUpdateParentPointers(base, btp);
+ }
+ FreePageBtreeRemovePage(fpm, np);
+ return;
+ }
+
+ /*
+ * If we can fit our keys onto our left sibling's page, consolidate. In
+ * this case, we move our keys onto the other page rather than vice versa,
+ * to avoid having to adjust ancestor keys.
+ */
+ np = FreePageBtreeFindLeftSibling(base, btp);
+ if (np != NULL && btp->hdr.nused + np->hdr.nused <= max)
+ {
+ if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ {
+ memcpy(&np->u.leaf_key[np->hdr.nused], &btp->u.leaf_key[0],
+ sizeof(FreePageBtreeLeafKey) * btp->hdr.nused);
+ np->hdr.nused += btp->hdr.nused;
+ }
+ else
+ {
+ memcpy(&np->u.internal_key[np->hdr.nused], &btp->u.internal_key[0],
+ sizeof(FreePageBtreeInternalKey) * btp->hdr.nused);
+ np->hdr.nused += btp->hdr.nused;
+ FreePageBtreeUpdateParentPointers(base, np);
+ }
+ FreePageBtreeRemovePage(fpm, btp);
+ return;
+ }
+}
+
+/*
+ * Find the passed page's left sibling; that is, the page at the same level
+ * of the tree whose keyspace immediately precedes ours.
+ */
+static FreePageBtree *
+FreePageBtreeFindLeftSibling(char *base, FreePageBtree *btp)
+{
+ FreePageBtree *p = btp;
+ int levels = 0;
+
+ /* Move up until we can move left. */
+ for (;;)
+ {
+ Size first_page;
+ Size index;
+
+ first_page = FreePageBtreeFirstKey(p);
+ p = relptr_access(base, p->hdr.parent);
+
+ if (p == NULL)
+ return NULL; /* we were passed the rightmost page */
+
+ index = FreePageBtreeSearchInternal(p, first_page);
+ if (index > 0)
+ {
+ Assert(p->u.internal_key[index].first_page == first_page);
+ p = relptr_access(base, p->u.internal_key[index - 1].child);
+ break;
+ }
+ Assert(index == 0);
+ ++levels;
+ }
+
+ /* Descend left. */
+ while (levels > 0)
+ {
+ Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ p = relptr_access(base, p->u.internal_key[p->hdr.nused - 1].child);
+ --levels;
+ }
+ Assert(p->hdr.magic == btp->hdr.magic);
+
+ return p;
+}
+
+/*
+ * Find the passed page's right sibling; that is, the page at the same level
+ * of the tree whose keyspace immediately follows ours.
+ */
+static FreePageBtree *
+FreePageBtreeFindRightSibling(char *base, FreePageBtree *btp)
+{
+ FreePageBtree *p = btp;
+ int levels = 0;
+
+ /* Move up until we can move right. */
+ for (;;)
+ {
+ Size first_page;
+ Size index;
+
+ first_page = FreePageBtreeFirstKey(p);
+ p = relptr_access(base, p->hdr.parent);
+
+ if (p == NULL)
+ return NULL; /* we were passed the rightmost page */
+
+ index = FreePageBtreeSearchInternal(p, first_page);
+ if (index < p->hdr.nused - 1)
+ {
+ Assert(p->u.internal_key[index].first_page == first_page);
+ p = relptr_access(base, p->u.internal_key[index + 1].child);
+ break;
+ }
+ Assert(index == p->hdr.nused - 1);
+ ++levels;
+ }
+
+ /* Descend left. */
+ while (levels > 0)
+ {
+ Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ p = relptr_access(base, p->u.internal_key[0].child);
+ --levels;
+ }
+ Assert(p->hdr.magic == btp->hdr.magic);
+
+ return p;
+}
+
+/*
+ * Get the first key on a btree page.
+ */
+static Size
+FreePageBtreeFirstKey(FreePageBtree *btp)
+{
+ Assert(btp->hdr.nused > 0);
+
+ if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ return btp->u.leaf_key[0].first_page;
+ else
+ {
+ Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ return btp->u.internal_key[0].first_page;
+ }
+}
+
+/*
+ * Get a page from the btree recycle list for use as a btree page.
+ */
+static FreePageBtree *
+FreePageBtreeGetRecycled(FreePageManager *fpm)
+{
+ char *base = fpm_segment_base(fpm);
+ FreePageSpanLeader *victim = relptr_access(base, fpm->btree_recycle);
+ FreePageSpanLeader *newhead;
+
+ Assert(victim != NULL);
+ newhead = relptr_access(base, victim->next);
+ if (newhead != NULL)
+ relptr_copy(newhead->prev, victim->prev);
+ relptr_store(base, fpm->btree_recycle, newhead);
+ Assert(fpm_pointer_is_page_aligned(base, victim));
+ fpm->btree_recycle_count--;
+ return (FreePageBtree *) victim;
+}
+
+/*
+ * Insert an item into an internal page.
+ */
+static void
+FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index,
+ Size first_page, FreePageBtree *child)
+{
+ Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE);
+ Assert(index <= btp->hdr.nused);
+ memmove(&btp->u.internal_key[index + 1], &btp->u.internal_key[index],
+ sizeof(FreePageBtreeInternalKey) * (btp->hdr.nused - index));
+ btp->u.internal_key[index].first_page = first_page;
+ relptr_store(base, btp->u.internal_key[index].child, child);
+ ++btp->hdr.nused;
+}
+
+/*
+ * Insert an item into a leaf page.
+ */
+static void
+FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, Size first_page,
+ Size npages)
+{
+ Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
+ Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE);
+ Assert(index <= btp->hdr.nused);
+ memmove(&btp->u.leaf_key[index + 1], &btp->u.leaf_key[index],
+ sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));
+ btp->u.leaf_key[index].first_page = first_page;
+ btp->u.leaf_key[index].npages = npages;
+ ++btp->hdr.nused;
+}
+
+/*
+ * Put a page on the btree recycle list.
+ */
+static void
+FreePageBtreeRecycle(FreePageManager *fpm, Size pageno)
+{
+ char *base = fpm_segment_base(fpm);
+ FreePageSpanLeader *head = relptr_access(base, fpm->btree_recycle);
+ FreePageSpanLeader *span;
+
+ span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);
+ span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
+ span->npages = 1;
+ relptr_store(base, span->next, head);
+ relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
+ if (head != NULL)
+ relptr_store(base, head->prev, span);
+ relptr_store(base, fpm->btree_recycle, span);
+ fpm->btree_recycle_count++;
+}
+
+/*
+ * Remove an item from the btree at the given position on the given page.
+ */
+static void
+FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, Size index)
+{
+ Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
+ Assert(index < btp->hdr.nused);
+
+ /* When last item is removed, extirpate entire page from btree. */
+ if (btp->hdr.nused == 1)
+ {
+ FreePageBtreeRemovePage(fpm, btp);
+ return;
+ }
+
+ /* Physically remove the key from the page. */
+ --btp->hdr.nused;
+ if (index < btp->hdr.nused)
+ memmove(&btp->u.leaf_key[index], &btp->u.leaf_key[index + 1],
+ sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index));
+
+ /* If we just removed the first key, adjust ancestor keys. */
+ if (index == 0)
+ FreePageBtreeAdjustAncestorKeys(fpm, btp);
+
+ /* Consider whether to consolidate this page with a sibling. */
+ FreePageBtreeConsolidate(fpm, btp);
+}
+
+/*
+ * Remove a page from the btree. Caller is responsible for having relocated
+ * any keys from this page that are still wanted. The page is placed on the
+ * recycled list.
+ */
+static void
+FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp)
+{
+ char *base = fpm_segment_base(fpm);
+ FreePageBtree *parent;
+ Size index;
+ Size first_page;
+
+ for (;;)
+ {
+ /* Find parent page. */
+ parent = relptr_access(base, btp->hdr.parent);
+ if (parent == NULL)
+ {
+ /* We are removing the root page. */
+ relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL);
+ fpm->btree_depth = 0;
+ Assert(fpm->singleton_first_page == 0);
+ Assert(fpm->singleton_npages == 0);
+ return;
+ }
+
+ /*
+ * If the parent contains only one item, we need to remove it as well.
+ */
+ if (parent->hdr.nused > 1)
+ break;
+ FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));
+ btp = parent;
+ }
+
+ /* Find and remove the downlink. */
+ first_page = FreePageBtreeFirstKey(btp);
+ if (parent->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ {
+ index = FreePageBtreeSearchLeaf(parent, first_page);
+ Assert(index < parent->hdr.nused);
+ if (index < parent->hdr.nused - 1)
+ memmove(&parent->u.leaf_key[index],
+ &parent->u.leaf_key[index + 1],
+ sizeof(FreePageBtreeLeafKey)
+ * (parent->hdr.nused - index - 1));
+ }
+ else
+ {
+ index = FreePageBtreeSearchInternal(parent, first_page);
+ Assert(index < parent->hdr.nused);
+ if (index < parent->hdr.nused - 1)
+ memmove(&parent->u.internal_key[index],
+ &parent->u.internal_key[index + 1],
+ sizeof(FreePageBtreeInternalKey)
+ * (parent->hdr.nused - index - 1));
+ }
+ parent->hdr.nused--;
+ Assert(parent->hdr.nused > 0);
+
+ /* Recycle the page. */
+ FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp));
+
+ /* Adjust ancestor keys if needed. */
+ if (index == 0)
+ FreePageBtreeAdjustAncestorKeys(fpm, parent);
+
+ /* Consider whether to consolidate the parent with a sibling. */
+ FreePageBtreeConsolidate(fpm, parent);
+}
+
+/*
+ * Search the btree for an entry for the given first page and initialize
+ * *result with the results of the search. result->page and result->index
+ * indicate either the position of an exact match or the position at which
+ * the new key should be inserted. result->found is true for an exact match,
+ * otherwise false. result->split_pages will contain the number of additional
+ * btree pages that will be needed when performing a split to insert a key.
+ * Except as described above, the contents of fields in the result object are
+ * undefined on return.
+ */
+static void
+FreePageBtreeSearch(FreePageManager *fpm, Size first_page,
+ FreePageBtreeSearchResult *result)
+{
+ char *base = fpm_segment_base(fpm);
+ FreePageBtree *btp = relptr_access(base, fpm->btree_root);
+ Size index;
+
+ result->split_pages = 1;
+
+ /* If the btree is empty, there's nothing to find. */
+ if (btp == NULL)
+ {
+ result->page = NULL;
+ result->found = false;
+ return;
+ }
+
+ /* Descend until we hit a leaf. */
+ while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
+ {
+ FreePageBtree *child;
+ bool found_exact;
+
+ index = FreePageBtreeSearchInternal(btp, first_page);
+ found_exact = index < btp->hdr.nused &&
+ btp->u.internal_key[index].first_page == first_page;
+
+ /*
+ * If we found an exact match we descend directly. Otherwise, we
+ * descend into the child to the left if possible so that we can find
+ * the insertion point at that child's high end.
+ */
+ if (!found_exact && index > 0)
+ --index;
+
+ /* Track required split depth for leaf insert. */
+ if (btp->hdr.nused >= FPM_ITEMS_PER_INTERNAL_PAGE)
+ {
+ Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE);
+ result->split_pages++;
+ }
+ else
+ result->split_pages = 0;
+
+ /* Descend to appropriate child page. */
+ Assert(index < btp->hdr.nused);
+ child = relptr_access(base, btp->u.internal_key[index].child);
+ Assert(relptr_access(base, child->hdr.parent) == btp);
+ btp = child;
+ }
+
+ /* Track required split depth for leaf insert. */
+ if (btp->hdr.nused >= FPM_ITEMS_PER_LEAF_PAGE)
+ {
+ Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE);
+ result->split_pages++;
+ }
+ else
+ result->split_pages = 0;
+
+ /* Search leaf page. */
+ index = FreePageBtreeSearchLeaf(btp, first_page);
+
+ /* Assemble results. */
+ result->page = btp;
+ result->index = index;
+ result->found = index < btp->hdr.nused &&
+ first_page == btp->u.leaf_key[index].first_page;
+}
+
+/*
+ * Search an internal page for the first key greater than or equal to a given
+ * page number. Returns the index of that key, or one greater than the number
+ * of keys on the page if none.
+ */
+static Size
+FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page)
+{
+ Size low = 0;
+ Size high = btp->hdr.nused;
+
+ Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ Assert(high > 0 && high <= FPM_ITEMS_PER_INTERNAL_PAGE);
+
+ while (low < high)
+ {
+ Size mid = (low + high) / 2;
+ Size val = btp->u.internal_key[mid].first_page;
+
+ if (first_page == val)
+ return mid;
+ else if (first_page < val)
+ high = mid;
+ else
+ low = mid + 1;
+ }
+
+ return low;
+}
+
+/*
+ * Search a leaf page for the first key greater than or equal to a given
+ * page number. Returns the index of that key, or one greater than the number
+ * of keys on the page if none.
+ */
+static Size
+FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page)
+{
+ Size low = 0;
+ Size high = btp->hdr.nused;
+
+ Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC);
+ Assert(high > 0 && high <= FPM_ITEMS_PER_LEAF_PAGE);
+
+ while (low < high)
+ {
+ Size mid = (low + high) / 2;
+ Size val = btp->u.leaf_key[mid].first_page;
+
+ if (first_page == val)
+ return mid;
+ else if (first_page < val)
+ high = mid;
+ else
+ low = mid + 1;
+ }
+
+ return low;
+}
+
+/*
+ * Allocate a new btree page and move half the keys from the provided page
+ * to the new page. Caller is responsible for making sure that there's a
+ * page available from fpm->btree_recycle. Returns a pointer to the new page,
+ * to which caller must add a downlink.
+ */
+static FreePageBtree *
+FreePageBtreeSplitPage(FreePageManager *fpm, FreePageBtree *btp)
+{
+ FreePageBtree *newsibling;
+
+ newsibling = FreePageBtreeGetRecycled(fpm);
+ newsibling->hdr.magic = btp->hdr.magic;
+ newsibling->hdr.nused = btp->hdr.nused / 2;
+ relptr_copy(newsibling->hdr.parent, btp->hdr.parent);
+ btp->hdr.nused -= newsibling->hdr.nused;
+
+ if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC)
+ memcpy(&newsibling->u.leaf_key,
+ &btp->u.leaf_key[btp->hdr.nused],
+ sizeof(FreePageBtreeLeafKey) * newsibling->hdr.nused);
+ else
+ {
+ Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ memcpy(&newsibling->u.internal_key,
+ &btp->u.internal_key[btp->hdr.nused],
+ sizeof(FreePageBtreeInternalKey) * newsibling->hdr.nused);
+ FreePageBtreeUpdateParentPointers(fpm_segment_base(fpm), newsibling);
+ }
+
+ return newsibling;
+}
+
+/*
+ * When internal pages are split or merged, the parent pointers of their
+ * children must be updated.
+ */
+static void
+FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp)
+{
+ Size i;
+
+ Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC);
+ for (i = 0; i < btp->hdr.nused; ++i)
+ {
+ FreePageBtree *child;
+
+ child = relptr_access(base, btp->u.internal_key[i].child);
+ relptr_store(base, child->hdr.parent, btp);
+ }
+}
+
+/*
+ * Debugging dump of btree data.
+ */
+static void
+FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp,
+ FreePageBtree *parent, int level, StringInfo buf)
+{
+ char *base = fpm_segment_base(fpm);
+ Size pageno = fpm_pointer_to_page(base, btp);
+ Size index;
+ FreePageBtree *check_parent;
+
+ check_stack_depth();
+ check_parent = relptr_access(base, btp->hdr.parent);
+ appendStringInfo(buf, " %zu@%d %c", pageno, level,
+ btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ? 'i' : 'l');
+ if (parent != check_parent)
+ appendStringInfo(buf, " [actual parent %zu, expected %zu]",
+ fpm_pointer_to_page(base, check_parent),
+ fpm_pointer_to_page(base, parent));
+ appendStringInfoChar(buf, ':');
+ for (index = 0; index < btp->hdr.nused; ++index)
+ {
+ if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
+ appendStringInfo(buf, " %zu->%zu",
+ btp->u.internal_key[index].first_page,
+ relptr_offset(btp->u.internal_key[index].child) / FPM_PAGE_SIZE);
+ else
+ appendStringInfo(buf, " %zu(%zu)",
+ btp->u.leaf_key[index].first_page,
+ btp->u.leaf_key[index].npages);
+ }
+ appendStringInfoChar(buf, '\n');
+
+ if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC)
+ {
+ for (index = 0; index < btp->hdr.nused; ++index)
+ {
+ FreePageBtree *child;
+
+ child = relptr_access(base, btp->u.internal_key[index].child);
+ FreePageManagerDumpBtree(fpm, child, btp, level + 1, buf);
+ }
+ }
+}
+
+/*
+ * Debugging dump of free-span data.
+ */
+static void
+FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span,
+ Size expected_pages, StringInfo buf)
+{
+ char *base = fpm_segment_base(fpm);
+
+ while (span != NULL)
+ {
+ if (span->npages != expected_pages)
+ appendStringInfo(buf, " %zu(%zu)", fpm_pointer_to_page(base, span),
+ span->npages);
+ else
+ appendStringInfo(buf, " %zu", fpm_pointer_to_page(base, span));
+ span = relptr_access(base, span->next);
+ }
+
+ appendStringInfoChar(buf, '\n');
+}
+
+/*
+ * This function allocates a run of pages of the given length from the free
+ * page manager.
+ */
+static bool
+FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page)
+{
+ char *base = fpm_segment_base(fpm);
+ FreePageSpanLeader *victim = NULL;
+ FreePageSpanLeader *prev;
+ FreePageSpanLeader *next;
+ FreePageBtreeSearchResult result;
+ Size victim_page = 0; /* placate compiler */
+ Size f;
+
+ /*
+ * Search for a free span.
+ *
+ * Right now, we use a simple best-fit policy here, but it's possible for
+ * this to result in memory fragmentation if we're repeatedly asked to
+ * allocate chunks just a little smaller than what we have available.
+ * Hopefully, this is unlikely, because we expect most requests to be
+ * single pages or superblock-sized chunks -- but no policy can be optimal
+ * under all circumstances unless it has knowledge of future allocation
+ * patterns.
+ */
+ for (f = Min(npages, FPM_NUM_FREELISTS) - 1; f < FPM_NUM_FREELISTS; ++f)
+ {
+ /* Skip empty freelists. */
+ if (relptr_is_null(fpm->freelist[f]))
+ continue;
+
+ /*
+ * All of the freelists except the last one contain only items of a
+ * single size, so we just take the first one. But the final free
+ * list contains everything too big for any of the other lists, so we
+ * need to search the list.
+ */
+ if (f < FPM_NUM_FREELISTS - 1)
+ victim = relptr_access(base, fpm->freelist[f]);
+ else
+ {
+ FreePageSpanLeader *candidate;
+
+ candidate = relptr_access(base, fpm->freelist[f]);
+ do
+ {
+ if (candidate->npages >= npages && (victim == NULL ||
+ victim->npages > candidate->npages))
+ {
+ victim = candidate;
+ if (victim->npages == npages)
+ break;
+ }
+ candidate = relptr_access(base, candidate->next);
+ } while (candidate != NULL);
+ }
+ break;
+ }
+
+ /* If we didn't find an allocatable span, return failure. */
+ if (victim == NULL)
+ return false;
+
+ /* Remove span from free list. */
+ Assert(victim->magic == FREE_PAGE_SPAN_LEADER_MAGIC);
+ prev = relptr_access(base, victim->prev);
+ next = relptr_access(base, victim->next);
+ if (prev != NULL)
+ relptr_copy(prev->next, victim->next);
+ else
+ relptr_copy(fpm->freelist[f], victim->next);
+ if (next != NULL)
+ relptr_copy(next->prev, victim->prev);
+ victim_page = fpm_pointer_to_page(base, victim);
+
+ /* Decide whether we might be invalidating contiguous_pages. */
+ if (f == FPM_NUM_FREELISTS - 1 &&
+ victim->npages == fpm->contiguous_pages)
+ {
+ /*
+ * The victim span came from the oversized freelist, and had the same
+ * size as the longest span. There may or may not be another one of
+ * the same size, so contiguous_pages must be recomputed just to be
+ * safe.
+ */
+ fpm->contiguous_pages_dirty = true;
+ }
+ else if (f + 1 == fpm->contiguous_pages &&
+ relptr_is_null(fpm->freelist[f]))
+ {
+ /*
+ * The victim span came from a fixed sized freelist, and it was the
+ * list for spans of the same size as the current longest span, and
+ * the list is now empty after removing the victim. So
+ * contiguous_pages must be recomputed without a doubt.
+ */
+ fpm->contiguous_pages_dirty = true;
+ }
+
+ /*
+ * If we haven't initialized the btree yet, the victim must be the single
+ * span stored within the FreePageManager itself. Otherwise, we need to
+ * update the btree.
+ */
+ if (relptr_is_null(fpm->btree_root))
+ {
+ Assert(victim_page == fpm->singleton_first_page);
+ Assert(victim->npages == fpm->singleton_npages);
+ Assert(victim->npages >= npages);
+ fpm->singleton_first_page += npages;
+ fpm->singleton_npages -= npages;
+ if (fpm->singleton_npages > 0)
+ FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
+ fpm->singleton_npages);
+ }
+ else
+ {
+ /*
+ * If the span we found is exactly the right size, remove it from the
+ * btree completely. Otherwise, adjust the btree entry to reflect the
+ * still-unallocated portion of the span, and put that portion on the
+ * appropriate free list.
+ */
+ FreePageBtreeSearch(fpm, victim_page, &result);
+ Assert(result.found);
+ if (victim->npages == npages)
+ FreePageBtreeRemove(fpm, result.page, result.index);
+ else
+ {
+ FreePageBtreeLeafKey *key;
+
+ /* Adjust btree to reflect remaining pages. */
+ Assert(victim->npages > npages);
+ key = &result.page->u.leaf_key[result.index];
+ Assert(key->npages == victim->npages);
+ key->first_page += npages;
+ key->npages -= npages;
+ if (result.index == 0)
+ FreePageBtreeAdjustAncestorKeys(fpm, result.page);
+
+ /* Put the unallocated pages back on the appropriate free list. */
+ FreePagePushSpanLeader(fpm, victim_page + npages,
+ victim->npages - npages);
+ }
+ }
+
+ /* Return results to caller. */
+ *first_page = fpm_pointer_to_page(base, victim);
+ return true;
+}
+
+/*
+ * Put a range of pages into the btree and freelists, consolidating it with
+ * existing free spans just before and/or after it. If 'soft' is true,
+ * only perform the insertion if it can be done without allocating new btree
+ * pages; if false, do it always. Returns 0 if the soft flag caused the
+ * insertion to be skipped, or otherwise the size of the contiguous span
+ * created by the insertion. This may be larger than npages if we're able
+ * to consolidate with an adjacent range.
+ */
+static Size
+FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages,
+ bool soft)
+{
+ char *base = fpm_segment_base(fpm);
+ FreePageBtreeSearchResult result;
+ FreePageBtreeLeafKey *prevkey = NULL;
+ FreePageBtreeLeafKey *nextkey = NULL;
+ FreePageBtree *np;
+ Size nindex;
+
+ Assert(npages > 0);
+
+ /* We can store a single free span without initializing the btree. */
+ if (fpm->btree_depth == 0)
+ {
+ if (fpm->singleton_npages == 0)
+ {
+ /* Don't have a span yet; store this one. */
+ fpm->singleton_first_page = first_page;
+ fpm->singleton_npages = npages;
+ FreePagePushSpanLeader(fpm, first_page, npages);
+ return fpm->singleton_npages;
+ }
+ else if (fpm->singleton_first_page + fpm->singleton_npages ==
+ first_page)
+ {
+ /* New span immediately follows sole existing span. */
+ fpm->singleton_npages += npages;
+ FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
+ FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
+ fpm->singleton_npages);
+ return fpm->singleton_npages;
+ }
+ else if (first_page + npages == fpm->singleton_first_page)
+ {
+ /* New span immediately precedes sole existing span. */
+ FreePagePopSpanLeader(fpm, fpm->singleton_first_page);
+ fpm->singleton_first_page = first_page;
+ fpm->singleton_npages += npages;
+ FreePagePushSpanLeader(fpm, fpm->singleton_first_page,
+ fpm->singleton_npages);
+ return fpm->singleton_npages;
+ }
+ else
+ {
+ /* Not contiguous; we need to initialize the btree. */
+ Size root_page;
+ FreePageBtree *root;
+
+ if (!relptr_is_null(fpm->btree_recycle))
+ root = FreePageBtreeGetRecycled(fpm);
+ else if (soft)
+ return 0; /* Should not allocate if soft. */
+ else if (FreePageManagerGetInternal(fpm, 1, &root_page))
+ root = (FreePageBtree *) fpm_page_to_pointer(base, root_page);
+ else
+ {
+ /* We'd better be able to get a page from the existing range. */
+ elog(FATAL, "free page manager btree is corrupt");
+ }
+
+ /* Create the btree and move the preexisting range into it. */
+ root->hdr.magic = FREE_PAGE_LEAF_MAGIC;
+ root->hdr.nused = 1;
+ relptr_store(base, root->hdr.parent, (FreePageBtree *) NULL);
+ root->u.leaf_key[0].first_page = fpm->singleton_first_page;
+ root->u.leaf_key[0].npages = fpm->singleton_npages;
+ relptr_store(base, fpm->btree_root, root);
+ fpm->singleton_first_page = 0;
+ fpm->singleton_npages = 0;
+ fpm->btree_depth = 1;
+
+ /*
+ * Corner case: it may be that the btree root took the very last
+ * free page. In that case, the sole btree entry covers a zero
+ * page run, which is invalid. Overwrite it with the entry we're
+ * trying to insert and get out.
+ */
+ if (root->u.leaf_key[0].npages == 0)
+ {
+ root->u.leaf_key[0].first_page = first_page;
+ root->u.leaf_key[0].npages = npages;
+ FreePagePushSpanLeader(fpm, first_page, npages);
+ return npages;
+ }
+
+ /* Fall through to insert the new key. */
+ }
+ }
+
+ /* Search the btree. */
+ FreePageBtreeSearch(fpm, first_page, &result);
+ Assert(!result.found);
+ if (result.index > 0)
+ prevkey = &result.page->u.leaf_key[result.index - 1];
+ if (result.index < result.page->hdr.nused)
+ {
+ np = result.page;
+ nindex = result.index;
+ nextkey = &result.page->u.leaf_key[result.index];
+ }
+ else
+ {
+ np = FreePageBtreeFindRightSibling(base, result.page);
+ nindex = 0;
+ if (np != NULL)
+ nextkey = &np->u.leaf_key[0];
+ }
+
+ /* Consolidate with the previous entry if possible. */
+ if (prevkey != NULL && prevkey->first_page + prevkey->npages >= first_page)
+ {
+ bool remove_next = false;
+ Size result;
+
+ Assert(prevkey->first_page + prevkey->npages == first_page);
+ prevkey->npages = (first_page - prevkey->first_page) + npages;
+
+ /* Check whether we can *also* consolidate with the following entry. */
+ if (nextkey != NULL &&
+ prevkey->first_page + prevkey->npages >= nextkey->first_page)
+ {
+ Assert(prevkey->first_page + prevkey->npages ==
+ nextkey->first_page);
+ prevkey->npages = (nextkey->first_page - prevkey->first_page)
+ + nextkey->npages;
+ FreePagePopSpanLeader(fpm, nextkey->first_page);
+ remove_next = true;
+ }
+
+ /* Put the span on the correct freelist and save size. */
+ FreePagePopSpanLeader(fpm, prevkey->first_page);
+ FreePagePushSpanLeader(fpm, prevkey->first_page, prevkey->npages);
+ result = prevkey->npages;
+
+ /*
+ * If we consolidated with both the preceding and following entries,
+ * we must remove the following entry. We do this last, because
+ * removing an element from the btree may invalidate pointers we hold
+ * into the current data structure.
+ *
+ * NB: The btree is technically in an invalid state a this point
+ * because we've already updated prevkey to cover the same key space
+ * as nextkey. FreePageBtreeRemove() shouldn't notice that, though.
+ */
+ if (remove_next)
+ FreePageBtreeRemove(fpm, np, nindex);
+
+ return result;
+ }
+
+ /* Consolidate with the next entry if possible. */
+ if (nextkey != NULL && first_page + npages >= nextkey->first_page)
+ {
+ Size newpages;
+
+ /* Compute new size for span. */
+ Assert(first_page + npages == nextkey->first_page);
+ newpages = (nextkey->first_page - first_page) + nextkey->npages;
+
+ /* Put span on correct free list. */
+ FreePagePopSpanLeader(fpm, nextkey->first_page);
+ FreePagePushSpanLeader(fpm, first_page, newpages);
+
+ /* Update key in place. */
+ nextkey->first_page = first_page;
+ nextkey->npages = newpages;
+
+ /* If reducing first key on page, ancestors might need adjustment. */
+ if (nindex == 0)
+ FreePageBtreeAdjustAncestorKeys(fpm, np);
+
+ return nextkey->npages;
+ }
+
+ /* Split leaf page and as many of its ancestors as necessary. */
+ if (result.split_pages > 0)
+ {
+ /*
+ * NB: We could consider various coping strategies here to avoid a
+ * split; most obviously, if np != result.page, we could target that
+ * page instead. More complicated shuffling strategies could be
+ * possible as well; basically, unless every single leaf page is 100%
+ * full, we can jam this key in there if we try hard enough. It's
+ * unlikely that trying that hard is worthwhile, but it's possible we
+ * might need to make more than no effort. For now, we just do the
+ * easy thing, which is nothing.
+ */
+
+ /* If this is a soft insert, it's time to give up. */
+ if (soft)
+ return 0;
+
+ /* Check whether we need to allocate more btree pages to split. */
+ if (result.split_pages > fpm->btree_recycle_count)
+ {
+ Size pages_needed;
+ Size recycle_page;
+ Size i;
+
+ /*
+ * Allocate the required number of pages and split each one in
+ * turn. This should never fail, because if we've got enough
+ * spans of free pages kicking around that we need additional
+ * storage space just to remember them all, then we should
+ * certainly have enough to expand the btree, which should only
+ * ever use a tiny number of pages compared to the number under
+ * management. If it does, something's badly screwed up.
+ */
+ pages_needed = result.split_pages - fpm->btree_recycle_count;
+ for (i = 0; i < pages_needed; ++i)
+ {
+ if (!FreePageManagerGetInternal(fpm, 1, &recycle_page))
+ elog(FATAL, "free page manager btree is corrupt");
+ FreePageBtreeRecycle(fpm, recycle_page);
+ }
+
+ /*
+ * The act of allocating pages to recycle may have invalidated the
+ * results of our previous btree research, so repeat it. (We could
+ * recheck whether any of our split-avoidance strategies that were
+ * not viable before now are, but it hardly seems worthwhile, so
+ * we don't bother. Consolidation can't be possible now if it
+ * wasn't previously.)
+ */
+ FreePageBtreeSearch(fpm, first_page, &result);
+
+ /*
+ * The act of allocating pages for use in constructing our btree
+ * should never cause any page to become more full, so the new
+ * split depth should be no greater than the old one, and perhaps
+ * less if we fortuitously allocated a chunk that freed up a slot
+ * on the page we need to update.
+ */
+ Assert(result.split_pages <= fpm->btree_recycle_count);
+ }
+
+ /* If we still need to perform a split, do it. */
+ if (result.split_pages > 0)
+ {
+ FreePageBtree *split_target = result.page;
+ FreePageBtree *child = NULL;
+ Size key = first_page;
+
+ for (;;)
+ {
+ FreePageBtree *newsibling;
+ FreePageBtree *parent;
+
+ /* Identify parent page, which must receive downlink. */
+ parent = relptr_access(base, split_target->hdr.parent);
+
+ /* Split the page - downlink not added yet. */
+ newsibling = FreePageBtreeSplitPage(fpm, split_target);
+
+ /*
+ * At this point in the loop, we're always carrying a pending
+ * insertion. On the first pass, it's the actual key we're
+ * trying to insert; on subsequent passes, it's the downlink
+ * that needs to be added as a result of the split performed
+ * during the previous loop iteration. Since we've just split
+ * the page, there's definitely room on one of the two
+ * resulting pages.
+ */
+ if (child == NULL)
+ {
+ Size index;
+ FreePageBtree *insert_into;
+
+ insert_into = key < newsibling->u.leaf_key[0].first_page ?
+ split_target : newsibling;
+ index = FreePageBtreeSearchLeaf(insert_into, key);
+ FreePageBtreeInsertLeaf(insert_into, index, key, npages);
+ if (index == 0 && insert_into == split_target)
+ FreePageBtreeAdjustAncestorKeys(fpm, split_target);
+ }
+ else
+ {
+ Size index;
+ FreePageBtree *insert_into;
+
+ insert_into =
+ key < newsibling->u.internal_key[0].first_page ?
+ split_target : newsibling;
+ index = FreePageBtreeSearchInternal(insert_into, key);
+ FreePageBtreeInsertInternal(base, insert_into, index,
+ key, child);
+ relptr_store(base, child->hdr.parent, insert_into);
+ if (index == 0 && insert_into == split_target)
+ FreePageBtreeAdjustAncestorKeys(fpm, split_target);
+ }
+
+ /* If the page we just split has no parent, split the root. */
+ if (parent == NULL)
+ {
+ FreePageBtree *newroot;
+
+ newroot = FreePageBtreeGetRecycled(fpm);
+ newroot->hdr.magic = FREE_PAGE_INTERNAL_MAGIC;
+ newroot->hdr.nused = 2;
+ relptr_store(base, newroot->hdr.parent,
+ (FreePageBtree *) NULL);
+ newroot->u.internal_key[0].first_page =
+ FreePageBtreeFirstKey(split_target);
+ relptr_store(base, newroot->u.internal_key[0].child,
+ split_target);
+ relptr_store(base, split_target->hdr.parent, newroot);
+ newroot->u.internal_key[1].first_page =
+ FreePageBtreeFirstKey(newsibling);
+ relptr_store(base, newroot->u.internal_key[1].child,
+ newsibling);
+ relptr_store(base, newsibling->hdr.parent, newroot);
+ relptr_store(base, fpm->btree_root, newroot);
+ fpm->btree_depth++;
+
+ break;
+ }
+
+ /* If the parent page isn't full, insert the downlink. */
+ key = newsibling->u.internal_key[0].first_page;
+ if (parent->hdr.nused < FPM_ITEMS_PER_INTERNAL_PAGE)
+ {
+ Size index;
+
+ index = FreePageBtreeSearchInternal(parent, key);
+ FreePageBtreeInsertInternal(base, parent, index,
+ key, newsibling);
+ relptr_store(base, newsibling->hdr.parent, parent);
+ if (index == 0)
+ FreePageBtreeAdjustAncestorKeys(fpm, parent);
+ break;
+ }
+
+ /* The parent also needs to be split, so loop around. */
+ child = newsibling;
+ split_target = parent;
+ }
+
+ /*
+ * The loop above did the insert, so just need to update the free
+ * list, and we're done.
+ */
+ FreePagePushSpanLeader(fpm, first_page, npages);
+
+ return npages;
+ }
+ }
+
+ /* Physically add the key to the page. */
+ Assert(result.page->hdr.nused < FPM_ITEMS_PER_LEAF_PAGE);
+ FreePageBtreeInsertLeaf(result.page, result.index, first_page, npages);
+
+ /* If new first key on page, ancestors might need adjustment. */
+ if (result.index == 0)
+ FreePageBtreeAdjustAncestorKeys(fpm, result.page);
+
+ /* Put it on the free list. */
+ FreePagePushSpanLeader(fpm, first_page, npages);
+
+ return npages;
+}
+
+/*
+ * Remove a FreePageSpanLeader from the linked-list that contains it, either
+ * because we're changing the size of the span, or because we're allocating it.
+ */
+static void
+FreePagePopSpanLeader(FreePageManager *fpm, Size pageno)
+{
+ char *base = fpm_segment_base(fpm);
+ FreePageSpanLeader *span;
+ FreePageSpanLeader *next;
+ FreePageSpanLeader *prev;
+
+ span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno);
+
+ next = relptr_access(base, span->next);
+ prev = relptr_access(base, span->prev);
+ if (next != NULL)
+ relptr_copy(next->prev, span->prev);
+ if (prev != NULL)
+ relptr_copy(prev->next, span->next);
+ else
+ {
+ Size f = Min(span->npages, FPM_NUM_FREELISTS) - 1;
+
+ Assert(relptr_offset(fpm->freelist[f]) == pageno * FPM_PAGE_SIZE);
+ relptr_copy(fpm->freelist[f], span->next);
+ }
+}
+
+/*
+ * Initialize a new FreePageSpanLeader and put it on the appropriate free list.
+ */
+static void
+FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, Size npages)
+{
+ char *base = fpm_segment_base(fpm);
+ Size f = Min(npages, FPM_NUM_FREELISTS) - 1;
+ FreePageSpanLeader *head = relptr_access(base, fpm->freelist[f]);
+ FreePageSpanLeader *span;
+
+ span = (FreePageSpanLeader *) fpm_page_to_pointer(base, first_page);
+ span->magic = FREE_PAGE_SPAN_LEADER_MAGIC;
+ span->npages = npages;
+ relptr_store(base, span->next, head);
+ relptr_store(base, span->prev, (FreePageSpanLeader *) NULL);
+ if (head != NULL)
+ relptr_store(base, head->prev, span);
+ relptr_store(base, fpm->freelist[f], span);
+}
diff --git a/src/backend/utils/mmgr/generation.c b/src/backend/utils/mmgr/generation.c
new file mode 100644
index 0000000..584cd61
--- /dev/null
+++ b/src/backend/utils/mmgr/generation.c
@@ -0,0 +1,832 @@
+/*-------------------------------------------------------------------------
+ *
+ * generation.c
+ * Generational allocator definitions.
+ *
+ * Generation is a custom MemoryContext implementation designed for cases of
+ * chunks with similar lifespan.
+ *
+ * Portions Copyright (c) 2017-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/mmgr/generation.c
+ *
+ *
+ * This memory context is based on the assumption that the chunks are freed
+ * roughly in the same order as they were allocated (FIFO), or in groups with
+ * similar lifespan (generations - hence the name of the context). This is
+ * typical for various queue-like use cases, i.e. when tuples are constructed,
+ * processed and then thrown away.
+ *
+ * The memory context uses a very simple approach to free space management.
+ * Instead of a complex global freelist, each block tracks a number
+ * of allocated and freed chunks. Freed chunks are not reused, and once all
+ * chunks in a block are freed, the whole block is thrown away. When the
+ * chunks allocated in the same block have similar lifespan, this works
+ * very well and is very cheap.
+ *
+ * The current implementation only uses a fixed block size - maybe it should
+ * adapt a min/max block size range, and grow the blocks automatically.
+ * It already uses dedicated blocks for oversized chunks.
+ *
+ * XXX It might be possible to improve this by keeping a small freelist for
+ * only a small number of recent blocks, but it's not clear it's worth the
+ * additional complexity.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "lib/ilist.h"
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+
+
+#define Generation_BLOCKHDRSZ MAXALIGN(sizeof(GenerationBlock))
+#define Generation_CHUNKHDRSZ sizeof(GenerationChunk)
+
+typedef struct GenerationBlock GenerationBlock; /* forward reference */
+typedef struct GenerationChunk GenerationChunk;
+
+typedef void *GenerationPointer;
+
+/*
+ * GenerationContext is a simple memory context not reusing allocated chunks,
+ * and freeing blocks once all chunks are freed.
+ */
+typedef struct GenerationContext
+{
+ MemoryContextData header; /* Standard memory-context fields */
+
+ /* Generational context parameters */
+ Size blockSize; /* standard block size */
+
+ GenerationBlock *block; /* current (most recently allocated) block */
+ dlist_head blocks; /* list of blocks */
+} GenerationContext;
+
+/*
+ * GenerationBlock
+ * GenerationBlock is the unit of memory that is obtained by generation.c
+ * from malloc(). It contains one or more GenerationChunks, which are
+ * the units requested by palloc() and freed by pfree(). GenerationChunks
+ * cannot be returned to malloc() individually, instead pfree()
+ * updates the free counter of the block and when all chunks in a block
+ * are free the whole block is returned to malloc().
+ *
+ * GenerationBlock is the header data for a block --- the usable space
+ * within the block begins at the next alignment boundary.
+ */
+struct GenerationBlock
+{
+ dlist_node node; /* doubly-linked list of blocks */
+ Size blksize; /* allocated size of this block */
+ int nchunks; /* number of chunks in the block */
+ int nfree; /* number of free chunks */
+ char *freeptr; /* start of free space in this block */
+ char *endptr; /* end of space in this block */
+};
+
+/*
+ * GenerationChunk
+ * The prefix of each piece of memory in a GenerationBlock
+ *
+ * Note: to meet the memory context APIs, the payload area of the chunk must
+ * be maxaligned, and the "context" link must be immediately adjacent to the
+ * payload area (cf. GetMemoryChunkContext). We simplify matters for this
+ * module by requiring sizeof(GenerationChunk) to be maxaligned, and then
+ * we can ensure things work by adding any required alignment padding before
+ * the pointer fields. There is a static assertion below that the alignment
+ * is done correctly.
+ */
+struct GenerationChunk
+{
+ /* size is always the size of the usable space in the chunk */
+ Size size;
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* when debugging memory usage, also store actual requested size */
+ /* this is zero in a free chunk */
+ Size requested_size;
+
+#define GENERATIONCHUNK_RAWSIZE (SIZEOF_SIZE_T * 2 + SIZEOF_VOID_P * 2)
+#else
+#define GENERATIONCHUNK_RAWSIZE (SIZEOF_SIZE_T + SIZEOF_VOID_P * 2)
+#endif /* MEMORY_CONTEXT_CHECKING */
+
+ /* ensure proper alignment by adding padding if needed */
+#if (GENERATIONCHUNK_RAWSIZE % MAXIMUM_ALIGNOF) != 0
+ char padding[MAXIMUM_ALIGNOF - GENERATIONCHUNK_RAWSIZE % MAXIMUM_ALIGNOF];
+#endif
+
+ GenerationBlock *block; /* block owning this chunk */
+ GenerationContext *context; /* owning context, or NULL if freed chunk */
+ /* there must not be any padding to reach a MAXALIGN boundary here! */
+};
+
+/*
+ * Only the "context" field should be accessed outside this module.
+ * We keep the rest of an allocated chunk's header marked NOACCESS when using
+ * valgrind. But note that freed chunk headers are kept accessible, for
+ * simplicity.
+ */
+#define GENERATIONCHUNK_PRIVATE_LEN offsetof(GenerationChunk, context)
+
+/*
+ * GenerationIsValid
+ * True iff set is valid allocation set.
+ */
+#define GenerationIsValid(set) PointerIsValid(set)
+
+#define GenerationPointerGetChunk(ptr) \
+ ((GenerationChunk *)(((char *)(ptr)) - Generation_CHUNKHDRSZ))
+#define GenerationChunkGetPointer(chk) \
+ ((GenerationPointer *)(((char *)(chk)) + Generation_CHUNKHDRSZ))
+
+/*
+ * These functions implement the MemoryContext API for Generation contexts.
+ */
+static void *GenerationAlloc(MemoryContext context, Size size);
+static void GenerationFree(MemoryContext context, void *pointer);
+static void *GenerationRealloc(MemoryContext context, void *pointer, Size size);
+static void GenerationReset(MemoryContext context);
+static void GenerationDelete(MemoryContext context);
+static Size GenerationGetChunkSpace(MemoryContext context, void *pointer);
+static bool GenerationIsEmpty(MemoryContext context);
+static void GenerationStats(MemoryContext context,
+ MemoryStatsPrintFunc printfunc, void *passthru,
+ MemoryContextCounters *totals,
+ bool print_to_stderr);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+static void GenerationCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for Generation contexts.
+ */
+static const MemoryContextMethods GenerationMethods = {
+ GenerationAlloc,
+ GenerationFree,
+ GenerationRealloc,
+ GenerationReset,
+ GenerationDelete,
+ GenerationGetChunkSpace,
+ GenerationIsEmpty,
+ GenerationStats
+#ifdef MEMORY_CONTEXT_CHECKING
+ ,GenerationCheck
+#endif
+};
+
+
+/*
+ * Public routines
+ */
+
+
+/*
+ * GenerationContextCreate
+ * Create a new Generation context.
+ *
+ * parent: parent context, or NULL if top-level context
+ * name: name of context (must be statically allocated)
+ * blockSize: generation block size
+ */
+MemoryContext
+GenerationContextCreate(MemoryContext parent,
+ const char *name,
+ Size blockSize)
+{
+ GenerationContext *set;
+
+ /* Assert we padded GenerationChunk properly */
+ StaticAssertStmt(Generation_CHUNKHDRSZ == MAXALIGN(Generation_CHUNKHDRSZ),
+ "sizeof(GenerationChunk) is not maxaligned");
+ StaticAssertStmt(offsetof(GenerationChunk, context) + sizeof(MemoryContext) ==
+ Generation_CHUNKHDRSZ,
+ "padding calculation in GenerationChunk is wrong");
+
+ /*
+ * First, validate allocation parameters. (If we're going to throw an
+ * error, we should do so before the context is created, not after.) We
+ * somewhat arbitrarily enforce a minimum 1K block size, mostly because
+ * that's what AllocSet does.
+ */
+ if (blockSize != MAXALIGN(blockSize) ||
+ blockSize < 1024 ||
+ !AllocHugeSizeIsValid(blockSize))
+ elog(ERROR, "invalid blockSize for memory context: %zu",
+ blockSize);
+
+ /*
+ * Allocate the context header. Unlike aset.c, we never try to combine
+ * this with the first regular block, since that would prevent us from
+ * freeing the first generation of allocations.
+ */
+
+ set = (GenerationContext *) malloc(MAXALIGN(sizeof(GenerationContext)));
+ if (set == NULL)
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed while creating memory context \"%s\".",
+ name)));
+ }
+
+ /*
+ * Avoid writing code that can fail between here and MemoryContextCreate;
+ * we'd leak the header if we ereport in this stretch.
+ */
+
+ /* Fill in GenerationContext-specific header fields */
+ set->blockSize = blockSize;
+ set->block = NULL;
+ dlist_init(&set->blocks);
+
+ /* Finally, do the type-independent part of context creation */
+ MemoryContextCreate((MemoryContext) set,
+ T_GenerationContext,
+ &GenerationMethods,
+ parent,
+ name);
+
+ return (MemoryContext) set;
+}
+
+/*
+ * GenerationReset
+ * Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+static void
+GenerationReset(MemoryContext context)
+{
+ GenerationContext *set = (GenerationContext *) context;
+ dlist_mutable_iter miter;
+
+ AssertArg(GenerationIsValid(set));
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Check for corruption and leaks before freeing */
+ GenerationCheck(context);
+#endif
+
+ dlist_foreach_modify(miter, &set->blocks)
+ {
+ GenerationBlock *block = dlist_container(GenerationBlock, node, miter.cur);
+
+ dlist_delete(miter.cur);
+
+ context->mem_allocated -= block->blksize;
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(block, block->blksize);
+#endif
+
+ free(block);
+ }
+
+ set->block = NULL;
+
+ Assert(dlist_is_empty(&set->blocks));
+}
+
+/*
+ * GenerationDelete
+ * Free all memory which is allocated in the given context.
+ */
+static void
+GenerationDelete(MemoryContext context)
+{
+ /* Reset to release all the GenerationBlocks */
+ GenerationReset(context);
+ /* And free the context header */
+ free(context);
+}
+
+/*
+ * GenerationAlloc
+ * Returns pointer to allocated memory of given size or NULL if
+ * request could not be completed; memory is added to the set.
+ *
+ * No request may exceed:
+ * MAXALIGN_DOWN(SIZE_MAX) - Generation_BLOCKHDRSZ - Generation_CHUNKHDRSZ
+ * All callers use a much-lower limit.
+ *
+ * Note: when using valgrind, it doesn't matter how the returned allocation
+ * is marked, as mcxt.c will set it to UNDEFINED. In some paths we will
+ * return space that is marked NOACCESS - GenerationRealloc has to beware!
+ */
+static void *
+GenerationAlloc(MemoryContext context, Size size)
+{
+ GenerationContext *set = (GenerationContext *) context;
+ GenerationBlock *block;
+ GenerationChunk *chunk;
+ Size chunk_size = MAXALIGN(size);
+
+ /* is it an over-sized chunk? if yes, allocate special block */
+ if (chunk_size > set->blockSize / 8)
+ {
+ Size blksize = chunk_size + Generation_BLOCKHDRSZ + Generation_CHUNKHDRSZ;
+
+ block = (GenerationBlock *) malloc(blksize);
+ if (block == NULL)
+ return NULL;
+
+ context->mem_allocated += blksize;
+
+ /* block with a single (used) chunk */
+ block->blksize = blksize;
+ block->nchunks = 1;
+ block->nfree = 0;
+
+ /* the block is completely full */
+ block->freeptr = block->endptr = ((char *) block) + blksize;
+
+ chunk = (GenerationChunk *) (((char *) block) + Generation_BLOCKHDRSZ);
+ chunk->block = block;
+ chunk->context = set;
+ chunk->size = chunk_size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ chunk->requested_size = size;
+ /* set mark to catch clobber of "unused" space */
+ if (size < chunk_size)
+ set_sentinel(GenerationChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* fill the allocated space with junk */
+ randomize_mem((char *) GenerationChunkGetPointer(chunk), size);
+#endif
+
+ /* add the block to the list of allocated blocks */
+ dlist_push_head(&set->blocks, &block->node);
+
+ /* Ensure any padding bytes are marked NOACCESS. */
+ VALGRIND_MAKE_MEM_NOACCESS((char *) GenerationChunkGetPointer(chunk) + size,
+ chunk_size - size);
+
+ /* Disallow external access to private part of chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, GENERATIONCHUNK_PRIVATE_LEN);
+
+ return GenerationChunkGetPointer(chunk);
+ }
+
+ /*
+ * Not an over-sized chunk. Is there enough space in the current block? If
+ * not, allocate a new "regular" block.
+ */
+ block = set->block;
+
+ if ((block == NULL) ||
+ (block->endptr - block->freeptr) < Generation_CHUNKHDRSZ + chunk_size)
+ {
+ Size blksize = set->blockSize;
+
+ block = (GenerationBlock *) malloc(blksize);
+
+ if (block == NULL)
+ return NULL;
+
+ context->mem_allocated += blksize;
+
+ block->blksize = blksize;
+ block->nchunks = 0;
+ block->nfree = 0;
+
+ block->freeptr = ((char *) block) + Generation_BLOCKHDRSZ;
+ block->endptr = ((char *) block) + blksize;
+
+ /* Mark unallocated space NOACCESS. */
+ VALGRIND_MAKE_MEM_NOACCESS(block->freeptr,
+ blksize - Generation_BLOCKHDRSZ);
+
+ /* add it to the doubly-linked list of blocks */
+ dlist_push_head(&set->blocks, &block->node);
+
+ /* and also use it as the current allocation block */
+ set->block = block;
+ }
+
+ /* we're supposed to have a block with enough free space now */
+ Assert(block != NULL);
+ Assert((block->endptr - block->freeptr) >= Generation_CHUNKHDRSZ + chunk_size);
+
+ chunk = (GenerationChunk *) block->freeptr;
+
+ /* Prepare to initialize the chunk header. */
+ VALGRIND_MAKE_MEM_UNDEFINED(chunk, Generation_CHUNKHDRSZ);
+
+ block->nchunks += 1;
+ block->freeptr += (Generation_CHUNKHDRSZ + chunk_size);
+
+ Assert(block->freeptr <= block->endptr);
+
+ chunk->block = block;
+ chunk->context = set;
+ chunk->size = chunk_size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ chunk->requested_size = size;
+ /* set mark to catch clobber of "unused" space */
+ if (size < chunk->size)
+ set_sentinel(GenerationChunkGetPointer(chunk), size);
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* fill the allocated space with junk */
+ randomize_mem((char *) GenerationChunkGetPointer(chunk), size);
+#endif
+
+ /* Ensure any padding bytes are marked NOACCESS. */
+ VALGRIND_MAKE_MEM_NOACCESS((char *) GenerationChunkGetPointer(chunk) + size,
+ chunk_size - size);
+
+ /* Disallow external access to private part of chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, GENERATIONCHUNK_PRIVATE_LEN);
+
+ return GenerationChunkGetPointer(chunk);
+}
+
+/*
+ * GenerationFree
+ * Update number of chunks in the block, and if all chunks in the block
+ * are now free then discard the block.
+ */
+static void
+GenerationFree(MemoryContext context, void *pointer)
+{
+ GenerationContext *set = (GenerationContext *) context;
+ GenerationChunk *chunk = GenerationPointerGetChunk(pointer);
+ GenerationBlock *block;
+
+ /* Allow access to private part of chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, GENERATIONCHUNK_PRIVATE_LEN);
+
+ block = chunk->block;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Test for someone scribbling on unused space in chunk */
+ if (chunk->requested_size < chunk->size)
+ if (!sentinel_ok(pointer, chunk->requested_size))
+ elog(WARNING, "detected write past chunk end in %s %p",
+ ((MemoryContext) set)->name, chunk);
+#endif
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(pointer, chunk->size);
+#endif
+
+ /* Reset context to NULL in freed chunks */
+ chunk->context = NULL;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Reset requested_size to 0 in freed chunks */
+ chunk->requested_size = 0;
+#endif
+
+ block->nfree += 1;
+
+ Assert(block->nchunks > 0);
+ Assert(block->nfree <= block->nchunks);
+
+ /* If there are still allocated chunks in the block, we're done. */
+ if (block->nfree < block->nchunks)
+ return;
+
+ /*
+ * The block is empty, so let's get rid of it. First remove it from the
+ * list of blocks, then return it to malloc().
+ */
+ dlist_delete(&block->node);
+
+ /* Also make sure the block is not marked as the current block. */
+ if (set->block == block)
+ set->block = NULL;
+
+ context->mem_allocated -= block->blksize;
+ free(block);
+}
+
+/*
+ * GenerationRealloc
+ * When handling repalloc, we simply allocate a new chunk, copy the data
+ * and discard the old one. The only exception is when the new size fits
+ * into the old chunk - in that case we just update chunk header.
+ */
+static void *
+GenerationRealloc(MemoryContext context, void *pointer, Size size)
+{
+ GenerationContext *set = (GenerationContext *) context;
+ GenerationChunk *chunk = GenerationPointerGetChunk(pointer);
+ GenerationPointer newPointer;
+ Size oldsize;
+
+ /* Allow access to private part of chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, GENERATIONCHUNK_PRIVATE_LEN);
+
+ oldsize = chunk->size;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Test for someone scribbling on unused space in chunk */
+ if (chunk->requested_size < oldsize)
+ if (!sentinel_ok(pointer, chunk->requested_size))
+ elog(WARNING, "detected write past chunk end in %s %p",
+ ((MemoryContext) set)->name, chunk);
+#endif
+
+ /*
+ * Maybe the allocated area already is >= the new size. (In particular,
+ * we always fall out here if the requested size is a decrease.)
+ *
+ * This memory context does not use power-of-2 chunk sizing and instead
+ * carves the chunks to be as small as possible, so most repalloc() calls
+ * will end up in the palloc/memcpy/pfree branch.
+ *
+ * XXX Perhaps we should annotate this condition with unlikely()?
+ */
+ if (oldsize >= size)
+ {
+#ifdef MEMORY_CONTEXT_CHECKING
+ Size oldrequest = chunk->requested_size;
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* We can only fill the extra space if we know the prior request */
+ if (size > oldrequest)
+ randomize_mem((char *) pointer + oldrequest,
+ size - oldrequest);
+#endif
+
+ chunk->requested_size = size;
+
+ /*
+ * If this is an increase, mark any newly-available part UNDEFINED.
+ * Otherwise, mark the obsolete part NOACCESS.
+ */
+ if (size > oldrequest)
+ VALGRIND_MAKE_MEM_UNDEFINED((char *) pointer + oldrequest,
+ size - oldrequest);
+ else
+ VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size,
+ oldsize - size);
+
+ /* set mark to catch clobber of "unused" space */
+ if (size < oldsize)
+ set_sentinel(pointer, size);
+#else /* !MEMORY_CONTEXT_CHECKING */
+
+ /*
+ * We don't have the information to determine whether we're growing
+ * the old request or shrinking it, so we conservatively mark the
+ * entire new allocation DEFINED.
+ */
+ VALGRIND_MAKE_MEM_NOACCESS(pointer, oldsize);
+ VALGRIND_MAKE_MEM_DEFINED(pointer, size);
+#endif
+
+ /* Disallow external access to private part of chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, GENERATIONCHUNK_PRIVATE_LEN);
+
+ return pointer;
+ }
+
+ /* allocate new chunk */
+ newPointer = GenerationAlloc((MemoryContext) set, size);
+
+ /* leave immediately if request was not completed */
+ if (newPointer == NULL)
+ {
+ /* Disallow external access to private part of chunk header. */
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, GENERATIONCHUNK_PRIVATE_LEN);
+ return NULL;
+ }
+
+ /*
+ * GenerationAlloc() may have returned a region that is still NOACCESS.
+ * Change it to UNDEFINED for the moment; memcpy() will then transfer
+ * definedness from the old allocation to the new. If we know the old
+ * allocation, copy just that much. Otherwise, make the entire old chunk
+ * defined to avoid errors as we copy the currently-NOACCESS trailing
+ * bytes.
+ */
+ VALGRIND_MAKE_MEM_UNDEFINED(newPointer, size);
+#ifdef MEMORY_CONTEXT_CHECKING
+ oldsize = chunk->requested_size;
+#else
+ VALGRIND_MAKE_MEM_DEFINED(pointer, oldsize);
+#endif
+
+ /* transfer existing data (certain to fit) */
+ memcpy(newPointer, pointer, oldsize);
+
+ /* free old chunk */
+ GenerationFree((MemoryContext) set, pointer);
+
+ return newPointer;
+}
+
+/*
+ * GenerationGetChunkSpace
+ * Given a currently-allocated chunk, determine the total space
+ * it occupies (including all memory-allocation overhead).
+ */
+static Size
+GenerationGetChunkSpace(MemoryContext context, void *pointer)
+{
+ GenerationChunk *chunk = GenerationPointerGetChunk(pointer);
+ Size result;
+
+ VALGRIND_MAKE_MEM_DEFINED(chunk, GENERATIONCHUNK_PRIVATE_LEN);
+ result = chunk->size + Generation_CHUNKHDRSZ;
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, GENERATIONCHUNK_PRIVATE_LEN);
+ return result;
+}
+
+/*
+ * GenerationIsEmpty
+ * Is a GenerationContext empty of any allocated space?
+ */
+static bool
+GenerationIsEmpty(MemoryContext context)
+{
+ GenerationContext *set = (GenerationContext *) context;
+
+ return dlist_is_empty(&set->blocks);
+}
+
+/*
+ * GenerationStats
+ * Compute stats about memory consumption of a Generation context.
+ *
+ * printfunc: if not NULL, pass a human-readable stats string to this.
+ * passthru: pass this pointer through to printfunc.
+ * totals: if not NULL, add stats about this context into *totals.
+ * print_to_stderr: print stats to stderr if true, elog otherwise.
+ *
+ * XXX freespace only accounts for empty space at the end of the block, not
+ * space of freed chunks (which is unknown).
+ */
+static void
+GenerationStats(MemoryContext context,
+ MemoryStatsPrintFunc printfunc, void *passthru,
+ MemoryContextCounters *totals, bool print_to_stderr)
+{
+ GenerationContext *set = (GenerationContext *) context;
+ Size nblocks = 0;
+ Size nchunks = 0;
+ Size nfreechunks = 0;
+ Size totalspace;
+ Size freespace = 0;
+ dlist_iter iter;
+
+ /* Include context header in totalspace */
+ totalspace = MAXALIGN(sizeof(GenerationContext));
+
+ dlist_foreach(iter, &set->blocks)
+ {
+ GenerationBlock *block = dlist_container(GenerationBlock, node, iter.cur);
+
+ nblocks++;
+ nchunks += block->nchunks;
+ nfreechunks += block->nfree;
+ totalspace += block->blksize;
+ freespace += (block->endptr - block->freeptr);
+ }
+
+ if (printfunc)
+ {
+ char stats_string[200];
+
+ snprintf(stats_string, sizeof(stats_string),
+ "%zu total in %zd blocks (%zd chunks); %zu free (%zd chunks); %zu used",
+ totalspace, nblocks, nchunks, freespace,
+ nfreechunks, totalspace - freespace);
+ printfunc(context, passthru, stats_string, print_to_stderr);
+ }
+
+ if (totals)
+ {
+ totals->nblocks += nblocks;
+ totals->freechunks += nfreechunks;
+ totals->totalspace += totalspace;
+ totals->freespace += freespace;
+ }
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * GenerationCheck
+ * Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL. Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+GenerationCheck(MemoryContext context)
+{
+ GenerationContext *gen = (GenerationContext *) context;
+ const char *name = context->name;
+ dlist_iter iter;
+ Size total_allocated = 0;
+
+ /* walk all blocks in this context */
+ dlist_foreach(iter, &gen->blocks)
+ {
+ GenerationBlock *block = dlist_container(GenerationBlock, node, iter.cur);
+ int nfree,
+ nchunks;
+ char *ptr;
+
+ total_allocated += block->blksize;
+
+ /*
+ * nfree > nchunks is surely wrong, and we don't expect to see
+ * equality either, because such a block should have gotten freed.
+ */
+ if (block->nfree >= block->nchunks)
+ elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p exceeds %d allocated",
+ name, block->nfree, block, block->nchunks);
+
+ /* Now walk through the chunks and count them. */
+ nfree = 0;
+ nchunks = 0;
+ ptr = ((char *) block) + Generation_BLOCKHDRSZ;
+
+ while (ptr < block->freeptr)
+ {
+ GenerationChunk *chunk = (GenerationChunk *) ptr;
+
+ /* Allow access to private part of chunk header. */
+ VALGRIND_MAKE_MEM_DEFINED(chunk, GENERATIONCHUNK_PRIVATE_LEN);
+
+ /* move to the next chunk */
+ ptr += (chunk->size + Generation_CHUNKHDRSZ);
+
+ nchunks += 1;
+
+ /* chunks have both block and context pointers, so check both */
+ if (chunk->block != block)
+ elog(WARNING, "problem in Generation %s: bogus block link in block %p, chunk %p",
+ name, block, chunk);
+
+ /*
+ * Check for valid context pointer. Note this is an incomplete
+ * test, since palloc(0) produces an allocated chunk with
+ * requested_size == 0.
+ */
+ if ((chunk->requested_size > 0 && chunk->context != gen) ||
+ (chunk->context != gen && chunk->context != NULL))
+ elog(WARNING, "problem in Generation %s: bogus context link in block %p, chunk %p",
+ name, block, chunk);
+
+ /* now make sure the chunk size is correct */
+ if (chunk->size < chunk->requested_size ||
+ chunk->size != MAXALIGN(chunk->size))
+ elog(WARNING, "problem in Generation %s: bogus chunk size in block %p, chunk %p",
+ name, block, chunk);
+
+ /* is chunk allocated? */
+ if (chunk->context != NULL)
+ {
+ /* check sentinel, but only in allocated blocks */
+ if (chunk->requested_size < chunk->size &&
+ !sentinel_ok(chunk, Generation_CHUNKHDRSZ + chunk->requested_size))
+ elog(WARNING, "problem in Generation %s: detected write past chunk end in block %p, chunk %p",
+ name, block, chunk);
+ }
+ else
+ nfree += 1;
+
+ /*
+ * If chunk is allocated, disallow external access to private part
+ * of chunk header.
+ */
+ if (chunk->context != NULL)
+ VALGRIND_MAKE_MEM_NOACCESS(chunk, GENERATIONCHUNK_PRIVATE_LEN);
+ }
+
+ /*
+ * Make sure we got the expected number of allocated and free chunks
+ * (as tracked in the block header).
+ */
+ if (nchunks != block->nchunks)
+ elog(WARNING, "problem in Generation %s: number of allocated chunks %d in block %p does not match header %d",
+ name, nchunks, block, block->nchunks);
+
+ if (nfree != block->nfree)
+ elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p does not match header %d",
+ name, nfree, block, block->nfree);
+ }
+
+ Assert(total_allocated == context->mem_allocated);
+}
+
+#endif /* MEMORY_CONTEXT_CHECKING */
diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c
new file mode 100644
index 0000000..a5f31e2
--- /dev/null
+++ b/src/backend/utils/mmgr/mcxt.c
@@ -0,0 +1,1341 @@
+/*-------------------------------------------------------------------------
+ *
+ * mcxt.c
+ * POSTGRES memory context management code.
+ *
+ * This module handles context management operations that are independent
+ * of the particular kind of context being operated on. It calls
+ * context-type-specific operations via the function pointers in a
+ * context's MemoryContextMethods struct.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/mmgr/mcxt.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "mb/pg_wchar.h"
+#include "miscadmin.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/procsignal.h"
+#include "utils/fmgrprotos.h"
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+
+
+/*****************************************************************************
+ * GLOBAL MEMORY *
+ *****************************************************************************/
+
+/*
+ * CurrentMemoryContext
+ * Default memory context for allocations.
+ */
+MemoryContext CurrentMemoryContext = NULL;
+
+/*
+ * Standard top-level contexts. For a description of the purpose of each
+ * of these contexts, refer to src/backend/utils/mmgr/README
+ */
+MemoryContext TopMemoryContext = NULL;
+MemoryContext ErrorContext = NULL;
+MemoryContext PostmasterContext = NULL;
+MemoryContext CacheMemoryContext = NULL;
+MemoryContext MessageContext = NULL;
+MemoryContext TopTransactionContext = NULL;
+MemoryContext CurTransactionContext = NULL;
+
+/* This is a transient link to the active portal's memory context: */
+MemoryContext PortalContext = NULL;
+
+static void MemoryContextCallResetCallbacks(MemoryContext context);
+static void MemoryContextStatsInternal(MemoryContext context, int level,
+ bool print, int max_children,
+ MemoryContextCounters *totals,
+ bool print_to_stderr);
+static void MemoryContextStatsPrint(MemoryContext context, void *passthru,
+ const char *stats_string,
+ bool print_to_stderr);
+
+/*
+ * You should not do memory allocations within a critical section, because
+ * an out-of-memory error will be escalated to a PANIC. To enforce that
+ * rule, the allocation functions Assert that.
+ */
+#define AssertNotInCriticalSection(context) \
+ Assert(CritSectionCount == 0 || (context)->allowInCritSection)
+
+
+/*****************************************************************************
+ * EXPORTED ROUTINES *
+ *****************************************************************************/
+
+
+/*
+ * MemoryContextInit
+ * Start up the memory-context subsystem.
+ *
+ * This must be called before creating contexts or allocating memory in
+ * contexts. TopMemoryContext and ErrorContext are initialized here;
+ * other contexts must be created afterwards.
+ *
+ * In normal multi-backend operation, this is called once during
+ * postmaster startup, and not at all by individual backend startup
+ * (since the backends inherit an already-initialized context subsystem
+ * by virtue of being forked off the postmaster). But in an EXEC_BACKEND
+ * build, each process must do this for itself.
+ *
+ * In a standalone backend this must be called during backend startup.
+ */
+void
+MemoryContextInit(void)
+{
+ AssertState(TopMemoryContext == NULL);
+
+ /*
+ * First, initialize TopMemoryContext, which is the parent of all others.
+ */
+ TopMemoryContext = AllocSetContextCreate((MemoryContext) NULL,
+ "TopMemoryContext",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * Not having any other place to point CurrentMemoryContext, make it point
+ * to TopMemoryContext. Caller should change this soon!
+ */
+ CurrentMemoryContext = TopMemoryContext;
+
+ /*
+ * Initialize ErrorContext as an AllocSetContext with slow growth rate ---
+ * we don't really expect much to be allocated in it. More to the point,
+ * require it to contain at least 8K at all times. This is the only case
+ * where retained memory in a context is *essential* --- we want to be
+ * sure ErrorContext still has some memory even if we've run out
+ * elsewhere! Also, allow allocations in ErrorContext within a critical
+ * section. Otherwise a PANIC will cause an assertion failure in the error
+ * reporting code, before printing out the real cause of the failure.
+ *
+ * This should be the last step in this function, as elog.c assumes memory
+ * management works once ErrorContext is non-null.
+ */
+ ErrorContext = AllocSetContextCreate(TopMemoryContext,
+ "ErrorContext",
+ 8 * 1024,
+ 8 * 1024,
+ 8 * 1024);
+ MemoryContextAllowInCriticalSection(ErrorContext, true);
+}
+
+/*
+ * MemoryContextReset
+ * Release all space allocated within a context and delete all its
+ * descendant contexts (but not the named context itself).
+ */
+void
+MemoryContextReset(MemoryContext context)
+{
+ AssertArg(MemoryContextIsValid(context));
+
+ /* save a function call in common case where there are no children */
+ if (context->firstchild != NULL)
+ MemoryContextDeleteChildren(context);
+
+ /* save a function call if no pallocs since startup or last reset */
+ if (!context->isReset)
+ MemoryContextResetOnly(context);
+}
+
+/*
+ * MemoryContextResetOnly
+ * Release all space allocated within a context.
+ * Nothing is done to the context's descendant contexts.
+ */
+void
+MemoryContextResetOnly(MemoryContext context)
+{
+ AssertArg(MemoryContextIsValid(context));
+
+ /* Nothing to do if no pallocs since startup or last reset */
+ if (!context->isReset)
+ {
+ MemoryContextCallResetCallbacks(context);
+
+ /*
+ * If context->ident points into the context's memory, it will become
+ * a dangling pointer. We could prevent that by setting it to NULL
+ * here, but that would break valid coding patterns that keep the
+ * ident elsewhere, e.g. in a parent context. Another idea is to use
+ * MemoryContextContains(), but we don't require ident strings to be
+ * in separately-palloc'd chunks, so that risks false positives. So
+ * for now we assume the programmer got it right.
+ */
+
+ context->methods->reset(context);
+ context->isReset = true;
+ VALGRIND_DESTROY_MEMPOOL(context);
+ VALGRIND_CREATE_MEMPOOL(context, 0, false);
+ }
+}
+
+/*
+ * MemoryContextResetChildren
+ * Release all space allocated within a context's descendants,
+ * but don't delete the contexts themselves. The named context
+ * itself is not touched.
+ */
+void
+MemoryContextResetChildren(MemoryContext context)
+{
+ MemoryContext child;
+
+ AssertArg(MemoryContextIsValid(context));
+
+ for (child = context->firstchild; child != NULL; child = child->nextchild)
+ {
+ MemoryContextResetChildren(child);
+ MemoryContextResetOnly(child);
+ }
+}
+
+/*
+ * MemoryContextDelete
+ * Delete a context and its descendants, and release all space
+ * allocated therein.
+ *
+ * The type-specific delete routine removes all storage for the context,
+ * but we have to recurse to handle the children.
+ * We must also delink the context from its parent, if it has one.
+ */
+void
+MemoryContextDelete(MemoryContext context)
+{
+ AssertArg(MemoryContextIsValid(context));
+ /* We had better not be deleting TopMemoryContext ... */
+ Assert(context != TopMemoryContext);
+ /* And not CurrentMemoryContext, either */
+ Assert(context != CurrentMemoryContext);
+
+ /* save a function call in common case where there are no children */
+ if (context->firstchild != NULL)
+ MemoryContextDeleteChildren(context);
+
+ /*
+ * It's not entirely clear whether 'tis better to do this before or after
+ * delinking the context; but an error in a callback will likely result in
+ * leaking the whole context (if it's not a root context) if we do it
+ * after, so let's do it before.
+ */
+ MemoryContextCallResetCallbacks(context);
+
+ /*
+ * We delink the context from its parent before deleting it, so that if
+ * there's an error we won't have deleted/busted contexts still attached
+ * to the context tree. Better a leak than a crash.
+ */
+ MemoryContextSetParent(context, NULL);
+
+ /*
+ * Also reset the context's ident pointer, in case it points into the
+ * context. This would only matter if someone tries to get stats on the
+ * (already unlinked) context, which is unlikely, but let's be safe.
+ */
+ context->ident = NULL;
+
+ context->methods->delete_context(context);
+
+ VALGRIND_DESTROY_MEMPOOL(context);
+}
+
+/*
+ * MemoryContextDeleteChildren
+ * Delete all the descendants of the named context and release all
+ * space allocated therein. The named context itself is not touched.
+ */
+void
+MemoryContextDeleteChildren(MemoryContext context)
+{
+ AssertArg(MemoryContextIsValid(context));
+
+ /*
+ * MemoryContextDelete will delink the child from me, so just iterate as
+ * long as there is a child.
+ */
+ while (context->firstchild != NULL)
+ MemoryContextDelete(context->firstchild);
+}
+
+/*
+ * MemoryContextRegisterResetCallback
+ * Register a function to be called before next context reset/delete.
+ * Such callbacks will be called in reverse order of registration.
+ *
+ * The caller is responsible for allocating a MemoryContextCallback struct
+ * to hold the info about this callback request, and for filling in the
+ * "func" and "arg" fields in the struct to show what function to call with
+ * what argument. Typically the callback struct should be allocated within
+ * the specified context, since that means it will automatically be freed
+ * when no longer needed.
+ *
+ * There is no API for deregistering a callback once registered. If you
+ * want it to not do anything anymore, adjust the state pointed to by its
+ * "arg" to indicate that.
+ */
+void
+MemoryContextRegisterResetCallback(MemoryContext context,
+ MemoryContextCallback *cb)
+{
+ AssertArg(MemoryContextIsValid(context));
+
+ /* Push onto head so this will be called before older registrants. */
+ cb->next = context->reset_cbs;
+ context->reset_cbs = cb;
+ /* Mark the context as non-reset (it probably is already). */
+ context->isReset = false;
+}
+
+/*
+ * MemoryContextCallResetCallbacks
+ * Internal function to call all registered callbacks for context.
+ */
+static void
+MemoryContextCallResetCallbacks(MemoryContext context)
+{
+ MemoryContextCallback *cb;
+
+ /*
+ * We pop each callback from the list before calling. That way, if an
+ * error occurs inside the callback, we won't try to call it a second time
+ * in the likely event that we reset or delete the context later.
+ */
+ while ((cb = context->reset_cbs) != NULL)
+ {
+ context->reset_cbs = cb->next;
+ cb->func(cb->arg);
+ }
+}
+
+/*
+ * MemoryContextSetIdentifier
+ * Set the identifier string for a memory context.
+ *
+ * An identifier can be provided to help distinguish among different contexts
+ * of the same kind in memory context stats dumps. The identifier string
+ * must live at least as long as the context it is for; typically it is
+ * allocated inside that context, so that it automatically goes away on
+ * context deletion. Pass id = NULL to forget any old identifier.
+ */
+void
+MemoryContextSetIdentifier(MemoryContext context, const char *id)
+{
+ AssertArg(MemoryContextIsValid(context));
+ context->ident = id;
+}
+
+/*
+ * MemoryContextSetParent
+ * Change a context to belong to a new parent (or no parent).
+ *
+ * We provide this as an API function because it is sometimes useful to
+ * change a context's lifespan after creation. For example, a context
+ * might be created underneath a transient context, filled with data,
+ * and then reparented underneath CacheMemoryContext to make it long-lived.
+ * In this way no special effort is needed to get rid of the context in case
+ * a failure occurs before its contents are completely set up.
+ *
+ * Callers often assume that this function cannot fail, so don't put any
+ * elog(ERROR) calls in it.
+ *
+ * A possible caller error is to reparent a context under itself, creating
+ * a loop in the context graph. We assert here that context != new_parent,
+ * but checking for multi-level loops seems more trouble than it's worth.
+ */
+void
+MemoryContextSetParent(MemoryContext context, MemoryContext new_parent)
+{
+ AssertArg(MemoryContextIsValid(context));
+ AssertArg(context != new_parent);
+
+ /* Fast path if it's got correct parent already */
+ if (new_parent == context->parent)
+ return;
+
+ /* Delink from existing parent, if any */
+ if (context->parent)
+ {
+ MemoryContext parent = context->parent;
+
+ if (context->prevchild != NULL)
+ context->prevchild->nextchild = context->nextchild;
+ else
+ {
+ Assert(parent->firstchild == context);
+ parent->firstchild = context->nextchild;
+ }
+
+ if (context->nextchild != NULL)
+ context->nextchild->prevchild = context->prevchild;
+ }
+
+ /* And relink */
+ if (new_parent)
+ {
+ AssertArg(MemoryContextIsValid(new_parent));
+ context->parent = new_parent;
+ context->prevchild = NULL;
+ context->nextchild = new_parent->firstchild;
+ if (new_parent->firstchild != NULL)
+ new_parent->firstchild->prevchild = context;
+ new_parent->firstchild = context;
+ }
+ else
+ {
+ context->parent = NULL;
+ context->prevchild = NULL;
+ context->nextchild = NULL;
+ }
+}
+
+/*
+ * MemoryContextAllowInCriticalSection
+ * Allow/disallow allocations in this memory context within a critical
+ * section.
+ *
+ * Normally, memory allocations are not allowed within a critical section,
+ * because a failure would lead to PANIC. There are a few exceptions to
+ * that, like allocations related to debugging code that is not supposed to
+ * be enabled in production. This function can be used to exempt specific
+ * memory contexts from the assertion in palloc().
+ */
+void
+MemoryContextAllowInCriticalSection(MemoryContext context, bool allow)
+{
+ AssertArg(MemoryContextIsValid(context));
+
+ context->allowInCritSection = allow;
+}
+
+/*
+ * GetMemoryChunkSpace
+ * Given a currently-allocated chunk, determine the total space
+ * it occupies (including all memory-allocation overhead).
+ *
+ * This is useful for measuring the total space occupied by a set of
+ * allocated chunks.
+ */
+Size
+GetMemoryChunkSpace(void *pointer)
+{
+ MemoryContext context = GetMemoryChunkContext(pointer);
+
+ return context->methods->get_chunk_space(context, pointer);
+}
+
+/*
+ * MemoryContextGetParent
+ * Get the parent context (if any) of the specified context
+ */
+MemoryContext
+MemoryContextGetParent(MemoryContext context)
+{
+ AssertArg(MemoryContextIsValid(context));
+
+ return context->parent;
+}
+
+/*
+ * MemoryContextIsEmpty
+ * Is a memory context empty of any allocated space?
+ */
+bool
+MemoryContextIsEmpty(MemoryContext context)
+{
+ AssertArg(MemoryContextIsValid(context));
+
+ /*
+ * For now, we consider a memory context nonempty if it has any children;
+ * perhaps this should be changed later.
+ */
+ if (context->firstchild != NULL)
+ return false;
+ /* Otherwise use the type-specific inquiry */
+ return context->methods->is_empty(context);
+}
+
+/*
+ * Find the memory allocated to blocks for this memory context. If recurse is
+ * true, also include children.
+ */
+Size
+MemoryContextMemAllocated(MemoryContext context, bool recurse)
+{
+ Size total = context->mem_allocated;
+
+ AssertArg(MemoryContextIsValid(context));
+
+ if (recurse)
+ {
+ MemoryContext child;
+
+ for (child = context->firstchild;
+ child != NULL;
+ child = child->nextchild)
+ total += MemoryContextMemAllocated(child, true);
+ }
+
+ return total;
+}
+
+/*
+ * MemoryContextStats
+ * Print statistics about the named context and all its descendants.
+ *
+ * This is just a debugging utility, so it's not very fancy. However, we do
+ * make some effort to summarize when the output would otherwise be very long.
+ * The statistics are sent to stderr.
+ */
+void
+MemoryContextStats(MemoryContext context)
+{
+ /* A hard-wired limit on the number of children is usually good enough */
+ MemoryContextStatsDetail(context, 100, true);
+}
+
+/*
+ * MemoryContextStatsDetail
+ *
+ * Entry point for use if you want to vary the number of child contexts shown.
+ *
+ * If print_to_stderr is true, print statistics about the memory contexts
+ * with fprintf(stderr), otherwise use ereport().
+ */
+void
+MemoryContextStatsDetail(MemoryContext context, int max_children,
+ bool print_to_stderr)
+{
+ MemoryContextCounters grand_totals;
+
+ memset(&grand_totals, 0, sizeof(grand_totals));
+
+ MemoryContextStatsInternal(context, 0, true, max_children, &grand_totals, print_to_stderr);
+
+ if (print_to_stderr)
+ fprintf(stderr,
+ "Grand total: %zu bytes in %zd blocks; %zu free (%zd chunks); %zu used\n",
+ grand_totals.totalspace, grand_totals.nblocks,
+ grand_totals.freespace, grand_totals.freechunks,
+ grand_totals.totalspace - grand_totals.freespace);
+ else
+
+ /*
+ * Use LOG_SERVER_ONLY to prevent the memory contexts from being sent
+ * to the connected client.
+ *
+ * We don't buffer the information about all memory contexts in a
+ * backend into StringInfo and log it as one message. Otherwise which
+ * may require the buffer to be enlarged very much and lead to OOM
+ * error since there can be a large number of memory contexts in a
+ * backend. Instead, we log one message per memory context.
+ */
+ ereport(LOG_SERVER_ONLY,
+ (errhidestmt(true),
+ errhidecontext(true),
+ errmsg_internal("Grand total: %zu bytes in %zd blocks; %zu free (%zd chunks); %zu used",
+ grand_totals.totalspace, grand_totals.nblocks,
+ grand_totals.freespace, grand_totals.freechunks,
+ grand_totals.totalspace - grand_totals.freespace)));
+}
+
+/*
+ * MemoryContextStatsInternal
+ * One recursion level for MemoryContextStats
+ *
+ * Print this context if print is true, but in any case accumulate counts into
+ * *totals (if given).
+ */
+static void
+MemoryContextStatsInternal(MemoryContext context, int level,
+ bool print, int max_children,
+ MemoryContextCounters *totals,
+ bool print_to_stderr)
+{
+ MemoryContextCounters local_totals;
+ MemoryContext child;
+ int ichild;
+
+ AssertArg(MemoryContextIsValid(context));
+
+ /* Examine the context itself */
+ context->methods->stats(context,
+ print ? MemoryContextStatsPrint : NULL,
+ (void *) &level,
+ totals, print_to_stderr);
+
+ /*
+ * Examine children. If there are more than max_children of them, we do
+ * not print the rest explicitly, but just summarize them.
+ */
+ memset(&local_totals, 0, sizeof(local_totals));
+
+ for (child = context->firstchild, ichild = 0;
+ child != NULL;
+ child = child->nextchild, ichild++)
+ {
+ if (ichild < max_children)
+ MemoryContextStatsInternal(child, level + 1,
+ print, max_children,
+ totals,
+ print_to_stderr);
+ else
+ MemoryContextStatsInternal(child, level + 1,
+ false, max_children,
+ &local_totals,
+ print_to_stderr);
+ }
+
+ /* Deal with excess children */
+ if (ichild > max_children)
+ {
+ if (print)
+ {
+ if (print_to_stderr)
+ {
+ int i;
+
+ for (i = 0; i <= level; i++)
+ fprintf(stderr, " ");
+ fprintf(stderr,
+ "%d more child contexts containing %zu total in %zd blocks; %zu free (%zd chunks); %zu used\n",
+ ichild - max_children,
+ local_totals.totalspace,
+ local_totals.nblocks,
+ local_totals.freespace,
+ local_totals.freechunks,
+ local_totals.totalspace - local_totals.freespace);
+ }
+ else
+ ereport(LOG_SERVER_ONLY,
+ (errhidestmt(true),
+ errhidecontext(true),
+ errmsg_internal("level: %d; %d more child contexts containing %zu total in %zd blocks; %zu free (%zd chunks); %zu used",
+ level,
+ ichild - max_children,
+ local_totals.totalspace,
+ local_totals.nblocks,
+ local_totals.freespace,
+ local_totals.freechunks,
+ local_totals.totalspace - local_totals.freespace)));
+ }
+
+ if (totals)
+ {
+ totals->nblocks += local_totals.nblocks;
+ totals->freechunks += local_totals.freechunks;
+ totals->totalspace += local_totals.totalspace;
+ totals->freespace += local_totals.freespace;
+ }
+ }
+}
+
+/*
+ * MemoryContextStatsPrint
+ * Print callback used by MemoryContextStatsInternal
+ *
+ * For now, the passthru pointer just points to "int level"; later we might
+ * make that more complicated.
+ */
+static void
+MemoryContextStatsPrint(MemoryContext context, void *passthru,
+ const char *stats_string,
+ bool print_to_stderr)
+{
+ int level = *(int *) passthru;
+ const char *name = context->name;
+ const char *ident = context->ident;
+ char truncated_ident[110];
+ int i;
+
+ /*
+ * It seems preferable to label dynahash contexts with just the hash table
+ * name. Those are already unique enough, so the "dynahash" part isn't
+ * very helpful, and this way is more consistent with pre-v11 practice.
+ */
+ if (ident && strcmp(name, "dynahash") == 0)
+ {
+ name = ident;
+ ident = NULL;
+ }
+
+ truncated_ident[0] = '\0';
+
+ if (ident)
+ {
+ /*
+ * Some contexts may have very long identifiers (e.g., SQL queries).
+ * Arbitrarily truncate at 100 bytes, but be careful not to break
+ * multibyte characters. Also, replace ASCII control characters, such
+ * as newlines, with spaces.
+ */
+ int idlen = strlen(ident);
+ bool truncated = false;
+
+ strcpy(truncated_ident, ": ");
+ i = strlen(truncated_ident);
+
+ if (idlen > 100)
+ {
+ idlen = pg_mbcliplen(ident, idlen, 100);
+ truncated = true;
+ }
+
+ while (idlen-- > 0)
+ {
+ unsigned char c = *ident++;
+
+ if (c < ' ')
+ c = ' ';
+ truncated_ident[i++] = c;
+ }
+ truncated_ident[i] = '\0';
+
+ if (truncated)
+ strcat(truncated_ident, "...");
+ }
+
+ if (print_to_stderr)
+ {
+ for (i = 0; i < level; i++)
+ fprintf(stderr, " ");
+ fprintf(stderr, "%s: %s%s\n", name, stats_string, truncated_ident);
+ }
+ else
+ ereport(LOG_SERVER_ONLY,
+ (errhidestmt(true),
+ errhidecontext(true),
+ errmsg_internal("level: %d; %s: %s%s",
+ level, name, stats_string, truncated_ident)));
+}
+
+/*
+ * MemoryContextCheck
+ * Check all chunks in the named context.
+ *
+ * This is just a debugging utility, so it's not fancy.
+ */
+#ifdef MEMORY_CONTEXT_CHECKING
+void
+MemoryContextCheck(MemoryContext context)
+{
+ MemoryContext child;
+
+ AssertArg(MemoryContextIsValid(context));
+
+ context->methods->check(context);
+ for (child = context->firstchild; child != NULL; child = child->nextchild)
+ MemoryContextCheck(child);
+}
+#endif
+
+/*
+ * MemoryContextContains
+ * Detect whether an allocated chunk of memory belongs to a given
+ * context or not.
+ *
+ * Caution: this test is reliable as long as 'pointer' does point to
+ * a chunk of memory allocated from *some* context. If 'pointer' points
+ * at memory obtained in some other way, there is a small chance of a
+ * false-positive result, since the bits right before it might look like
+ * a valid chunk header by chance.
+ */
+bool
+MemoryContextContains(MemoryContext context, void *pointer)
+{
+ MemoryContext ptr_context;
+
+ /*
+ * NB: Can't use GetMemoryChunkContext() here - that performs assertions
+ * that aren't acceptable here since we might be passed memory not
+ * allocated by any memory context.
+ *
+ * Try to detect bogus pointers handed to us, poorly though we can.
+ * Presumably, a pointer that isn't MAXALIGNED isn't pointing at an
+ * allocated chunk.
+ */
+ if (pointer == NULL || pointer != (void *) MAXALIGN(pointer))
+ return false;
+
+ /*
+ * OK, it's probably safe to look at the context.
+ */
+ ptr_context = *(MemoryContext *) (((char *) pointer) - sizeof(void *));
+
+ return ptr_context == context;
+}
+
+/*
+ * MemoryContextCreate
+ * Context-type-independent part of context creation.
+ *
+ * This is only intended to be called by context-type-specific
+ * context creation routines, not by the unwashed masses.
+ *
+ * The memory context creation procedure goes like this:
+ * 1. Context-type-specific routine makes some initial space allocation,
+ * including enough space for the context header. If it fails,
+ * it can ereport() with no damage done.
+ * 2. Context-type-specific routine sets up all type-specific fields of
+ * the header (those beyond MemoryContextData proper), as well as any
+ * other management fields it needs to have a fully valid context.
+ * Usually, failure in this step is impossible, but if it's possible
+ * the initial space allocation should be freed before ereport'ing.
+ * 3. Context-type-specific routine calls MemoryContextCreate() to fill in
+ * the generic header fields and link the context into the context tree.
+ * 4. We return to the context-type-specific routine, which finishes
+ * up type-specific initialization. This routine can now do things
+ * that might fail (like allocate more memory), so long as it's
+ * sure the node is left in a state that delete will handle.
+ *
+ * node: the as-yet-uninitialized common part of the context header node.
+ * tag: NodeTag code identifying the memory context type.
+ * methods: context-type-specific methods (usually statically allocated).
+ * parent: parent context, or NULL if this will be a top-level context.
+ * name: name of context (must be statically allocated).
+ *
+ * Context routines generally assume that MemoryContextCreate can't fail,
+ * so this can contain Assert but not elog/ereport.
+ */
+void
+MemoryContextCreate(MemoryContext node,
+ NodeTag tag,
+ const MemoryContextMethods *methods,
+ MemoryContext parent,
+ const char *name)
+{
+ /* Creating new memory contexts is not allowed in a critical section */
+ Assert(CritSectionCount == 0);
+
+ /* Initialize all standard fields of memory context header */
+ node->type = tag;
+ node->isReset = true;
+ node->methods = methods;
+ node->parent = parent;
+ node->firstchild = NULL;
+ node->mem_allocated = 0;
+ node->prevchild = NULL;
+ node->name = name;
+ node->ident = NULL;
+ node->reset_cbs = NULL;
+
+ /* OK to link node into context tree */
+ if (parent)
+ {
+ node->nextchild = parent->firstchild;
+ if (parent->firstchild != NULL)
+ parent->firstchild->prevchild = node;
+ parent->firstchild = node;
+ /* inherit allowInCritSection flag from parent */
+ node->allowInCritSection = parent->allowInCritSection;
+ }
+ else
+ {
+ node->nextchild = NULL;
+ node->allowInCritSection = false;
+ }
+
+ VALGRIND_CREATE_MEMPOOL(node, 0, false);
+}
+
+/*
+ * MemoryContextAlloc
+ * Allocate space within the specified context.
+ *
+ * This could be turned into a macro, but we'd have to import
+ * nodes/memnodes.h into postgres.h which seems a bad idea.
+ */
+void *
+MemoryContextAlloc(MemoryContext context, Size size)
+{
+ void *ret;
+
+ AssertArg(MemoryContextIsValid(context));
+ AssertNotInCriticalSection(context);
+
+ if (!AllocSizeIsValid(size))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ context->isReset = false;
+
+ ret = context->methods->alloc(context, size);
+ if (unlikely(ret == NULL))
+ {
+ MemoryContextStats(TopMemoryContext);
+
+ /*
+ * Here, and elsewhere in this module, we show the target context's
+ * "name" but not its "ident" (if any) in user-visible error messages.
+ * The "ident" string might contain security-sensitive data, such as
+ * values in SQL commands.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+
+ VALGRIND_MEMPOOL_ALLOC(context, ret, size);
+
+ return ret;
+}
+
+/*
+ * MemoryContextAllocZero
+ * Like MemoryContextAlloc, but clears allocated memory
+ *
+ * We could just call MemoryContextAlloc then clear the memory, but this
+ * is a very common combination, so we provide the combined operation.
+ */
+void *
+MemoryContextAllocZero(MemoryContext context, Size size)
+{
+ void *ret;
+
+ AssertArg(MemoryContextIsValid(context));
+ AssertNotInCriticalSection(context);
+
+ if (!AllocSizeIsValid(size))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ context->isReset = false;
+
+ ret = context->methods->alloc(context, size);
+ if (unlikely(ret == NULL))
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+
+ VALGRIND_MEMPOOL_ALLOC(context, ret, size);
+
+ MemSetAligned(ret, 0, size);
+
+ return ret;
+}
+
+/*
+ * MemoryContextAllocZeroAligned
+ * MemoryContextAllocZero where length is suitable for MemSetLoop
+ *
+ * This might seem overly specialized, but it's not because newNode()
+ * is so often called with compile-time-constant sizes.
+ */
+void *
+MemoryContextAllocZeroAligned(MemoryContext context, Size size)
+{
+ void *ret;
+
+ AssertArg(MemoryContextIsValid(context));
+ AssertNotInCriticalSection(context);
+
+ if (!AllocSizeIsValid(size))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ context->isReset = false;
+
+ ret = context->methods->alloc(context, size);
+ if (unlikely(ret == NULL))
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+
+ VALGRIND_MEMPOOL_ALLOC(context, ret, size);
+
+ MemSetLoop(ret, 0, size);
+
+ return ret;
+}
+
+/*
+ * MemoryContextAllocExtended
+ * Allocate space within the specified context using the given flags.
+ */
+void *
+MemoryContextAllocExtended(MemoryContext context, Size size, int flags)
+{
+ void *ret;
+
+ AssertArg(MemoryContextIsValid(context));
+ AssertNotInCriticalSection(context);
+
+ if (((flags & MCXT_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
+ ((flags & MCXT_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ context->isReset = false;
+
+ ret = context->methods->alloc(context, size);
+ if (unlikely(ret == NULL))
+ {
+ if ((flags & MCXT_ALLOC_NO_OOM) == 0)
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+ return NULL;
+ }
+
+ VALGRIND_MEMPOOL_ALLOC(context, ret, size);
+
+ if ((flags & MCXT_ALLOC_ZERO) != 0)
+ MemSetAligned(ret, 0, size);
+
+ return ret;
+}
+
+/*
+ * HandleLogMemoryContextInterrupt
+ * Handle receipt of an interrupt indicating logging of memory
+ * contexts.
+ *
+ * All the actual work is deferred to ProcessLogMemoryContextInterrupt(),
+ * because we cannot safely emit a log message inside the signal handler.
+ */
+void
+HandleLogMemoryContextInterrupt(void)
+{
+ InterruptPending = true;
+ LogMemoryContextPending = true;
+ /* latch will be set by procsignal_sigusr1_handler */
+}
+
+/*
+ * ProcessLogMemoryContextInterrupt
+ * Perform logging of memory contexts of this backend process.
+ *
+ * Any backend that participates in ProcSignal signaling must arrange
+ * to call this function if we see LogMemoryContextPending set.
+ * It is called from CHECK_FOR_INTERRUPTS(), which is enough because
+ * the target process for logging of memory contexts is a backend.
+ */
+void
+ProcessLogMemoryContextInterrupt(void)
+{
+ LogMemoryContextPending = false;
+
+ /*
+ * Use LOG_SERVER_ONLY to prevent this message from being sent to the
+ * connected client.
+ */
+ ereport(LOG_SERVER_ONLY,
+ (errhidestmt(true),
+ errhidecontext(true),
+ errmsg("logging memory contexts of PID %d", MyProcPid)));
+
+ /*
+ * When a backend process is consuming huge memory, logging all its memory
+ * contexts might overrun available disk space. To prevent this, we limit
+ * the number of child contexts to log per parent to 100.
+ *
+ * As with MemoryContextStats(), we suppose that practical cases where the
+ * dump gets long will typically be huge numbers of siblings under the
+ * same parent context; while the additional debugging value from seeing
+ * details about individual siblings beyond 100 will not be large.
+ */
+ MemoryContextStatsDetail(TopMemoryContext, 100, false);
+}
+
+void *
+palloc(Size size)
+{
+ /* duplicates MemoryContextAlloc to avoid increased overhead */
+ void *ret;
+ MemoryContext context = CurrentMemoryContext;
+
+ AssertArg(MemoryContextIsValid(context));
+ AssertNotInCriticalSection(context);
+
+ if (!AllocSizeIsValid(size))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ context->isReset = false;
+
+ ret = context->methods->alloc(context, size);
+ if (unlikely(ret == NULL))
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+
+ VALGRIND_MEMPOOL_ALLOC(context, ret, size);
+
+ return ret;
+}
+
+void *
+palloc0(Size size)
+{
+ /* duplicates MemoryContextAllocZero to avoid increased overhead */
+ void *ret;
+ MemoryContext context = CurrentMemoryContext;
+
+ AssertArg(MemoryContextIsValid(context));
+ AssertNotInCriticalSection(context);
+
+ if (!AllocSizeIsValid(size))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ context->isReset = false;
+
+ ret = context->methods->alloc(context, size);
+ if (unlikely(ret == NULL))
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+
+ VALGRIND_MEMPOOL_ALLOC(context, ret, size);
+
+ MemSetAligned(ret, 0, size);
+
+ return ret;
+}
+
+void *
+palloc_extended(Size size, int flags)
+{
+ /* duplicates MemoryContextAllocExtended to avoid increased overhead */
+ void *ret;
+ MemoryContext context = CurrentMemoryContext;
+
+ AssertArg(MemoryContextIsValid(context));
+ AssertNotInCriticalSection(context);
+
+ if (((flags & MCXT_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
+ ((flags & MCXT_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ context->isReset = false;
+
+ ret = context->methods->alloc(context, size);
+ if (unlikely(ret == NULL))
+ {
+ if ((flags & MCXT_ALLOC_NO_OOM) == 0)
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+ return NULL;
+ }
+
+ VALGRIND_MEMPOOL_ALLOC(context, ret, size);
+
+ if ((flags & MCXT_ALLOC_ZERO) != 0)
+ MemSetAligned(ret, 0, size);
+
+ return ret;
+}
+
+/*
+ * pfree
+ * Release an allocated chunk.
+ */
+void
+pfree(void *pointer)
+{
+ MemoryContext context = GetMemoryChunkContext(pointer);
+
+ context->methods->free_p(context, pointer);
+ VALGRIND_MEMPOOL_FREE(context, pointer);
+}
+
+/*
+ * repalloc
+ * Adjust the size of a previously allocated chunk.
+ */
+void *
+repalloc(void *pointer, Size size)
+{
+ MemoryContext context = GetMemoryChunkContext(pointer);
+ void *ret;
+
+ if (!AllocSizeIsValid(size))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ AssertNotInCriticalSection(context);
+
+ /* isReset must be false already */
+ Assert(!context->isReset);
+
+ ret = context->methods->realloc(context, pointer, size);
+ if (unlikely(ret == NULL))
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+
+ VALGRIND_MEMPOOL_CHANGE(context, pointer, ret, size);
+
+ return ret;
+}
+
+/*
+ * MemoryContextAllocHuge
+ * Allocate (possibly-expansive) space within the specified context.
+ *
+ * See considerations in comment at MaxAllocHugeSize.
+ */
+void *
+MemoryContextAllocHuge(MemoryContext context, Size size)
+{
+ void *ret;
+
+ AssertArg(MemoryContextIsValid(context));
+ AssertNotInCriticalSection(context);
+
+ if (!AllocHugeSizeIsValid(size))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ context->isReset = false;
+
+ ret = context->methods->alloc(context, size);
+ if (unlikely(ret == NULL))
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+
+ VALGRIND_MEMPOOL_ALLOC(context, ret, size);
+
+ return ret;
+}
+
+/*
+ * repalloc_huge
+ * Adjust the size of a previously allocated chunk, permitting a large
+ * value. The previous allocation need not have been "huge".
+ */
+void *
+repalloc_huge(void *pointer, Size size)
+{
+ MemoryContext context = GetMemoryChunkContext(pointer);
+ void *ret;
+
+ if (!AllocHugeSizeIsValid(size))
+ elog(ERROR, "invalid memory alloc request size %zu", size);
+
+ AssertNotInCriticalSection(context);
+
+ /* isReset must be false already */
+ Assert(!context->isReset);
+
+ ret = context->methods->realloc(context, pointer, size);
+ if (unlikely(ret == NULL))
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed on request of size %zu in memory context \"%s\".",
+ size, context->name)));
+ }
+
+ VALGRIND_MEMPOOL_CHANGE(context, pointer, ret, size);
+
+ return ret;
+}
+
+/*
+ * MemoryContextStrdup
+ * Like strdup(), but allocate from the specified context
+ */
+char *
+MemoryContextStrdup(MemoryContext context, const char *string)
+{
+ char *nstr;
+ Size len = strlen(string) + 1;
+
+ nstr = (char *) MemoryContextAlloc(context, len);
+
+ memcpy(nstr, string, len);
+
+ return nstr;
+}
+
+char *
+pstrdup(const char *in)
+{
+ return MemoryContextStrdup(CurrentMemoryContext, in);
+}
+
+/*
+ * pnstrdup
+ * Like pstrdup(), but append null byte to a
+ * not-necessarily-null-terminated input string.
+ */
+char *
+pnstrdup(const char *in, Size len)
+{
+ char *out;
+
+ len = strnlen(in, len);
+
+ out = palloc(len + 1);
+ memcpy(out, in, len);
+ out[len] = '\0';
+
+ return out;
+}
+
+/*
+ * Make copy of string with all trailing newline characters removed.
+ */
+char *
+pchomp(const char *in)
+{
+ size_t n;
+
+ n = strlen(in);
+ while (n > 0 && in[n - 1] == '\n')
+ n--;
+ return pnstrdup(in, n);
+}
diff --git a/src/backend/utils/mmgr/memdebug.c b/src/backend/utils/mmgr/memdebug.c
new file mode 100644
index 0000000..3644c7f
--- /dev/null
+++ b/src/backend/utils/mmgr/memdebug.c
@@ -0,0 +1,93 @@
+/*-------------------------------------------------------------------------
+ *
+ * memdebug.c
+ * Declarations used in memory context implementations, not part of the
+ * public API of the memory management subsystem.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/utils/mmgr/memdebug.c
+ *
+ *
+ * About CLOBBER_FREED_MEMORY:
+ *
+ * If this symbol is defined, all freed memory is overwritten with 0x7F's.
+ * This is useful for catching places that reference already-freed memory.
+ *
+ * About MEMORY_CONTEXT_CHECKING:
+ *
+ * Since we usually round request sizes up to the next power of 2, there
+ * is often some unused space immediately after a requested data area.
+ * Thus, if someone makes the common error of writing past what they've
+ * requested, the problem is likely to go unnoticed ... until the day when
+ * there *isn't* any wasted space, perhaps because of different memory
+ * alignment on a new platform, or some other effect. To catch this sort
+ * of problem, the MEMORY_CONTEXT_CHECKING option stores 0x7E just beyond
+ * the requested space whenever the request is less than the actual chunk
+ * size, and verifies that the byte is undamaged when the chunk is freed.
+ *
+ *
+ * About USE_VALGRIND and Valgrind client requests:
+ *
+ * Valgrind provides "client request" macros that exchange information with
+ * the host Valgrind (if any). Under !USE_VALGRIND, memdebug.h stubs out
+ * currently-used macros.
+ *
+ * When running under Valgrind, we want a NOACCESS memory region both before
+ * and after the allocation. The chunk header is tempting as the preceding
+ * region, but mcxt.c expects to able to examine the standard chunk header
+ * fields. Therefore, we use, when available, the requested_size field and
+ * any subsequent padding. requested_size is made NOACCESS before returning
+ * a chunk pointer to a caller. However, to reduce client request traffic,
+ * it is kept DEFINED in chunks on the free list.
+ *
+ * The rounded-up capacity of the chunk usually acts as a post-allocation
+ * NOACCESS region. If the request consumes precisely the entire chunk,
+ * there is no such region; another chunk header may immediately follow. In
+ * that case, Valgrind will not detect access beyond the end of the chunk.
+ *
+ * See also the cooperating Valgrind client requests in mcxt.c.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/memdebug.h"
+
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+
+/*
+ * Fill a just-allocated piece of memory with "random" data. It's not really
+ * very random, just a repeating sequence with a length that's prime. What
+ * we mainly want out of it is to have a good probability that two palloc's
+ * of the same number of bytes start out containing different data.
+ *
+ * The region may be NOACCESS, so make it UNDEFINED first to avoid errors as
+ * we fill it. Filling the region makes it DEFINED, so make it UNDEFINED
+ * again afterward. Whether to finally make it UNDEFINED or NOACCESS is
+ * fairly arbitrary. UNDEFINED is more convenient for SlabRealloc(), and
+ * other callers have no preference.
+ */
+void
+randomize_mem(char *ptr, size_t size)
+{
+ static int save_ctr = 1;
+ size_t remaining = size;
+ int ctr;
+
+ ctr = save_ctr;
+ VALGRIND_MAKE_MEM_UNDEFINED(ptr, size);
+ while (remaining-- > 0)
+ {
+ *ptr++ = ctr;
+ if (++ctr > 251)
+ ctr = 1;
+ }
+ VALGRIND_MAKE_MEM_UNDEFINED(ptr - size, size);
+ save_ctr = ctr;
+}
+
+#endif /* RANDOMIZE_ALLOCATED_MEMORY */
diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c
new file mode 100644
index 0000000..d93c591
--- /dev/null
+++ b/src/backend/utils/mmgr/portalmem.c
@@ -0,0 +1,1340 @@
+/*-------------------------------------------------------------------------
+ *
+ * portalmem.c
+ * backend portal memory management
+ *
+ * Portals are objects representing the execution state of a query.
+ * This module provides memory management services for portals, but it
+ * doesn't actually run the executor for them.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/utils/mmgr/portalmem.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/xact.h"
+#include "catalog/pg_type.h"
+#include "commands/portalcmds.h"
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+#include "utils/snapmgr.h"
+#include "utils/timestamp.h"
+
+/*
+ * Estimate of the maximum number of open portals a user would have,
+ * used in initially sizing the PortalHashTable in EnablePortalManager().
+ * Since the hash table can expand, there's no need to make this overly
+ * generous, and keeping it small avoids unnecessary overhead in the
+ * hash_seq_search() calls executed during transaction end.
+ */
+#define PORTALS_PER_USER 16
+
+
+/* ----------------
+ * Global state
+ * ----------------
+ */
+
+#define MAX_PORTALNAME_LEN NAMEDATALEN
+
+typedef struct portalhashent
+{
+ char portalname[MAX_PORTALNAME_LEN];
+ Portal portal;
+} PortalHashEnt;
+
+static HTAB *PortalHashTable = NULL;
+
+#define PortalHashTableLookup(NAME, PORTAL) \
+do { \
+ PortalHashEnt *hentry; \
+ \
+ hentry = (PortalHashEnt *) hash_search(PortalHashTable, \
+ (NAME), HASH_FIND, NULL); \
+ if (hentry) \
+ PORTAL = hentry->portal; \
+ else \
+ PORTAL = NULL; \
+} while(0)
+
+#define PortalHashTableInsert(PORTAL, NAME) \
+do { \
+ PortalHashEnt *hentry; bool found; \
+ \
+ hentry = (PortalHashEnt *) hash_search(PortalHashTable, \
+ (NAME), HASH_ENTER, &found); \
+ if (found) \
+ elog(ERROR, "duplicate portal name"); \
+ hentry->portal = PORTAL; \
+ /* To avoid duplicate storage, make PORTAL->name point to htab entry */ \
+ PORTAL->name = hentry->portalname; \
+} while(0)
+
+#define PortalHashTableDelete(PORTAL) \
+do { \
+ PortalHashEnt *hentry; \
+ \
+ hentry = (PortalHashEnt *) hash_search(PortalHashTable, \
+ PORTAL->name, HASH_REMOVE, NULL); \
+ if (hentry == NULL) \
+ elog(WARNING, "trying to delete portal name that does not exist"); \
+} while(0)
+
+static MemoryContext TopPortalContext = NULL;
+
+
+/* ----------------------------------------------------------------
+ * public portal interface functions
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * EnablePortalManager
+ * Enables the portal management module at backend startup.
+ */
+void
+EnablePortalManager(void)
+{
+ HASHCTL ctl;
+
+ Assert(TopPortalContext == NULL);
+
+ TopPortalContext = AllocSetContextCreate(TopMemoryContext,
+ "TopPortalContext",
+ ALLOCSET_DEFAULT_SIZES);
+
+ ctl.keysize = MAX_PORTALNAME_LEN;
+ ctl.entrysize = sizeof(PortalHashEnt);
+
+ /*
+ * use PORTALS_PER_USER as a guess of how many hash table entries to
+ * create, initially
+ */
+ PortalHashTable = hash_create("Portal hash", PORTALS_PER_USER,
+ &ctl, HASH_ELEM | HASH_STRINGS);
+}
+
+/*
+ * GetPortalByName
+ * Returns a portal given a portal name, or NULL if name not found.
+ */
+Portal
+GetPortalByName(const char *name)
+{
+ Portal portal;
+
+ if (PointerIsValid(name))
+ PortalHashTableLookup(name, portal);
+ else
+ portal = NULL;
+
+ return portal;
+}
+
+/*
+ * PortalGetPrimaryStmt
+ * Get the "primary" stmt within a portal, ie, the one marked canSetTag.
+ *
+ * Returns NULL if no such stmt. If multiple PlannedStmt structs within the
+ * portal are marked canSetTag, returns the first one. Neither of these
+ * cases should occur in present usages of this function.
+ */
+PlannedStmt *
+PortalGetPrimaryStmt(Portal portal)
+{
+ ListCell *lc;
+
+ foreach(lc, portal->stmts)
+ {
+ PlannedStmt *stmt = lfirst_node(PlannedStmt, lc);
+
+ if (stmt->canSetTag)
+ return stmt;
+ }
+ return NULL;
+}
+
+/*
+ * CreatePortal
+ * Returns a new portal given a name.
+ *
+ * allowDup: if true, automatically drop any pre-existing portal of the
+ * same name (if false, an error is raised).
+ *
+ * dupSilent: if true, don't even emit a WARNING.
+ */
+Portal
+CreatePortal(const char *name, bool allowDup, bool dupSilent)
+{
+ Portal portal;
+
+ AssertArg(PointerIsValid(name));
+
+ portal = GetPortalByName(name);
+ if (PortalIsValid(portal))
+ {
+ if (!allowDup)
+ ereport(ERROR,
+ (errcode(ERRCODE_DUPLICATE_CURSOR),
+ errmsg("cursor \"%s\" already exists", name)));
+ if (!dupSilent)
+ ereport(WARNING,
+ (errcode(ERRCODE_DUPLICATE_CURSOR),
+ errmsg("closing existing cursor \"%s\"",
+ name)));
+ PortalDrop(portal, false);
+ }
+
+ /* make new portal structure */
+ portal = (Portal) MemoryContextAllocZero(TopPortalContext, sizeof *portal);
+
+ /* initialize portal context; typically it won't store much */
+ portal->portalContext = AllocSetContextCreate(TopPortalContext,
+ "PortalContext",
+ ALLOCSET_SMALL_SIZES);
+
+ /* create a resource owner for the portal */
+ portal->resowner = ResourceOwnerCreate(CurTransactionResourceOwner,
+ "Portal");
+
+ /* initialize portal fields that don't start off zero */
+ portal->status = PORTAL_NEW;
+ portal->cleanup = PortalCleanup;
+ portal->createSubid = GetCurrentSubTransactionId();
+ portal->activeSubid = portal->createSubid;
+ portal->createLevel = GetCurrentTransactionNestLevel();
+ portal->strategy = PORTAL_MULTI_QUERY;
+ portal->cursorOptions = CURSOR_OPT_NO_SCROLL;
+ portal->atStart = true;
+ portal->atEnd = true; /* disallow fetches until query is set */
+ portal->visible = true;
+ portal->creation_time = GetCurrentStatementStartTimestamp();
+
+ /* put portal in table (sets portal->name) */
+ PortalHashTableInsert(portal, name);
+
+ /* for named portals reuse portal->name copy */
+ MemoryContextSetIdentifier(portal->portalContext, portal->name[0] ? portal->name : "<unnamed>");
+
+ return portal;
+}
+
+/*
+ * CreateNewPortal
+ * Create a new portal, assigning it a random nonconflicting name.
+ */
+Portal
+CreateNewPortal(void)
+{
+ static unsigned int unnamed_portal_count = 0;
+
+ char portalname[MAX_PORTALNAME_LEN];
+
+ /* Select a nonconflicting name */
+ for (;;)
+ {
+ unnamed_portal_count++;
+ sprintf(portalname, "<unnamed portal %u>", unnamed_portal_count);
+ if (GetPortalByName(portalname) == NULL)
+ break;
+ }
+
+ return CreatePortal(portalname, false, false);
+}
+
+/*
+ * PortalDefineQuery
+ * A simple subroutine to establish a portal's query.
+ *
+ * Notes: as of PG 8.4, caller MUST supply a sourceText string; it is not
+ * allowed anymore to pass NULL. (If you really don't have source text,
+ * you can pass a constant string, perhaps "(query not available)".)
+ *
+ * commandTag shall be NULL if and only if the original query string
+ * (before rewriting) was an empty string. Also, the passed commandTag must
+ * be a pointer to a constant string, since it is not copied.
+ *
+ * If cplan is provided, then it is a cached plan containing the stmts, and
+ * the caller must have done GetCachedPlan(), causing a refcount increment.
+ * The refcount will be released when the portal is destroyed.
+ *
+ * If cplan is NULL, then it is the caller's responsibility to ensure that
+ * the passed plan trees have adequate lifetime. Typically this is done by
+ * copying them into the portal's context.
+ *
+ * The caller is also responsible for ensuring that the passed prepStmtName
+ * (if not NULL) and sourceText have adequate lifetime.
+ *
+ * NB: this function mustn't do much beyond storing the passed values; in
+ * particular don't do anything that risks elog(ERROR). If that were to
+ * happen here before storing the cplan reference, we'd leak the plancache
+ * refcount that the caller is trying to hand off to us.
+ */
+void
+PortalDefineQuery(Portal portal,
+ const char *prepStmtName,
+ const char *sourceText,
+ CommandTag commandTag,
+ List *stmts,
+ CachedPlan *cplan)
+{
+ AssertArg(PortalIsValid(portal));
+ AssertState(portal->status == PORTAL_NEW);
+
+ AssertArg(sourceText != NULL);
+ AssertArg(commandTag != CMDTAG_UNKNOWN || stmts == NIL);
+
+ portal->prepStmtName = prepStmtName;
+ portal->sourceText = sourceText;
+ portal->qc.commandTag = commandTag;
+ portal->qc.nprocessed = 0;
+ portal->commandTag = commandTag;
+ portal->stmts = stmts;
+ portal->cplan = cplan;
+ portal->status = PORTAL_DEFINED;
+}
+
+/*
+ * PortalReleaseCachedPlan
+ * Release a portal's reference to its cached plan, if any.
+ */
+static void
+PortalReleaseCachedPlan(Portal portal)
+{
+ if (portal->cplan)
+ {
+ ReleaseCachedPlan(portal->cplan, NULL);
+ portal->cplan = NULL;
+
+ /*
+ * We must also clear portal->stmts which is now a dangling reference
+ * to the cached plan's plan list. This protects any code that might
+ * try to examine the Portal later.
+ */
+ portal->stmts = NIL;
+ }
+}
+
+/*
+ * PortalCreateHoldStore
+ * Create the tuplestore for a portal.
+ */
+void
+PortalCreateHoldStore(Portal portal)
+{
+ MemoryContext oldcxt;
+
+ Assert(portal->holdContext == NULL);
+ Assert(portal->holdStore == NULL);
+ Assert(portal->holdSnapshot == NULL);
+
+ /*
+ * Create the memory context that is used for storage of the tuple set.
+ * Note this is NOT a child of the portal's portalContext.
+ */
+ portal->holdContext =
+ AllocSetContextCreate(TopPortalContext,
+ "PortalHoldContext",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * Create the tuple store, selecting cross-transaction temp files, and
+ * enabling random access only if cursor requires scrolling.
+ *
+ * XXX: Should maintenance_work_mem be used for the portal size?
+ */
+ oldcxt = MemoryContextSwitchTo(portal->holdContext);
+
+ portal->holdStore =
+ tuplestore_begin_heap(portal->cursorOptions & CURSOR_OPT_SCROLL,
+ true, work_mem);
+
+ MemoryContextSwitchTo(oldcxt);
+}
+
+/*
+ * PinPortal
+ * Protect a portal from dropping.
+ *
+ * A pinned portal is still unpinned and dropped at transaction or
+ * subtransaction abort.
+ */
+void
+PinPortal(Portal portal)
+{
+ if (portal->portalPinned)
+ elog(ERROR, "portal already pinned");
+
+ portal->portalPinned = true;
+}
+
+void
+UnpinPortal(Portal portal)
+{
+ if (!portal->portalPinned)
+ elog(ERROR, "portal not pinned");
+
+ portal->portalPinned = false;
+}
+
+/*
+ * MarkPortalActive
+ * Transition a portal from READY to ACTIVE state.
+ *
+ * NOTE: never set portal->status = PORTAL_ACTIVE directly; call this instead.
+ */
+void
+MarkPortalActive(Portal portal)
+{
+ /* For safety, this is a runtime test not just an Assert */
+ if (portal->status != PORTAL_READY)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("portal \"%s\" cannot be run", portal->name)));
+ /* Perform the state transition */
+ portal->status = PORTAL_ACTIVE;
+ portal->activeSubid = GetCurrentSubTransactionId();
+}
+
+/*
+ * MarkPortalDone
+ * Transition a portal from ACTIVE to DONE state.
+ *
+ * NOTE: never set portal->status = PORTAL_DONE directly; call this instead.
+ */
+void
+MarkPortalDone(Portal portal)
+{
+ /* Perform the state transition */
+ Assert(portal->status == PORTAL_ACTIVE);
+ portal->status = PORTAL_DONE;
+
+ /*
+ * Allow portalcmds.c to clean up the state it knows about. We might as
+ * well do that now, since the portal can't be executed any more.
+ *
+ * In some cases involving execution of a ROLLBACK command in an already
+ * aborted transaction, this is necessary, or we'd reach AtCleanup_Portals
+ * with the cleanup hook still unexecuted.
+ */
+ if (PointerIsValid(portal->cleanup))
+ {
+ portal->cleanup(portal);
+ portal->cleanup = NULL;
+ }
+}
+
+/*
+ * MarkPortalFailed
+ * Transition a portal into FAILED state.
+ *
+ * NOTE: never set portal->status = PORTAL_FAILED directly; call this instead.
+ */
+void
+MarkPortalFailed(Portal portal)
+{
+ /* Perform the state transition */
+ Assert(portal->status != PORTAL_DONE);
+ portal->status = PORTAL_FAILED;
+
+ /*
+ * Allow portalcmds.c to clean up the state it knows about. We might as
+ * well do that now, since the portal can't be executed any more.
+ *
+ * In some cases involving cleanup of an already aborted transaction, this
+ * is necessary, or we'd reach AtCleanup_Portals with the cleanup hook
+ * still unexecuted.
+ */
+ if (PointerIsValid(portal->cleanup))
+ {
+ portal->cleanup(portal);
+ portal->cleanup = NULL;
+ }
+}
+
+/*
+ * PortalDrop
+ * Destroy the portal.
+ */
+void
+PortalDrop(Portal portal, bool isTopCommit)
+{
+ AssertArg(PortalIsValid(portal));
+
+ /*
+ * Don't allow dropping a pinned portal, it's still needed by whoever
+ * pinned it.
+ */
+ if (portal->portalPinned)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_CURSOR_STATE),
+ errmsg("cannot drop pinned portal \"%s\"", portal->name)));
+
+ /*
+ * Not sure if the PORTAL_ACTIVE case can validly happen or not...
+ */
+ if (portal->status == PORTAL_ACTIVE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_CURSOR_STATE),
+ errmsg("cannot drop active portal \"%s\"", portal->name)));
+
+ /*
+ * Allow portalcmds.c to clean up the state it knows about, in particular
+ * shutting down the executor if still active. This step potentially runs
+ * user-defined code so failure has to be expected. It's the cleanup
+ * hook's responsibility to not try to do that more than once, in the case
+ * that failure occurs and then we come back to drop the portal again
+ * during transaction abort.
+ *
+ * Note: in most paths of control, this will have been done already in
+ * MarkPortalDone or MarkPortalFailed. We're just making sure.
+ */
+ if (PointerIsValid(portal->cleanup))
+ {
+ portal->cleanup(portal);
+ portal->cleanup = NULL;
+ }
+
+ /* There shouldn't be an active snapshot anymore, except after error */
+ Assert(portal->portalSnapshot == NULL || !isTopCommit);
+
+ /*
+ * Remove portal from hash table. Because we do this here, we will not
+ * come back to try to remove the portal again if there's any error in the
+ * subsequent steps. Better to leak a little memory than to get into an
+ * infinite error-recovery loop.
+ */
+ PortalHashTableDelete(portal);
+
+ /* drop cached plan reference, if any */
+ PortalReleaseCachedPlan(portal);
+
+ /*
+ * If portal has a snapshot protecting its data, release that. This needs
+ * a little care since the registration will be attached to the portal's
+ * resowner; if the portal failed, we will already have released the
+ * resowner (and the snapshot) during transaction abort.
+ */
+ if (portal->holdSnapshot)
+ {
+ if (portal->resowner)
+ UnregisterSnapshotFromOwner(portal->holdSnapshot,
+ portal->resowner);
+ portal->holdSnapshot = NULL;
+ }
+
+ /*
+ * Release any resources still attached to the portal. There are several
+ * cases being covered here:
+ *
+ * Top transaction commit (indicated by isTopCommit): normally we should
+ * do nothing here and let the regular end-of-transaction resource
+ * releasing mechanism handle these resources too. However, if we have a
+ * FAILED portal (eg, a cursor that got an error), we'd better clean up
+ * its resources to avoid resource-leakage warning messages.
+ *
+ * Sub transaction commit: never comes here at all, since we don't kill
+ * any portals in AtSubCommit_Portals().
+ *
+ * Main or sub transaction abort: we will do nothing here because
+ * portal->resowner was already set NULL; the resources were already
+ * cleaned up in transaction abort.
+ *
+ * Ordinary portal drop: must release resources. However, if the portal
+ * is not FAILED then we do not release its locks. The locks become the
+ * responsibility of the transaction's ResourceOwner (since it is the
+ * parent of the portal's owner) and will be released when the transaction
+ * eventually ends.
+ */
+ if (portal->resowner &&
+ (!isTopCommit || portal->status == PORTAL_FAILED))
+ {
+ bool isCommit = (portal->status != PORTAL_FAILED);
+
+ ResourceOwnerRelease(portal->resowner,
+ RESOURCE_RELEASE_BEFORE_LOCKS,
+ isCommit, false);
+ ResourceOwnerRelease(portal->resowner,
+ RESOURCE_RELEASE_LOCKS,
+ isCommit, false);
+ ResourceOwnerRelease(portal->resowner,
+ RESOURCE_RELEASE_AFTER_LOCKS,
+ isCommit, false);
+ ResourceOwnerDelete(portal->resowner);
+ }
+ portal->resowner = NULL;
+
+ /*
+ * Delete tuplestore if present. We should do this even under error
+ * conditions; since the tuplestore would have been using cross-
+ * transaction storage, its temp files need to be explicitly deleted.
+ */
+ if (portal->holdStore)
+ {
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(portal->holdContext);
+ tuplestore_end(portal->holdStore);
+ MemoryContextSwitchTo(oldcontext);
+ portal->holdStore = NULL;
+ }
+
+ /* delete tuplestore storage, if any */
+ if (portal->holdContext)
+ MemoryContextDelete(portal->holdContext);
+
+ /* release subsidiary storage */
+ MemoryContextDelete(portal->portalContext);
+
+ /* release portal struct (it's in TopPortalContext) */
+ pfree(portal);
+}
+
+/*
+ * Delete all declared cursors.
+ *
+ * Used by commands: CLOSE ALL, DISCARD ALL
+ */
+void
+PortalHashTableDeleteAll(void)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ if (PortalHashTable == NULL)
+ return;
+
+ hash_seq_init(&status, PortalHashTable);
+ while ((hentry = hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ /* Can't close the active portal (the one running the command) */
+ if (portal->status == PORTAL_ACTIVE)
+ continue;
+
+ PortalDrop(portal, false);
+
+ /* Restart the iteration in case that led to other drops */
+ hash_seq_term(&status);
+ hash_seq_init(&status, PortalHashTable);
+ }
+}
+
+/*
+ * "Hold" a portal. Prepare it for access by later transactions.
+ */
+static void
+HoldPortal(Portal portal)
+{
+ /*
+ * Note that PersistHoldablePortal() must release all resources used by
+ * the portal that are local to the creating transaction.
+ */
+ PortalCreateHoldStore(portal);
+ PersistHoldablePortal(portal);
+
+ /* drop cached plan reference, if any */
+ PortalReleaseCachedPlan(portal);
+
+ /*
+ * Any resources belonging to the portal will be released in the upcoming
+ * transaction-wide cleanup; the portal will no longer have its own
+ * resources.
+ */
+ portal->resowner = NULL;
+
+ /*
+ * Having successfully exported the holdable cursor, mark it as not
+ * belonging to this transaction.
+ */
+ portal->createSubid = InvalidSubTransactionId;
+ portal->activeSubid = InvalidSubTransactionId;
+ portal->createLevel = 0;
+}
+
+/*
+ * Pre-commit processing for portals.
+ *
+ * Holdable cursors created in this transaction need to be converted to
+ * materialized form, since we are going to close down the executor and
+ * release locks. Non-holdable portals created in this transaction are
+ * simply removed. Portals remaining from prior transactions should be
+ * left untouched.
+ *
+ * Returns true if any portals changed state (possibly causing user-defined
+ * code to be run), false if not.
+ */
+bool
+PreCommit_Portals(bool isPrepare)
+{
+ bool result = false;
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ /*
+ * There should be no pinned portals anymore. Complain if someone
+ * leaked one. Auto-held portals are allowed; we assume that whoever
+ * pinned them is managing them.
+ */
+ if (portal->portalPinned && !portal->autoHeld)
+ elog(ERROR, "cannot commit while a portal is pinned");
+
+ /*
+ * Do not touch active portals --- this can only happen in the case of
+ * a multi-transaction utility command, such as VACUUM, or a commit in
+ * a procedure.
+ *
+ * Note however that any resource owner attached to such a portal is
+ * still going to go away, so don't leave a dangling pointer. Also
+ * unregister any snapshots held by the portal, mainly to avoid
+ * snapshot leak warnings from ResourceOwnerRelease().
+ */
+ if (portal->status == PORTAL_ACTIVE)
+ {
+ if (portal->holdSnapshot)
+ {
+ if (portal->resowner)
+ UnregisterSnapshotFromOwner(portal->holdSnapshot,
+ portal->resowner);
+ portal->holdSnapshot = NULL;
+ }
+ portal->resowner = NULL;
+ /* Clear portalSnapshot too, for cleanliness */
+ portal->portalSnapshot = NULL;
+ continue;
+ }
+
+ /* Is it a holdable portal created in the current xact? */
+ if ((portal->cursorOptions & CURSOR_OPT_HOLD) &&
+ portal->createSubid != InvalidSubTransactionId &&
+ portal->status == PORTAL_READY)
+ {
+ /*
+ * We are exiting the transaction that created a holdable cursor.
+ * Instead of dropping the portal, prepare it for access by later
+ * transactions.
+ *
+ * However, if this is PREPARE TRANSACTION rather than COMMIT,
+ * refuse PREPARE, because the semantics seem pretty unclear.
+ */
+ if (isPrepare)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot PREPARE a transaction that has created a cursor WITH HOLD")));
+
+ HoldPortal(portal);
+
+ /* Report we changed state */
+ result = true;
+ }
+ else if (portal->createSubid == InvalidSubTransactionId)
+ {
+ /*
+ * Do nothing to cursors held over from a previous transaction
+ * (including ones we just froze in a previous cycle of this loop)
+ */
+ continue;
+ }
+ else
+ {
+ /* Zap all non-holdable portals */
+ PortalDrop(portal, true);
+
+ /* Report we changed state */
+ result = true;
+ }
+
+ /*
+ * After either freezing or dropping a portal, we have to restart the
+ * iteration, because we could have invoked user-defined code that
+ * caused a drop of the next portal in the hash chain.
+ */
+ hash_seq_term(&status);
+ hash_seq_init(&status, PortalHashTable);
+ }
+
+ return result;
+}
+
+/*
+ * Abort processing for portals.
+ *
+ * At this point we run the cleanup hook if present, but we can't release the
+ * portal's memory until the cleanup call.
+ */
+void
+AtAbort_Portals(void)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ /*
+ * When elog(FATAL) is progress, we need to set the active portal to
+ * failed, so that PortalCleanup() doesn't run the executor shutdown.
+ */
+ if (portal->status == PORTAL_ACTIVE && shmem_exit_inprogress)
+ MarkPortalFailed(portal);
+
+ /*
+ * Do nothing else to cursors held over from a previous transaction.
+ */
+ if (portal->createSubid == InvalidSubTransactionId)
+ continue;
+
+ /*
+ * Do nothing to auto-held cursors. This is similar to the case of a
+ * cursor from a previous transaction, but it could also be that the
+ * cursor was auto-held in this transaction, so it wants to live on.
+ */
+ if (portal->autoHeld)
+ continue;
+
+ /*
+ * If it was created in the current transaction, we can't do normal
+ * shutdown on a READY portal either; it might refer to objects
+ * created in the failed transaction. See comments in
+ * AtSubAbort_Portals.
+ */
+ if (portal->status == PORTAL_READY)
+ MarkPortalFailed(portal);
+
+ /*
+ * Allow portalcmds.c to clean up the state it knows about, if we
+ * haven't already.
+ */
+ if (PointerIsValid(portal->cleanup))
+ {
+ portal->cleanup(portal);
+ portal->cleanup = NULL;
+ }
+
+ /* drop cached plan reference, if any */
+ PortalReleaseCachedPlan(portal);
+
+ /*
+ * Any resources belonging to the portal will be released in the
+ * upcoming transaction-wide cleanup; they will be gone before we run
+ * PortalDrop.
+ */
+ portal->resowner = NULL;
+
+ /*
+ * Although we can't delete the portal data structure proper, we can
+ * release any memory in subsidiary contexts, such as executor state.
+ * The cleanup hook was the last thing that might have needed data
+ * there. But leave active portals alone.
+ */
+ if (portal->status != PORTAL_ACTIVE)
+ MemoryContextDeleteChildren(portal->portalContext);
+ }
+}
+
+/*
+ * Post-abort cleanup for portals.
+ *
+ * Delete all portals not held over from prior transactions. */
+void
+AtCleanup_Portals(void)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ /*
+ * Do not touch active portals --- this can only happen in the case of
+ * a multi-transaction command.
+ */
+ if (portal->status == PORTAL_ACTIVE)
+ continue;
+
+ /*
+ * Do nothing to cursors held over from a previous transaction or
+ * auto-held ones.
+ */
+ if (portal->createSubid == InvalidSubTransactionId || portal->autoHeld)
+ {
+ Assert(portal->status != PORTAL_ACTIVE);
+ Assert(portal->resowner == NULL);
+ continue;
+ }
+
+ /*
+ * If a portal is still pinned, forcibly unpin it. PortalDrop will not
+ * let us drop the portal otherwise. Whoever pinned the portal was
+ * interrupted by the abort too and won't try to use it anymore.
+ */
+ if (portal->portalPinned)
+ portal->portalPinned = false;
+
+ /*
+ * We had better not call any user-defined code during cleanup, so if
+ * the cleanup hook hasn't been run yet, too bad; we'll just skip it.
+ */
+ if (PointerIsValid(portal->cleanup))
+ {
+ elog(WARNING, "skipping cleanup for portal \"%s\"", portal->name);
+ portal->cleanup = NULL;
+ }
+
+ /* Zap it. */
+ PortalDrop(portal, false);
+ }
+}
+
+/*
+ * Portal-related cleanup when we return to the main loop on error.
+ *
+ * This is different from the cleanup at transaction abort. Auto-held portals
+ * are cleaned up on error but not on transaction abort.
+ */
+void
+PortalErrorCleanup(void)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ if (portal->autoHeld)
+ {
+ portal->portalPinned = false;
+ PortalDrop(portal, false);
+ }
+ }
+}
+
+/*
+ * Pre-subcommit processing for portals.
+ *
+ * Reassign portals created or used in the current subtransaction to the
+ * parent subtransaction.
+ */
+void
+AtSubCommit_Portals(SubTransactionId mySubid,
+ SubTransactionId parentSubid,
+ int parentLevel,
+ ResourceOwner parentXactOwner)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ if (portal->createSubid == mySubid)
+ {
+ portal->createSubid = parentSubid;
+ portal->createLevel = parentLevel;
+ if (portal->resowner)
+ ResourceOwnerNewParent(portal->resowner, parentXactOwner);
+ }
+ if (portal->activeSubid == mySubid)
+ portal->activeSubid = parentSubid;
+ }
+}
+
+/*
+ * Subtransaction abort handling for portals.
+ *
+ * Deactivate portals created or used during the failed subtransaction.
+ * Note that per AtSubCommit_Portals, this will catch portals created/used
+ * in descendants of the subtransaction too.
+ *
+ * We don't destroy any portals here; that's done in AtSubCleanup_Portals.
+ */
+void
+AtSubAbort_Portals(SubTransactionId mySubid,
+ SubTransactionId parentSubid,
+ ResourceOwner myXactOwner,
+ ResourceOwner parentXactOwner)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ /* Was it created in this subtransaction? */
+ if (portal->createSubid != mySubid)
+ {
+ /* No, but maybe it was used in this subtransaction? */
+ if (portal->activeSubid == mySubid)
+ {
+ /* Maintain activeSubid until the portal is removed */
+ portal->activeSubid = parentSubid;
+
+ /*
+ * A MarkPortalActive() caller ran an upper-level portal in
+ * this subtransaction and left the portal ACTIVE. This can't
+ * happen, but force the portal into FAILED state for the same
+ * reasons discussed below.
+ *
+ * We assume we can get away without forcing upper-level READY
+ * portals to fail, even if they were run and then suspended.
+ * In theory a suspended upper-level portal could have
+ * acquired some references to objects that are about to be
+ * destroyed, but there should be sufficient defenses against
+ * such cases: the portal's original query cannot contain such
+ * references, and any references within, say, cached plans of
+ * PL/pgSQL functions are not from active queries and should
+ * be protected by revalidation logic.
+ */
+ if (portal->status == PORTAL_ACTIVE)
+ MarkPortalFailed(portal);
+
+ /*
+ * Also, if we failed it during the current subtransaction
+ * (either just above, or earlier), reattach its resource
+ * owner to the current subtransaction's resource owner, so
+ * that any resources it still holds will be released while
+ * cleaning up this subtransaction. This prevents some corner
+ * cases wherein we might get Asserts or worse while cleaning
+ * up objects created during the current subtransaction
+ * (because they're still referenced within this portal).
+ */
+ if (portal->status == PORTAL_FAILED && portal->resowner)
+ {
+ ResourceOwnerNewParent(portal->resowner, myXactOwner);
+ portal->resowner = NULL;
+ }
+ }
+ /* Done if it wasn't created in this subtransaction */
+ continue;
+ }
+
+ /*
+ * Force any live portals of my own subtransaction into FAILED state.
+ * We have to do this because they might refer to objects created or
+ * changed in the failed subtransaction, leading to crashes within
+ * ExecutorEnd when portalcmds.c tries to close down the portal.
+ * Currently, every MarkPortalActive() caller ensures it updates the
+ * portal status again before relinquishing control, so ACTIVE can't
+ * happen here. If it does happen, dispose the portal like existing
+ * MarkPortalActive() callers would.
+ */
+ if (portal->status == PORTAL_READY ||
+ portal->status == PORTAL_ACTIVE)
+ MarkPortalFailed(portal);
+
+ /*
+ * Allow portalcmds.c to clean up the state it knows about, if we
+ * haven't already.
+ */
+ if (PointerIsValid(portal->cleanup))
+ {
+ portal->cleanup(portal);
+ portal->cleanup = NULL;
+ }
+
+ /* drop cached plan reference, if any */
+ PortalReleaseCachedPlan(portal);
+
+ /*
+ * Any resources belonging to the portal will be released in the
+ * upcoming transaction-wide cleanup; they will be gone before we run
+ * PortalDrop.
+ */
+ portal->resowner = NULL;
+
+ /*
+ * Although we can't delete the portal data structure proper, we can
+ * release any memory in subsidiary contexts, such as executor state.
+ * The cleanup hook was the last thing that might have needed data
+ * there.
+ */
+ MemoryContextDeleteChildren(portal->portalContext);
+ }
+}
+
+/*
+ * Post-subabort cleanup for portals.
+ *
+ * Drop all portals created in the failed subtransaction (but note that
+ * we will not drop any that were reassigned to the parent above).
+ */
+void
+AtSubCleanup_Portals(SubTransactionId mySubid)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ if (portal->createSubid != mySubid)
+ continue;
+
+ /*
+ * If a portal is still pinned, forcibly unpin it. PortalDrop will not
+ * let us drop the portal otherwise. Whoever pinned the portal was
+ * interrupted by the abort too and won't try to use it anymore.
+ */
+ if (portal->portalPinned)
+ portal->portalPinned = false;
+
+ /*
+ * We had better not call any user-defined code during cleanup, so if
+ * the cleanup hook hasn't been run yet, too bad; we'll just skip it.
+ */
+ if (PointerIsValid(portal->cleanup))
+ {
+ elog(WARNING, "skipping cleanup for portal \"%s\"", portal->name);
+ portal->cleanup = NULL;
+ }
+
+ /* Zap it. */
+ PortalDrop(portal, false);
+ }
+}
+
+/* Find all available cursors */
+Datum
+pg_cursor(PG_FUNCTION_ARGS)
+{
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ TupleDesc tupdesc;
+ Tuplestorestate *tupstore;
+ MemoryContext per_query_ctx;
+ MemoryContext oldcontext;
+ HASH_SEQ_STATUS hash_seq;
+ PortalHashEnt *hentry;
+
+ /* check to see if caller supports us returning a tuplestore */
+ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("set-valued function called in context that cannot accept a set")));
+ if (!(rsinfo->allowedModes & SFRM_Materialize))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("materialize mode required, but it is not allowed in this context")));
+
+ /* need to build tuplestore in query context */
+ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+ oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+ /*
+ * build tupdesc for result tuples. This must match the definition of the
+ * pg_cursors view in system_views.sql
+ */
+ tupdesc = CreateTemplateTupleDesc(6);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 1, "name",
+ TEXTOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 2, "statement",
+ TEXTOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 3, "is_holdable",
+ BOOLOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 4, "is_binary",
+ BOOLOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 5, "is_scrollable",
+ BOOLOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 6, "creation_time",
+ TIMESTAMPTZOID, -1, 0);
+
+ /*
+ * We put all the tuples into a tuplestore in one scan of the hashtable.
+ * This avoids any issue of the hashtable possibly changing between calls.
+ */
+ tupstore =
+ tuplestore_begin_heap(rsinfo->allowedModes & SFRM_Materialize_Random,
+ false, work_mem);
+
+ /* generate junk in short-term context */
+ MemoryContextSwitchTo(oldcontext);
+
+ hash_seq_init(&hash_seq, PortalHashTable);
+ while ((hentry = hash_seq_search(&hash_seq)) != NULL)
+ {
+ Portal portal = hentry->portal;
+ Datum values[6];
+ bool nulls[6];
+
+ /* report only "visible" entries */
+ if (!portal->visible)
+ continue;
+
+ MemSet(nulls, 0, sizeof(nulls));
+
+ values[0] = CStringGetTextDatum(portal->name);
+ values[1] = CStringGetTextDatum(portal->sourceText);
+ values[2] = BoolGetDatum(portal->cursorOptions & CURSOR_OPT_HOLD);
+ values[3] = BoolGetDatum(portal->cursorOptions & CURSOR_OPT_BINARY);
+ values[4] = BoolGetDatum(portal->cursorOptions & CURSOR_OPT_SCROLL);
+ values[5] = TimestampTzGetDatum(portal->creation_time);
+
+ tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+ }
+
+ /* clean up and return the tuplestore */
+ tuplestore_donestoring(tupstore);
+
+ rsinfo->returnMode = SFRM_Materialize;
+ rsinfo->setResult = tupstore;
+ rsinfo->setDesc = tupdesc;
+
+ return (Datum) 0;
+}
+
+bool
+ThereAreNoReadyPortals(void)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ if (portal->status == PORTAL_READY)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Hold all pinned portals.
+ *
+ * When initiating a COMMIT or ROLLBACK inside a procedure, this must be
+ * called to protect internally-generated cursors from being dropped during
+ * the transaction shutdown. Currently, SPI calls this automatically; PLs
+ * that initiate COMMIT or ROLLBACK some other way are on the hook to do it
+ * themselves. (Note that we couldn't do this in, say, AtAbort_Portals
+ * because we need to run user-defined code while persisting a portal.
+ * It's too late to do that once transaction abort has started.)
+ *
+ * We protect such portals by converting them to held cursors. We mark them
+ * as "auto-held" so that exception exit knows to clean them up. (In normal,
+ * non-exception code paths, the PL needs to clean such portals itself, since
+ * transaction end won't do it anymore; but that should be normal practice
+ * anyway.)
+ */
+void
+HoldPinnedPortals(void)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ if (portal->portalPinned && !portal->autoHeld)
+ {
+ /*
+ * Doing transaction control, especially abort, inside a cursor
+ * loop that is not read-only, for example using UPDATE ...
+ * RETURNING, has weird semantics issues. Also, this
+ * implementation wouldn't work, because such portals cannot be
+ * held. (The core grammar enforces that only SELECT statements
+ * can drive a cursor, but for example PL/pgSQL does not restrict
+ * it.)
+ */
+ if (portal->strategy != PORTAL_ONE_SELECT)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot perform transaction commands inside a cursor loop that is not read-only")));
+
+ /* Verify it's in a suitable state to be held */
+ if (portal->status != PORTAL_READY)
+ elog(ERROR, "pinned portal is not ready to be auto-held");
+
+ HoldPortal(portal);
+ portal->autoHeld = true;
+ }
+ }
+}
+
+/*
+ * Drop the outer active snapshots for all portals, so that no snapshots
+ * remain active.
+ *
+ * Like HoldPinnedPortals, this must be called when initiating a COMMIT or
+ * ROLLBACK inside a procedure. This has to be separate from that since it
+ * should not be run until we're done with steps that are likely to fail.
+ *
+ * It's tempting to fold this into PreCommit_Portals, but to do so, we'd
+ * need to clean up snapshot management in VACUUM and perhaps other places.
+ */
+void
+ForgetPortalSnapshots(void)
+{
+ HASH_SEQ_STATUS status;
+ PortalHashEnt *hentry;
+ int numPortalSnaps = 0;
+ int numActiveSnaps = 0;
+
+ /* First, scan PortalHashTable and clear portalSnapshot fields */
+ hash_seq_init(&status, PortalHashTable);
+
+ while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+ {
+ Portal portal = hentry->portal;
+
+ if (portal->portalSnapshot != NULL)
+ {
+ portal->portalSnapshot = NULL;
+ numPortalSnaps++;
+ }
+ /* portal->holdSnapshot will be cleaned up in PreCommit_Portals */
+ }
+
+ /*
+ * Now, pop all the active snapshots, which should be just those that were
+ * portal snapshots. Ideally we'd drive this directly off the portal
+ * scan, but there's no good way to visit the portals in the correct
+ * order. So just cross-check after the fact.
+ */
+ while (ActiveSnapshotSet())
+ {
+ PopActiveSnapshot();
+ numActiveSnaps++;
+ }
+
+ if (numPortalSnaps != numActiveSnaps)
+ elog(ERROR, "portal snapshots (%d) did not account for all active snapshots (%d)",
+ numPortalSnaps, numActiveSnaps);
+}
diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c
new file mode 100644
index 0000000..553dd7f
--- /dev/null
+++ b/src/backend/utils/mmgr/slab.c
@@ -0,0 +1,794 @@
+/*-------------------------------------------------------------------------
+ *
+ * slab.c
+ * SLAB allocator definitions.
+ *
+ * SLAB is a MemoryContext implementation designed for cases where large
+ * numbers of equally-sized objects are allocated (and freed).
+ *
+ *
+ * Portions Copyright (c) 2017-2021, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/mmgr/slab.c
+ *
+ *
+ * NOTE:
+ * The constant allocation size allows significant simplification and various
+ * optimizations over more general purpose allocators. The blocks are carved
+ * into chunks of exactly the right size (plus alignment), not wasting any
+ * memory.
+ *
+ * The information about free chunks is maintained both at the block level and
+ * global (context) level. This is possible as the chunk size (and thus also
+ * the number of chunks per block) is fixed.
+ *
+ * On each block, free chunks are tracked in a simple linked list. Contents
+ * of free chunks is replaced with an index of the next free chunk, forming
+ * a very simple linked list. Each block also contains a counter of free
+ * chunks. Combined with the local block-level freelist, it makes it trivial
+ * to eventually free the whole block.
+ *
+ * At the context level, we use 'freelist' to track blocks ordered by number
+ * of free chunks, starting with blocks having a single allocated chunk, and
+ * with completely full blocks on the tail.
+ *
+ * This also allows various optimizations - for example when searching for
+ * free chunk, the allocator reuses space from the fullest blocks first, in
+ * the hope that some of the less full blocks will get completely empty (and
+ * returned back to the OS).
+ *
+ * For each block, we maintain pointer to the first free chunk - this is quite
+ * cheap and allows us to skip all the preceding used chunks, eliminating
+ * a significant number of lookups in many common usage patterns. In the worst
+ * case this performs as if the pointer was not maintained.
+ *
+ * We cache the freelist index for the blocks with the fewest free chunks
+ * (minFreeChunks), so that we don't have to search the freelist on every
+ * SlabAlloc() call, which is quite expensive.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "lib/ilist.h"
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+
+/*
+ * SlabContext is a specialized implementation of MemoryContext.
+ */
+typedef struct SlabContext
+{
+ MemoryContextData header; /* Standard memory-context fields */
+ /* Allocation parameters for this context: */
+ Size chunkSize; /* chunk size */
+ Size fullChunkSize; /* chunk size including header and alignment */
+ Size blockSize; /* block size */
+ Size headerSize; /* allocated size of context header */
+ int chunksPerBlock; /* number of chunks per block */
+ int minFreeChunks; /* min number of free chunks in any block */
+ int nblocks; /* number of blocks allocated */
+#ifdef MEMORY_CONTEXT_CHECKING
+ bool *freechunks; /* bitmap of free chunks in a block */
+#endif
+ /* blocks with free space, grouped by number of free chunks: */
+ dlist_head freelist[FLEXIBLE_ARRAY_MEMBER];
+} SlabContext;
+
+/*
+ * SlabBlock
+ * Structure of a single block in SLAB allocator.
+ *
+ * node: doubly-linked list of blocks in global freelist
+ * nfree: number of free chunks in this block
+ * firstFreeChunk: index of the first free chunk
+ */
+typedef struct SlabBlock
+{
+ dlist_node node; /* doubly-linked list */
+ int nfree; /* number of free chunks */
+ int firstFreeChunk; /* index of the first free chunk in the block */
+} SlabBlock;
+
+/*
+ * SlabChunk
+ * The prefix of each piece of memory in a SlabBlock
+ *
+ * Note: to meet the memory context APIs, the payload area of the chunk must
+ * be maxaligned, and the "slab" link must be immediately adjacent to the
+ * payload area (cf. GetMemoryChunkContext). Since we support no machines on
+ * which MAXALIGN is more than twice sizeof(void *), this happens without any
+ * special hacking in this struct declaration. But there is a static
+ * assertion below that the alignment is done correctly.
+ */
+typedef struct SlabChunk
+{
+ SlabBlock *block; /* block owning this chunk */
+ SlabContext *slab; /* owning context */
+ /* there must not be any padding to reach a MAXALIGN boundary here! */
+} SlabChunk;
+
+
+#define SlabPointerGetChunk(ptr) \
+ ((SlabChunk *)(((char *)(ptr)) - sizeof(SlabChunk)))
+#define SlabChunkGetPointer(chk) \
+ ((void *)(((char *)(chk)) + sizeof(SlabChunk)))
+#define SlabBlockGetChunk(slab, block, idx) \
+ ((SlabChunk *) ((char *) (block) + sizeof(SlabBlock) \
+ + (idx * slab->fullChunkSize)))
+#define SlabBlockStart(block) \
+ ((char *) block + sizeof(SlabBlock))
+#define SlabChunkIndex(slab, block, chunk) \
+ (((char *) chunk - SlabBlockStart(block)) / slab->fullChunkSize)
+
+/*
+ * These functions implement the MemoryContext API for Slab contexts.
+ */
+static void *SlabAlloc(MemoryContext context, Size size);
+static void SlabFree(MemoryContext context, void *pointer);
+static void *SlabRealloc(MemoryContext context, void *pointer, Size size);
+static void SlabReset(MemoryContext context);
+static void SlabDelete(MemoryContext context);
+static Size SlabGetChunkSpace(MemoryContext context, void *pointer);
+static bool SlabIsEmpty(MemoryContext context);
+static void SlabStats(MemoryContext context,
+ MemoryStatsPrintFunc printfunc, void *passthru,
+ MemoryContextCounters *totals,
+ bool print_to_stderr);
+#ifdef MEMORY_CONTEXT_CHECKING
+static void SlabCheck(MemoryContext context);
+#endif
+
+/*
+ * This is the virtual function table for Slab contexts.
+ */
+static const MemoryContextMethods SlabMethods = {
+ SlabAlloc,
+ SlabFree,
+ SlabRealloc,
+ SlabReset,
+ SlabDelete,
+ SlabGetChunkSpace,
+ SlabIsEmpty,
+ SlabStats
+#ifdef MEMORY_CONTEXT_CHECKING
+ ,SlabCheck
+#endif
+};
+
+
+/*
+ * SlabContextCreate
+ * Create a new Slab context.
+ *
+ * parent: parent context, or NULL if top-level context
+ * name: name of context (must be statically allocated)
+ * blockSize: allocation block size
+ * chunkSize: allocation chunk size
+ *
+ * The chunkSize may not exceed:
+ * MAXALIGN_DOWN(SIZE_MAX) - MAXALIGN(sizeof(SlabBlock)) - sizeof(SlabChunk)
+ */
+MemoryContext
+SlabContextCreate(MemoryContext parent,
+ const char *name,
+ Size blockSize,
+ Size chunkSize)
+{
+ int chunksPerBlock;
+ Size fullChunkSize;
+ Size freelistSize;
+ Size headerSize;
+ SlabContext *slab;
+ int i;
+
+ /* Assert we padded SlabChunk properly */
+ StaticAssertStmt(sizeof(SlabChunk) == MAXALIGN(sizeof(SlabChunk)),
+ "sizeof(SlabChunk) is not maxaligned");
+ StaticAssertStmt(offsetof(SlabChunk, slab) + sizeof(MemoryContext) ==
+ sizeof(SlabChunk),
+ "padding calculation in SlabChunk is wrong");
+
+ /* Make sure the linked list node fits inside a freed chunk */
+ if (chunkSize < sizeof(int))
+ chunkSize = sizeof(int);
+
+ /* chunk, including SLAB header (both addresses nicely aligned) */
+ fullChunkSize = sizeof(SlabChunk) + MAXALIGN(chunkSize);
+
+ /* Make sure the block can store at least one chunk. */
+ if (blockSize < fullChunkSize + sizeof(SlabBlock))
+ elog(ERROR, "block size %zu for slab is too small for %zu chunks",
+ blockSize, chunkSize);
+
+ /* Compute maximum number of chunks per block */
+ chunksPerBlock = (blockSize - sizeof(SlabBlock)) / fullChunkSize;
+
+ /* The freelist starts with 0, ends with chunksPerBlock. */
+ freelistSize = sizeof(dlist_head) * (chunksPerBlock + 1);
+
+ /*
+ * Allocate the context header. Unlike aset.c, we never try to combine
+ * this with the first regular block; not worth the extra complication.
+ */
+
+ /* Size of the memory context header */
+ headerSize = offsetof(SlabContext, freelist) + freelistSize;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+ /*
+ * With memory checking, we need to allocate extra space for the bitmap of
+ * free chunks. The bitmap is an array of bools, so we don't need to worry
+ * about alignment.
+ */
+ headerSize += chunksPerBlock * sizeof(bool);
+#endif
+
+ slab = (SlabContext *) malloc(headerSize);
+ if (slab == NULL)
+ {
+ MemoryContextStats(TopMemoryContext);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory"),
+ errdetail("Failed while creating memory context \"%s\".",
+ name)));
+ }
+
+ /*
+ * Avoid writing code that can fail between here and MemoryContextCreate;
+ * we'd leak the header if we ereport in this stretch.
+ */
+
+ /* Fill in SlabContext-specific header fields */
+ slab->chunkSize = chunkSize;
+ slab->fullChunkSize = fullChunkSize;
+ slab->blockSize = blockSize;
+ slab->headerSize = headerSize;
+ slab->chunksPerBlock = chunksPerBlock;
+ slab->minFreeChunks = 0;
+ slab->nblocks = 0;
+
+ /* initialize the freelist slots */
+ for (i = 0; i < (slab->chunksPerBlock + 1); i++)
+ dlist_init(&slab->freelist[i]);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* set the freechunks pointer right after the freelists array */
+ slab->freechunks
+ = (bool *) slab + offsetof(SlabContext, freelist) + freelistSize;
+#endif
+
+ /* Finally, do the type-independent part of context creation */
+ MemoryContextCreate((MemoryContext) slab,
+ T_SlabContext,
+ &SlabMethods,
+ parent,
+ name);
+
+ return (MemoryContext) slab;
+}
+
+/*
+ * SlabReset
+ * Frees all memory which is allocated in the given set.
+ *
+ * The code simply frees all the blocks in the context - we don't keep any
+ * keeper blocks or anything like that.
+ */
+static void
+SlabReset(MemoryContext context)
+{
+ int i;
+ SlabContext *slab = castNode(SlabContext, context);
+
+ Assert(slab);
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Check for corruption and leaks before freeing */
+ SlabCheck(context);
+#endif
+
+ /* walk over freelists and free the blocks */
+ for (i = 0; i <= slab->chunksPerBlock; i++)
+ {
+ dlist_mutable_iter miter;
+
+ dlist_foreach_modify(miter, &slab->freelist[i])
+ {
+ SlabBlock *block = dlist_container(SlabBlock, node, miter.cur);
+
+ dlist_delete(miter.cur);
+
+#ifdef CLOBBER_FREED_MEMORY
+ wipe_mem(block, slab->blockSize);
+#endif
+ free(block);
+ slab->nblocks--;
+ context->mem_allocated -= slab->blockSize;
+ }
+ }
+
+ slab->minFreeChunks = 0;
+
+ Assert(slab->nblocks == 0);
+ Assert(context->mem_allocated == 0);
+}
+
+/*
+ * SlabDelete
+ * Free all memory which is allocated in the given context.
+ */
+static void
+SlabDelete(MemoryContext context)
+{
+ /* Reset to release all the SlabBlocks */
+ SlabReset(context);
+ /* And free the context header */
+ free(context);
+}
+
+/*
+ * SlabAlloc
+ * Returns pointer to allocated memory of given size or NULL if
+ * request could not be completed; memory is added to the slab.
+ */
+static void *
+SlabAlloc(MemoryContext context, Size size)
+{
+ SlabContext *slab = castNode(SlabContext, context);
+ SlabBlock *block;
+ SlabChunk *chunk;
+ int idx;
+
+ Assert(slab);
+
+ Assert((slab->minFreeChunks >= 0) &&
+ (slab->minFreeChunks < slab->chunksPerBlock));
+
+ /* make sure we only allow correct request size */
+ if (size != slab->chunkSize)
+ elog(ERROR, "unexpected alloc chunk size %zu (expected %zu)",
+ size, slab->chunkSize);
+
+ /*
+ * If there are no free chunks in any existing block, create a new block
+ * and put it to the last freelist bucket.
+ *
+ * slab->minFreeChunks == 0 means there are no blocks with free chunks,
+ * thanks to how minFreeChunks is updated at the end of SlabAlloc().
+ */
+ if (slab->minFreeChunks == 0)
+ {
+ block = (SlabBlock *) malloc(slab->blockSize);
+
+ if (block == NULL)
+ return NULL;
+
+ block->nfree = slab->chunksPerBlock;
+ block->firstFreeChunk = 0;
+
+ /*
+ * Put all the chunks on a freelist. Walk the chunks and point each
+ * one to the next one.
+ */
+ for (idx = 0; idx < slab->chunksPerBlock; idx++)
+ {
+ chunk = SlabBlockGetChunk(slab, block, idx);
+ *(int32 *) SlabChunkGetPointer(chunk) = (idx + 1);
+ }
+
+ /*
+ * And add it to the last freelist with all chunks empty.
+ *
+ * We know there are no blocks in the freelist, otherwise we wouldn't
+ * need a new block.
+ */
+ Assert(dlist_is_empty(&slab->freelist[slab->chunksPerBlock]));
+
+ dlist_push_head(&slab->freelist[slab->chunksPerBlock], &block->node);
+
+ slab->minFreeChunks = slab->chunksPerBlock;
+ slab->nblocks += 1;
+ context->mem_allocated += slab->blockSize;
+ }
+
+ /* grab the block from the freelist (even the new block is there) */
+ block = dlist_head_element(SlabBlock, node,
+ &slab->freelist[slab->minFreeChunks]);
+
+ /* make sure we actually got a valid block, with matching nfree */
+ Assert(block != NULL);
+ Assert(slab->minFreeChunks == block->nfree);
+ Assert(block->nfree > 0);
+
+ /* we know index of the first free chunk in the block */
+ idx = block->firstFreeChunk;
+
+ /* make sure the chunk index is valid, and that it's marked as empty */
+ Assert((idx >= 0) && (idx < slab->chunksPerBlock));
+
+ /* compute the chunk location block start (after the block header) */
+ chunk = SlabBlockGetChunk(slab, block, idx);
+
+ /*
+ * Update the block nfree count, and also the minFreeChunks as we've
+ * decreased nfree for a block with the minimum number of free chunks
+ * (because that's how we chose the block).
+ */
+ block->nfree--;
+ slab->minFreeChunks = block->nfree;
+
+ /*
+ * Remove the chunk from the freelist head. The index of the next free
+ * chunk is stored in the chunk itself.
+ */
+ VALGRIND_MAKE_MEM_DEFINED(SlabChunkGetPointer(chunk), sizeof(int32));
+ block->firstFreeChunk = *(int32 *) SlabChunkGetPointer(chunk);
+
+ Assert(block->firstFreeChunk >= 0);
+ Assert(block->firstFreeChunk <= slab->chunksPerBlock);
+
+ Assert((block->nfree != 0 &&
+ block->firstFreeChunk < slab->chunksPerBlock) ||
+ (block->nfree == 0 &&
+ block->firstFreeChunk == slab->chunksPerBlock));
+
+ /* move the whole block to the right place in the freelist */
+ dlist_delete(&block->node);
+ dlist_push_head(&slab->freelist[block->nfree], &block->node);
+
+ /*
+ * And finally update minFreeChunks, i.e. the index to the block with the
+ * lowest number of free chunks. We only need to do that when the block
+ * got full (otherwise we know the current block is the right one). We'll
+ * simply walk the freelist until we find a non-empty entry.
+ */
+ if (slab->minFreeChunks == 0)
+ {
+ for (idx = 1; idx <= slab->chunksPerBlock; idx++)
+ {
+ if (dlist_is_empty(&slab->freelist[idx]))
+ continue;
+
+ /* found a non-empty freelist */
+ slab->minFreeChunks = idx;
+ break;
+ }
+ }
+
+ if (slab->minFreeChunks == slab->chunksPerBlock)
+ slab->minFreeChunks = 0;
+
+ /* Prepare to initialize the chunk header. */
+ VALGRIND_MAKE_MEM_UNDEFINED(chunk, sizeof(SlabChunk));
+
+ chunk->block = block;
+ chunk->slab = slab;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* slab mark to catch clobber of "unused" space */
+ if (slab->chunkSize < (slab->fullChunkSize - sizeof(SlabChunk)))
+ {
+ set_sentinel(SlabChunkGetPointer(chunk), size);
+ VALGRIND_MAKE_MEM_NOACCESS(((char *) chunk) +
+ sizeof(SlabChunk) + slab->chunkSize,
+ slab->fullChunkSize -
+ (slab->chunkSize + sizeof(SlabChunk)));
+ }
+#endif
+#ifdef RANDOMIZE_ALLOCATED_MEMORY
+ /* fill the allocated space with junk */
+ randomize_mem((char *) SlabChunkGetPointer(chunk), size);
+#endif
+
+ Assert(slab->nblocks * slab->blockSize == context->mem_allocated);
+
+ return SlabChunkGetPointer(chunk);
+}
+
+/*
+ * SlabFree
+ * Frees allocated memory; memory is removed from the slab.
+ */
+static void
+SlabFree(MemoryContext context, void *pointer)
+{
+ int idx;
+ SlabContext *slab = castNode(SlabContext, context);
+ SlabChunk *chunk = SlabPointerGetChunk(pointer);
+ SlabBlock *block = chunk->block;
+
+#ifdef MEMORY_CONTEXT_CHECKING
+ /* Test for someone scribbling on unused space in chunk */
+ if (slab->chunkSize < (slab->fullChunkSize - sizeof(SlabChunk)))
+ if (!sentinel_ok(pointer, slab->chunkSize))
+ elog(WARNING, "detected write past chunk end in %s %p",
+ slab->header.name, chunk);
+#endif
+
+ /* compute index of the chunk with respect to block start */
+ idx = SlabChunkIndex(slab, block, chunk);
+
+ /* add chunk to freelist, and update block nfree count */
+ *(int32 *) pointer = block->firstFreeChunk;
+ block->firstFreeChunk = idx;
+ block->nfree++;
+
+ Assert(block->nfree > 0);
+ Assert(block->nfree <= slab->chunksPerBlock);
+
+#ifdef CLOBBER_FREED_MEMORY
+ /* XXX don't wipe the int32 index, used for block-level freelist */
+ wipe_mem((char *) pointer + sizeof(int32),
+ slab->chunkSize - sizeof(int32));
+#endif
+
+ /* remove the block from a freelist */
+ dlist_delete(&block->node);
+
+ /*
+ * See if we need to update the minFreeChunks field for the slab - we only
+ * need to do that if there the block had that number of free chunks
+ * before we freed one. In that case, we check if there still are blocks
+ * in the original freelist and we either keep the current value (if there
+ * still are blocks) or increment it by one (the new block is still the
+ * one with minimum free chunks).
+ *
+ * The one exception is when the block will get completely free - in that
+ * case we will free it, se we can't use it for minFreeChunks. It however
+ * means there are no more blocks with free chunks.
+ */
+ if (slab->minFreeChunks == (block->nfree - 1))
+ {
+ /* Have we removed the last chunk from the freelist? */
+ if (dlist_is_empty(&slab->freelist[slab->minFreeChunks]))
+ {
+ /* but if we made the block entirely free, we'll free it */
+ if (block->nfree == slab->chunksPerBlock)
+ slab->minFreeChunks = 0;
+ else
+ slab->minFreeChunks++;
+ }
+ }
+
+ /* If the block is now completely empty, free it. */
+ if (block->nfree == slab->chunksPerBlock)
+ {
+ free(block);
+ slab->nblocks--;
+ context->mem_allocated -= slab->blockSize;
+ }
+ else
+ dlist_push_head(&slab->freelist[block->nfree], &block->node);
+
+ Assert(slab->nblocks >= 0);
+ Assert(slab->nblocks * slab->blockSize == context->mem_allocated);
+}
+
+/*
+ * SlabRealloc
+ * Change the allocated size of a chunk.
+ *
+ * As Slab is designed for allocating equally-sized chunks of memory, it can't
+ * do an actual chunk size change. We try to be gentle and allow calls with
+ * exactly the same size, as in that case we can simply return the same
+ * chunk. When the size differs, we throw an error.
+ *
+ * We could also allow requests with size < chunkSize. That however seems
+ * rather pointless - Slab is meant for chunks of constant size, and moreover
+ * realloc is usually used to enlarge the chunk.
+ */
+static void *
+SlabRealloc(MemoryContext context, void *pointer, Size size)
+{
+ SlabContext *slab = castNode(SlabContext, context);
+
+ Assert(slab);
+
+ /* can't do actual realloc with slab, but let's try to be gentle */
+ if (size == slab->chunkSize)
+ return pointer;
+
+ elog(ERROR, "slab allocator does not support realloc()");
+ return NULL; /* keep compiler quiet */
+}
+
+/*
+ * SlabGetChunkSpace
+ * Given a currently-allocated chunk, determine the total space
+ * it occupies (including all memory-allocation overhead).
+ */
+static Size
+SlabGetChunkSpace(MemoryContext context, void *pointer)
+{
+ SlabContext *slab = castNode(SlabContext, context);
+
+ Assert(slab);
+
+ return slab->fullChunkSize;
+}
+
+/*
+ * SlabIsEmpty
+ * Is an Slab empty of any allocated space?
+ */
+static bool
+SlabIsEmpty(MemoryContext context)
+{
+ SlabContext *slab = castNode(SlabContext, context);
+
+ Assert(slab);
+
+ return (slab->nblocks == 0);
+}
+
+/*
+ * SlabStats
+ * Compute stats about memory consumption of a Slab context.
+ *
+ * printfunc: if not NULL, pass a human-readable stats string to this.
+ * passthru: pass this pointer through to printfunc.
+ * totals: if not NULL, add stats about this context into *totals.
+ * print_to_stderr: print stats to stderr if true, elog otherwise.
+ */
+static void
+SlabStats(MemoryContext context,
+ MemoryStatsPrintFunc printfunc, void *passthru,
+ MemoryContextCounters *totals,
+ bool print_to_stderr)
+{
+ SlabContext *slab = castNode(SlabContext, context);
+ Size nblocks = 0;
+ Size freechunks = 0;
+ Size totalspace;
+ Size freespace = 0;
+ int i;
+
+ /* Include context header in totalspace */
+ totalspace = slab->headerSize;
+
+ for (i = 0; i <= slab->chunksPerBlock; i++)
+ {
+ dlist_iter iter;
+
+ dlist_foreach(iter, &slab->freelist[i])
+ {
+ SlabBlock *block = dlist_container(SlabBlock, node, iter.cur);
+
+ nblocks++;
+ totalspace += slab->blockSize;
+ freespace += slab->fullChunkSize * block->nfree;
+ freechunks += block->nfree;
+ }
+ }
+
+ if (printfunc)
+ {
+ char stats_string[200];
+
+ snprintf(stats_string, sizeof(stats_string),
+ "%zu total in %zd blocks; %zu free (%zd chunks); %zu used",
+ totalspace, nblocks, freespace, freechunks,
+ totalspace - freespace);
+ printfunc(context, passthru, stats_string, print_to_stderr);
+ }
+
+ if (totals)
+ {
+ totals->nblocks += nblocks;
+ totals->freechunks += freechunks;
+ totals->totalspace += totalspace;
+ totals->freespace += freespace;
+ }
+}
+
+
+#ifdef MEMORY_CONTEXT_CHECKING
+
+/*
+ * SlabCheck
+ * Walk through chunks and check consistency of memory.
+ *
+ * NOTE: report errors as WARNING, *not* ERROR or FATAL. Otherwise you'll
+ * find yourself in an infinite loop when trouble occurs, because this
+ * routine will be entered again when elog cleanup tries to release memory!
+ */
+static void
+SlabCheck(MemoryContext context)
+{
+ int i;
+ SlabContext *slab = castNode(SlabContext, context);
+ const char *name = slab->header.name;
+
+ Assert(slab);
+ Assert(slab->chunksPerBlock > 0);
+
+ /* walk all the freelists */
+ for (i = 0; i <= slab->chunksPerBlock; i++)
+ {
+ int j,
+ nfree;
+ dlist_iter iter;
+
+ /* walk all blocks on this freelist */
+ dlist_foreach(iter, &slab->freelist[i])
+ {
+ int idx;
+ SlabBlock *block = dlist_container(SlabBlock, node, iter.cur);
+
+ /*
+ * Make sure the number of free chunks (in the block header)
+ * matches position in the freelist.
+ */
+ if (block->nfree != i)
+ elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match freelist %d",
+ name, block->nfree, block, i);
+
+ /* reset the bitmap of free chunks for this block */
+ memset(slab->freechunks, 0, (slab->chunksPerBlock * sizeof(bool)));
+ idx = block->firstFreeChunk;
+
+ /*
+ * Now walk through the chunks, count the free ones and also
+ * perform some additional checks for the used ones. As the chunk
+ * freelist is stored within the chunks themselves, we have to
+ * walk through the chunks and construct our own bitmap.
+ */
+
+ nfree = 0;
+ while (idx < slab->chunksPerBlock)
+ {
+ SlabChunk *chunk;
+
+ /* count the chunk as free, add it to the bitmap */
+ nfree++;
+ slab->freechunks[idx] = true;
+
+ /* read index of the next free chunk */
+ chunk = SlabBlockGetChunk(slab, block, idx);
+ VALGRIND_MAKE_MEM_DEFINED(SlabChunkGetPointer(chunk), sizeof(int32));
+ idx = *(int32 *) SlabChunkGetPointer(chunk);
+ }
+
+ for (j = 0; j < slab->chunksPerBlock; j++)
+ {
+ /* non-zero bit in the bitmap means chunk the chunk is used */
+ if (!slab->freechunks[j])
+ {
+ SlabChunk *chunk = SlabBlockGetChunk(slab, block, j);
+
+ /* chunks have both block and slab pointers, so check both */
+ if (chunk->block != block)
+ elog(WARNING, "problem in slab %s: bogus block link in block %p, chunk %p",
+ name, block, chunk);
+
+ if (chunk->slab != slab)
+ elog(WARNING, "problem in slab %s: bogus slab link in block %p, chunk %p",
+ name, block, chunk);
+
+ /* there might be sentinel (thanks to alignment) */
+ if (slab->chunkSize < (slab->fullChunkSize - sizeof(SlabChunk)))
+ if (!sentinel_ok(chunk, slab->chunkSize))
+ elog(WARNING, "problem in slab %s: detected write past chunk end in block %p, chunk %p",
+ name, block, chunk);
+ }
+ }
+
+ /*
+ * Make sure we got the expected number of free chunks (as tracked
+ * in the block header).
+ */
+ if (nfree != block->nfree)
+ elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match bitmap %d",
+ name, block->nfree, block, nfree);
+ }
+ }
+
+ Assert(slab->nblocks * slab->blockSize == context->mem_allocated);
+}
+
+#endif /* MEMORY_CONTEXT_CHECKING */