author      Daniel Baumann <daniel.baumann@progress-linux.org>   2024-05-04 12:15:05 +0000
committer   Daniel Baumann <daniel.baumann@progress-linux.org>   2024-05-04 12:15:05 +0000
commit      46651ce6fe013220ed397add242004d764fc0153 (patch)
tree        6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/utils/mmgr
parent      Initial commit. (diff)
download    postgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz
            postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip
Adding upstream version 14.5.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/utils/mmgr')
-rw-r--r--   src/backend/utils/mmgr/Makefile        25
-rw-r--r--   src/backend/utils/mmgr/README         487
-rw-r--r--   src/backend/utils/mmgr/aset.c        1520
-rw-r--r--   src/backend/utils/mmgr/dsa.c         2287
-rw-r--r--   src/backend/utils/mmgr/freepage.c    1886
-rw-r--r--   src/backend/utils/mmgr/generation.c   832
-rw-r--r--   src/backend/utils/mmgr/mcxt.c        1341
-rw-r--r--   src/backend/utils/mmgr/memdebug.c      93
-rw-r--r--   src/backend/utils/mmgr/portalmem.c   1340
-rw-r--r--   src/backend/utils/mmgr/slab.c         794
10 files changed, 10605 insertions, 0 deletions
diff --git a/src/backend/utils/mmgr/Makefile b/src/backend/utils/mmgr/Makefile new file mode 100644 index 0000000..3b4cfdb --- /dev/null +++ b/src/backend/utils/mmgr/Makefile @@ -0,0 +1,25 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for utils/mmgr +# +# IDENTIFICATION +# src/backend/utils/mmgr/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/utils/mmgr +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + aset.o \ + dsa.o \ + freepage.o \ + generation.o \ + mcxt.o \ + memdebug.o \ + portalmem.o \ + slab.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/utils/mmgr/README b/src/backend/utils/mmgr/README new file mode 100644 index 0000000..221b4bd --- /dev/null +++ b/src/backend/utils/mmgr/README @@ -0,0 +1,487 @@ +src/backend/utils/mmgr/README + +Memory Context System Design Overview +===================================== + +Background +---------- + +We do most of our memory allocation in "memory contexts", which are usually +AllocSets as implemented by src/backend/utils/mmgr/aset.c. The key to +successful memory management without lots of overhead is to define a useful +set of contexts with appropriate lifespans. + +The basic operations on a memory context are: + +* create a context + +* allocate a chunk of memory within a context (equivalent of standard + C library's malloc()) + +* delete a context (including freeing all the memory allocated therein) + +* reset a context (free all memory allocated in the context, but not the + context object itself) + +* inquire about the total amount of memory allocated to the context + (the raw memory from which the context allocates chunks; not the + chunks themselves) + +Given a chunk of memory previously allocated from a context, one can +free it or reallocate it larger or smaller (corresponding to standard C +library's free() and realloc() routines). These operations return memory +to or get more memory from the same context the chunk was originally +allocated in. + +At all times there is a "current" context denoted by the +CurrentMemoryContext global variable. palloc() implicitly allocates space +in that context. The MemoryContextSwitchTo() operation selects a new current +context (and returns the previous context, so that the caller can restore the +previous context before exiting). + +The main advantage of memory contexts over plain use of malloc/free is +that the entire contents of a memory context can be freed easily, without +having to request freeing of each individual chunk within it. This is +both faster and more reliable than per-chunk bookkeeping. We use this +fact to clean up at transaction end: by resetting all the active contexts +of transaction or shorter lifespan, we can reclaim all transient memory. +Similarly, we can clean up at the end of each query, or after each tuple +is processed during a query. + + +Some Notes About the palloc API Versus Standard C Library +--------------------------------------------------------- + +The behavior of palloc and friends is similar to the standard C library's +malloc and friends, but there are some deliberate differences too. Here +are some notes to clarify the behavior. + +* If out of memory, palloc and repalloc exit via elog(ERROR). They +never return NULL, and it is not necessary or useful to test for such +a result. With palloc_extended() that behavior can be overridden +using the MCXT_ALLOC_NO_OOM flag. 
+ +* palloc(0) is explicitly a valid operation. It does not return a NULL +pointer, but a valid chunk of which no bytes may be used. However, the +chunk might later be repalloc'd larger; it can also be pfree'd without +error. Similarly, repalloc allows realloc'ing to zero size. + +* pfree and repalloc do not accept a NULL pointer. This is intentional. + + +The Current Memory Context +-------------------------- + +Because it would be too much notational overhead to always pass an +appropriate memory context to called routines, there always exists the +notion of the current memory context CurrentMemoryContext. Without it, +for example, the copyObject routines would need to be passed a context, as +would function execution routines that return a pass-by-reference +datatype. Similarly for routines that temporarily allocate space +internally, but don't return it to their caller? We certainly don't +want to clutter every call in the system with "here is a context to +use for any temporary memory allocation you might want to do". + +The upshot of that reasoning, though, is that CurrentMemoryContext should +generally point at a short-lifespan context if at all possible. During +query execution it usually points to a context that gets reset after each +tuple. Only in *very* circumscribed code should it ever point at a +context having greater than transaction lifespan, since doing so risks +permanent memory leaks. + + +pfree/repalloc Do Not Depend On CurrentMemoryContext +---------------------------------------------------- + +pfree() and repalloc() can be applied to any chunk whether it belongs +to CurrentMemoryContext or not --- the chunk's owning context will be +invoked to handle the operation, regardless. + + +"Parent" and "Child" Contexts +----------------------------- + +If all contexts were independent, it'd be hard to keep track of them, +especially in error cases. That is solved by creating a tree of +"parent" and "child" contexts. When creating a memory context, the +new context can be specified to be a child of some existing context. +A context can have many children, but only one parent. In this way +the contexts form a forest (not necessarily a single tree, since there +could be more than one top-level context; although in current practice +there is only one top context, TopMemoryContext). + +Deleting a context deletes all its direct and indirect children as +well. When resetting a context it's almost always more useful to +delete child contexts, thus MemoryContextReset() means that, and if +you really do want a tree of empty contexts you need to call +MemoryContextResetOnly() plus MemoryContextResetChildren(). + +These features allow us to manage a lot of contexts without fear that +some will be leaked; we only need to keep track of one top-level +context that we are going to delete at transaction end, and make sure +that any shorter-lived contexts we create are descendants of that +context. Since the tree can have multiple levels, we can deal easily +with nested lifetimes of storage, such as per-transaction, +per-statement, per-scan, per-tuple. Storage lifetimes that only +partially overlap can be handled by allocating from different trees of +the context forest (there are some examples in the next section). + +For convenience we also provide operations like "reset/delete all children +of a given context, but don't reset or delete that context itself". 
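
To make the API described above concrete, here is a minimal sketch of typical
backend usage: a routine creates a short-lived child context under
CurrentMemoryContext, does its transient allocations there, and then deletes
the whole context instead of pfree'ing chunk by chunk. The helper name and
sizes are illustrative only; the calls themselves (AllocSetContextCreate,
MemoryContextSwitchTo, palloc, MemoryContextDelete) are the standard ones
discussed above.

    #include "postgres.h"

    #include "utils/memutils.h"

    /* "do_transient_work" is a hypothetical example routine. */
    static void
    do_transient_work(void)
    {
        MemoryContext tmpcxt;
        MemoryContext oldcxt;
        char         *buf;

        /* Create a child of the caller's current context. */
        tmpcxt = AllocSetContextCreate(CurrentMemoryContext,
                                       "transient work context",
                                       ALLOCSET_DEFAULT_SIZES);

        oldcxt = MemoryContextSwitchTo(tmpcxt);

        buf = palloc(1024);     /* elog(ERROR)s on OOM; never returns NULL */
        memset(buf, 0, 1024);   /* ... scratch work ... */

        MemoryContextSwitchTo(oldcxt);

        /* One call frees buf and everything else allocated in tmpcxt. */
        MemoryContextDelete(tmpcxt);
    }

Because tmpcxt is a child of whatever context was current at entry, an
elog(ERROR) escape partway through does not leak it: it is destroyed along
with its parent subtree during error cleanup.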
+ + +Memory Context Reset/Delete Callbacks +------------------------------------- + +A feature introduced in Postgres 9.5 allows memory contexts to be used +for managing more resources than just plain palloc'd memory. This is +done by registering a "reset callback function" for a memory context. +Such a function will be called, once, just before the context is next +reset or deleted. It can be used to give up resources that are in some +sense associated with an object allocated within the context. Possible +use-cases include +* closing open files associated with a tuplesort object; +* releasing reference counts on long-lived cache objects that are held + by some object within the context being reset; +* freeing malloc-managed memory associated with some palloc'd object. +That last case would just represent bad programming practice for pure +Postgres code; better to have made all the allocations using palloc, +in the target context or some child context. However, it could well +come in handy for code that interfaces to non-Postgres libraries. + +Any number of reset callbacks can be established for a memory context; +they are called in reverse order of registration. Also, callbacks +attached to child contexts are called before callbacks attached to +parent contexts, if a tree of contexts is being reset or deleted. + +The API for this requires the caller to provide a MemoryContextCallback +memory chunk to hold the state for a callback. Typically this should be +allocated in the same context it is logically attached to, so that it +will be released automatically after use. The reason for asking the +caller to provide this memory is that in most usage scenarios, the caller +will be creating some larger struct within the target context, and the +MemoryContextCallback struct can be made "for free" without a separate +palloc() call by including it in this larger struct. + + +Memory Contexts in Practice +=========================== + +Globally Known Contexts +----------------------- + +There are a few widely-known contexts that are typically referenced +through global variables. At any instant the system may contain many +additional contexts, but all other contexts should be direct or indirect +children of one of these contexts to ensure they are not leaked in event +of an error. + +TopMemoryContext --- this is the actual top level of the context tree; +every other context is a direct or indirect child of this one. Allocating +here is essentially the same as "malloc", because this context will never +be reset or deleted. This is for stuff that should live forever, or for +stuff that the controlling module will take care of deleting at the +appropriate time. An example is fd.c's tables of open files. Avoid +allocating stuff here unless really necessary, and especially avoid +running with CurrentMemoryContext pointing here. + +PostmasterContext --- this is the postmaster's normal working context. +After a backend is spawned, it can delete PostmasterContext to free its +copy of memory the postmaster was using that it doesn't need. +Note that in non-EXEC_BACKEND builds, the postmaster's copy of pg_hba.conf +and pg_ident.conf data is used directly during authentication in backend +processes; so backends can't delete PostmasterContext until that's done. +(The postmaster has only TopMemoryContext, PostmasterContext, and +ErrorContext --- the remaining top-level contexts are set up in each +backend during startup.) + +CacheMemoryContext --- permanent storage for relcache, catcache, and +related modules. 
This will never be reset or deleted, either, so it's +not truly necessary to distinguish it from TopMemoryContext. But it +seems worthwhile to maintain the distinction for debugging purposes. +(Note: CacheMemoryContext has child contexts with shorter lifespans. +For example, a child context is the best place to keep the subsidiary +storage associated with a relcache entry; that way we can free rule +parsetrees and so forth easily, without having to depend on constructing +a reliable version of freeObject().) + +MessageContext --- this context holds the current command message from the +frontend, as well as any derived storage that need only live as long as +the current message (for example, in simple-Query mode the parse and plan +trees can live here). This context will be reset, and any children +deleted, at the top of each cycle of the outer loop of PostgresMain. This +is kept separate from per-transaction and per-portal contexts because a +query string might need to live either a longer or shorter time than any +single transaction or portal. + +TopTransactionContext --- this holds everything that lives until end of the +top-level transaction. This context will be reset, and all its children +deleted, at conclusion of each top-level transaction cycle. In most cases +you don't want to allocate stuff directly here, but in CurTransactionContext; +what does belong here is control information that exists explicitly to manage +status across multiple subtransactions. Note: this context is NOT cleared +immediately upon error; its contents will survive until the transaction block +is exited by COMMIT/ROLLBACK. + +CurTransactionContext --- this holds data that has to survive until the end +of the current transaction, and in particular will be needed at top-level +transaction commit. When we are in a top-level transaction this is the same +as TopTransactionContext, but in subtransactions it points to a child context. +It is important to understand that if a subtransaction aborts, its +CurTransactionContext is thrown away after finishing the abort processing; +but a committed subtransaction's CurTransactionContext is kept until top-level +commit (unless of course one of the intermediate levels of subtransaction +aborts). This ensures that we do not keep data from a failed subtransaction +longer than necessary. Because of this behavior, you must be careful to clean +up properly during subtransaction abort --- the subtransaction's state must be +delinked from any pointers or lists kept in upper transactions, or you will +have dangling pointers leading to a crash at top-level commit. An example of +data kept here is pending NOTIFY messages, which are sent at top-level commit, +but only if the generating subtransaction did not abort. + +PortalContext --- this is not actually a separate context, but a +global variable pointing to the per-portal context of the currently active +execution portal. This can be used if it's necessary to allocate storage +that will live just as long as the execution of the current portal requires. + +ErrorContext --- this permanent context is switched into for error +recovery processing, and then reset on completion of recovery. We arrange +to have a few KB of memory available in it at all times. In this way, we +can ensure that some memory is available for error recovery even if the +backend has run out of memory otherwise. This allows out-of-memory to be +treated as a normal ERROR condition, not a FATAL error. 
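
As a concrete illustration of the reset/delete callback mechanism described
earlier, the following sketch attaches a callback that closes a file
descriptor when its context is next reset or deleted.
MemoryContextRegisterResetCallback() and struct MemoryContextCallback are the
real mcxt.c interfaces; the MyTempState struct, its field names, and the
file-descriptor use case are hypothetical.

    #include "postgres.h"

    #include <unistd.h>

    typedef struct MyTempState
    {
        int         fd;                 /* non-palloc resource to release */
        MemoryContextCallback cb;       /* callback state piggybacks on the struct */
    } MyTempState;

    static void
    my_temp_state_cleanup(void *arg)
    {
        MyTempState *state = (MyTempState *) arg;

        if (state->fd >= 0)
            close(state->fd);
        /* The struct itself is freed by the context reset that fired us. */
    }

    static MyTempState *
    create_my_temp_state(MemoryContext cxt, int fd)
    {
        MyTempState *state = MemoryContextAlloc(cxt, sizeof(MyTempState));

        state->fd = fd;
        state->cb.func = my_temp_state_cleanup;
        state->cb.arg = state;
        /* Called exactly once, just before cxt is next reset or deleted. */
        MemoryContextRegisterResetCallback(cxt, &state->cb);

        return state;
    }

Note that the MemoryContextCallback is embedded in the palloc'd struct itself,
so no separate allocation is needed and the callback's memory is released by
the very reset that invokes it.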
+ + +Contexts For Prepared Statements And Portals +-------------------------------------------- + +A prepared-statement object has an associated private context, in which +the parse and plan trees for its query are stored. Because these trees +are read-only to the executor, the prepared statement can be re-used many +times without further copying of these trees. + +An execution-portal object has a private context that is referenced by +PortalContext when the portal is active. In the case of a portal created +by DECLARE CURSOR, this private context contains the query parse and plan +trees (there being no other object that can hold them). Portals created +from prepared statements simply reference the prepared statements' trees, +and don't actually need any storage allocated in their private contexts. + + +Logical Replication Worker Contexts +----------------------------------- + +ApplyContext --- permanent during whole lifetime of apply worker. It +is possible to use TopMemoryContext here as well, but for simplicity +of memory usage analysis we spin up different context. + +ApplyMessageContext --- short-lived context that is reset after each +logical replication protocol message is processed. + + +Transient Contexts During Execution +----------------------------------- + +When creating a prepared statement, the parse and plan trees will be built +in a temporary context that's a child of MessageContext (so that it will +go away automatically upon error). On success, the finished plan is +copied to the prepared statement's private context, and the temp context +is released; this allows planner temporary space to be recovered before +execution begins. (In simple-Query mode we don't bother with the extra +copy step, so the planner temp space stays around till end of query.) + +The top-level executor routines, as well as most of the "plan node" +execution code, will normally run in a context that is created by +ExecutorStart and destroyed by ExecutorEnd; this context also holds the +"plan state" tree built during ExecutorStart. Most of the memory +allocated in these routines is intended to live until end of query, +so this is appropriate for those purposes. The executor's top context +is a child of PortalContext, that is, the per-portal context of the +portal that represents the query's execution. + +The main memory-management consideration in the executor is that +expression evaluation --- both for qual testing and for computation of +targetlist entries --- needs to not leak memory. To do this, each +ExprContext (expression-eval context) created in the executor has a +private memory context associated with it, and we switch into that context +when evaluating expressions in that ExprContext. The plan node that owns +the ExprContext is responsible for resetting the private context to empty +when it no longer needs the results of expression evaluations. Typically +the reset is done at the start of each tuple-fetch cycle in the plan node. + +Note that this design gives each plan node its own expression-eval memory +context. This appears necessary to handle nested joins properly, since +an outer plan node might need to retain expression results it has computed +while obtaining the next tuple from an inner node --- but the inner node +might execute many tuple cycles and many expressions before returning a +tuple. The inner node must be able to reset its own expression context +more often than once per outer tuple cycle. 
Fortunately, memory contexts +are cheap enough that giving one to each plan node doesn't seem like a +problem. + +A problem with running index accesses and sorts in a query-lifespan context +is that these operations invoke datatype-specific comparison functions, +and if the comparators leak any memory then that memory won't be recovered +till end of query. The comparator functions all return bool or int32, +so there's no problem with their result data, but there can be a problem +with leakage of internal temporary data. In particular, comparator +functions that operate on TOAST-able data types need to be careful +not to leak detoasted versions of their inputs. This is annoying, but +it appeared a lot easier to make the comparators conform than to fix the +index and sort routines, so that's what was done for 7.1. This remains +the state of affairs in btree and hash indexes, so btree and hash support +functions still need to not leak memory. Most of the other index AMs +have been modified to run opclass support functions in short-lived +contexts, so that leakage is not a problem; this is necessary in view +of the fact that their support functions tend to be far more complex. + +There are some special cases, such as aggregate functions. nodeAgg.c +needs to remember the results of evaluation of aggregate transition +functions from one tuple cycle to the next, so it can't just discard +all per-tuple state in each cycle. The easiest way to handle this seems +to be to have two per-tuple contexts in an aggregate node, and to +ping-pong between them, so that at each tuple one is the active allocation +context and the other holds any results allocated by the prior cycle's +transition function. + +Executor routines that switch the active CurrentMemoryContext may need +to copy data into their caller's current memory context before returning. +However, we have minimized the need for that, because of the convention +of resetting the per-tuple context at the *start* of an execution cycle +rather than at its end. With that rule, an execution node can return a +tuple that is palloc'd in its per-tuple context, and the tuple will remain +good until the node is called for another tuple or told to end execution. +This parallels the situation with pass-by-reference values at the table +scan level, since a scan node can return a direct pointer to a tuple in a +disk buffer that is only guaranteed to remain good that long. + +A more common reason for copying data is to transfer a result from +per-tuple context to per-query context; for example, a Unique node will +save the last distinct tuple value in its per-query context, requiring a +copy step. + + +Mechanisms to Allow Multiple Types of Contexts +---------------------------------------------- + +To efficiently allow for different allocation patterns, and for +experimentation, we allow for different types of memory contexts with +different allocation policies but similar external behavior. To +handle this, memory allocation functions are accessed via function +pointers, and we require all context types to obey the conventions +given here. + +A memory context is represented by struct MemoryContextData (see +memnodes.h). This struct identifies the exact type of the context, and +contains information common between the different types of +MemoryContext like the parent and child contexts, and the name of the +context. 
+ +This is essentially an abstract superclass, and the behavior is +determined by the "methods" pointer is its virtual function table +(struct MemoryContextMethods). Specific memory context types will use +derived structs having these fields as their first fields. All the +contexts of a specific type will have methods pointers that point to +the same static table of function pointers. + +While operations like allocating from and resetting a context take the +relevant MemoryContext as a parameter, operations like free and +realloc are trickier. To make those work, we require all memory +context types to produce allocated chunks that are immediately, +without any padding, preceded by a pointer to the corresponding +MemoryContext. + +If a type of allocator needs additional information about its chunks, +like e.g. the size of the allocation, that information can in turn +precede the MemoryContext. This means the only overhead implied by +the memory context mechanism is a pointer to its context, so we're not +constraining context-type designers very much. + +Given this, routines like pfree determine their corresponding context +with an operation like (although that is usually encapsulated in +GetMemoryChunkContext()) + + MemoryContext context = *(MemoryContext*) (((char *) pointer) - sizeof(void *)); + +and then invoke the corresponding method for the context + + context->methods->free_p(pointer); + + +More Control Over aset.c Behavior +--------------------------------- + +By default aset.c always allocates an 8K block upon the first +allocation in a context, and doubles that size for each successive +block request. That's good behavior for a context that might hold +*lots* of data. But if there are dozens if not hundreds of smaller +contexts in the system, we need to be able to fine-tune things a +little better. + +The creator of a context is able to specify an initial block size and +a maximum block size. Selecting smaller values can prevent wastage of +space in contexts that aren't expected to hold very much (an example +is the relcache's per-relation contexts). + +Also, it is possible to specify a minimum context size, in case for some +reason that should be different from the initial size for additional +blocks. An aset.c context will always contain at least one block, +of size minContextSize if that is specified, otherwise initBlockSize. + +We expect that per-tuple contexts will be reset frequently and typically +will not allocate very much space per tuple cycle. To make this usage +pattern cheap, the first block allocated in a context is not given +back to malloc() during reset, but just cleared. This avoids malloc +thrashing. + + +Alternative Memory Context Implementations +------------------------------------------ + +aset.c is our default general-purpose implementation, working fine +in most situations. We also have two implementations optimized for +special use cases, providing either better performance or lower memory +usage compared to aset.c (or both). + +* slab.c (SlabContext) is designed for allocations of fixed-length + chunks, and does not allow allocations of chunks with different size. + +* generation.c (GenerationContext) is designed for cases when chunks + are allocated in groups with similar lifespan (generations), or + roughly in FIFO order. + +Both memory contexts aim to free memory back to the operating system +(unlike aset.c, which keeps the freed chunks in a freelist, and only +returns the memory when reset/deleted). 
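
For reference, a hedged sketch of creating these two special-purpose context
types follows, assuming the PostgreSQL 14 constructor signatures declared in
memutils.h (SlabContextCreate and GenerationContextCreate); the MyQueueEntry
type, context names, and sizes are made up for illustration.

    #include "postgres.h"

    #include "utils/memutils.h"

    /* Hypothetical fixed-size element type, for illustration only. */
    typedef struct MyQueueEntry
    {
        int64       id;
        char        payload[64];
    } MyQueueEntry;

    static void
    create_special_purpose_contexts(void)
    {
        MemoryContext slabcxt;
        MemoryContext gencxt;
        MyQueueEntry *entry;

        /* slab.c: every chunk in this context has the same fixed size. */
        slabcxt = SlabContextCreate(CurrentMemoryContext,
                                    "example slab context",
                                    SLAB_DEFAULT_BLOCK_SIZE,
                                    sizeof(MyQueueEntry));

        /* generation.c: chunks are expected to be freed in roughly FIFO order. */
        gencxt = GenerationContextCreate(CurrentMemoryContext,
                                         "example generation context",
                                         SLAB_DEFAULT_BLOCK_SIZE);

        /* Allocation and freeing still go through the ordinary palloc API. */
        entry = MemoryContextAlloc(slabcxt, sizeof(MyQueueEntry));
        entry->id = 1;
        pfree(entry);

        MemoryContextDelete(gencxt);
        MemoryContextDelete(slabcxt);
    }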
+ +These memory contexts were initially developed for ReorderBuffer, but +may be useful elsewhere as long as the allocation patterns match. + + +Memory Accounting +----------------- + +One of the basic memory context operations is determining the amount of +memory used in the context (and its children). We have multiple places +that implement their own ad hoc memory accounting, and this is meant to +provide a unified approach. Ad hoc accounting solutions work for places +with tight control over the allocations or when it's easy to determine +sizes of allocated chunks (e.g. places that only work with tuples). + +The accounting built into the memory contexts is transparent and works +transparently for all allocations as long as they end up in the right +memory context subtree. + +Consider for example aggregate functions - the aggregate state is often +represented by an arbitrary structure, allocated from the transition +function, so the ad hoc accounting is unlikely to work. The built-in +accounting will however handle such cases just fine. + +To minimize overhead, the accounting is done at the block level, not for +individual allocation chunks. + +The accounting is lazy - after a block is allocated (or freed), only the +context owning that block is updated. This means that when inquiring +about the memory usage in a given context, we have to walk all children +contexts recursively. This means the memory accounting is not intended +for cases with too many memory contexts (in the relevant subtree). diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c new file mode 100644 index 0000000..77872e7 --- /dev/null +++ b/src/backend/utils/mmgr/aset.c @@ -0,0 +1,1520 @@ +/*------------------------------------------------------------------------- + * + * aset.c + * Allocation set definitions. + * + * AllocSet is our standard implementation of the abstract MemoryContext + * type. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/mmgr/aset.c + * + * NOTE: + * This is a new (Feb. 05, 1999) implementation of the allocation set + * routines. AllocSet...() does not use OrderedSet...() any more. + * Instead it manages allocations in a block pool by itself, combining + * many small allocations in a few bigger blocks. AllocSetFree() normally + * doesn't free() memory really. It just add's the free'd area to some + * list for later reuse by AllocSetAlloc(). All memory blocks are free()'d + * at once on AllocSetReset(), which happens when the memory context gets + * destroyed. + * Jan Wieck + * + * Performance improvement from Tom Lane, 8/99: for extremely large request + * sizes, we do want to be able to give the memory back to free() as soon + * as it is pfree()'d. Otherwise we risk tying up a lot of memory in + * freelist entries that might never be usable. This is specially needed + * when the caller is repeatedly repalloc()'ing a block bigger and bigger; + * the previous instances of the block were guaranteed to be wasted until + * AllocSetReset() under the old way. + * + * Further improvement 12/00: as the code stood, request sizes in the + * midrange between "small" and "large" were handled very inefficiently, + * because any sufficiently large free chunk would be used to satisfy a + * request, even if it was much larger than necessary. This led to more + * and more wasted space in allocated chunks over time. 
To fix, get rid + * of the midrange behavior: we now handle only "small" power-of-2-size + * chunks as chunks. Anything "large" is passed off to malloc(). Change + * the number of freelists to change the small/large boundary. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "port/pg_bitutils.h" +#include "utils/memdebug.h" +#include "utils/memutils.h" + +/*-------------------- + * Chunk freelist k holds chunks of size 1 << (k + ALLOC_MINBITS), + * for k = 0 .. ALLOCSET_NUM_FREELISTS-1. + * + * Note that all chunks in the freelists have power-of-2 sizes. This + * improves recyclability: we may waste some space, but the wasted space + * should stay pretty constant as requests are made and released. + * + * A request too large for the last freelist is handled by allocating a + * dedicated block from malloc(). The block still has a block header and + * chunk header, but when the chunk is freed we'll return the whole block + * to malloc(), not put it on our freelists. + * + * CAUTION: ALLOC_MINBITS must be large enough so that + * 1<<ALLOC_MINBITS is at least MAXALIGN, + * or we may fail to align the smallest chunks adequately. + * 8-byte alignment is enough on all currently known machines. + * + * With the current parameters, request sizes up to 8K are treated as chunks, + * larger requests go into dedicated blocks. Change ALLOCSET_NUM_FREELISTS + * to adjust the boundary point; and adjust ALLOCSET_SEPARATE_THRESHOLD in + * memutils.h to agree. (Note: in contexts with small maxBlockSize, we may + * set the allocChunkLimit to less than 8K, so as to avoid space wastage.) + *-------------------- + */ + +#define ALLOC_MINBITS 3 /* smallest chunk size is 8 bytes */ +#define ALLOCSET_NUM_FREELISTS 11 +#define ALLOC_CHUNK_LIMIT (1 << (ALLOCSET_NUM_FREELISTS-1+ALLOC_MINBITS)) +/* Size of largest chunk that we use a fixed size for */ +#define ALLOC_CHUNK_FRACTION 4 +/* We allow chunks to be at most 1/4 of maxBlockSize (less overhead) */ + +/*-------------------- + * The first block allocated for an allocset has size initBlockSize. + * Each time we have to allocate another block, we double the block size + * (if possible, and without exceeding maxBlockSize), so as to reduce + * the bookkeeping load on malloc(). + * + * Blocks allocated to hold oversize chunks do not follow this rule, however; + * they are just however big they need to be to hold that single chunk. + * + * Also, if a minContextSize is specified, the first block has that size, + * and then initBlockSize is used for the next one. + *-------------------- + */ + +#define ALLOC_BLOCKHDRSZ MAXALIGN(sizeof(AllocBlockData)) +#define ALLOC_CHUNKHDRSZ sizeof(struct AllocChunkData) + +typedef struct AllocBlockData *AllocBlock; /* forward reference */ +typedef struct AllocChunkData *AllocChunk; + +/* + * AllocPointer + * Aligned pointer which may be a member of an allocation set. + */ +typedef void *AllocPointer; + +/* + * AllocSetContext is our standard implementation of MemoryContext. + * + * Note: header.isReset means there is nothing for AllocSetReset to do. + * This is different from the aset being physically empty (empty blocks list) + * because we will still have a keeper block. It's also different from the set + * being logically empty, because we don't attempt to detect pfree'ing the + * last active chunk. 
+ */ +typedef struct AllocSetContext +{ + MemoryContextData header; /* Standard memory-context fields */ + /* Info about storage allocated in this context: */ + AllocBlock blocks; /* head of list of blocks in this set */ + AllocChunk freelist[ALLOCSET_NUM_FREELISTS]; /* free chunk lists */ + /* Allocation parameters for this context: */ + Size initBlockSize; /* initial block size */ + Size maxBlockSize; /* maximum block size */ + Size nextBlockSize; /* next block size to allocate */ + Size allocChunkLimit; /* effective chunk size limit */ + AllocBlock keeper; /* keep this block over resets */ + /* freelist this context could be put in, or -1 if not a candidate: */ + int freeListIndex; /* index in context_freelists[], or -1 */ +} AllocSetContext; + +typedef AllocSetContext *AllocSet; + +/* + * AllocBlock + * An AllocBlock is the unit of memory that is obtained by aset.c + * from malloc(). It contains one or more AllocChunks, which are + * the units requested by palloc() and freed by pfree(). AllocChunks + * cannot be returned to malloc() individually, instead they are put + * on freelists by pfree() and re-used by the next palloc() that has + * a matching request size. + * + * AllocBlockData is the header data for a block --- the usable space + * within the block begins at the next alignment boundary. + */ +typedef struct AllocBlockData +{ + AllocSet aset; /* aset that owns this block */ + AllocBlock prev; /* prev block in aset's blocks list, if any */ + AllocBlock next; /* next block in aset's blocks list, if any */ + char *freeptr; /* start of free space in this block */ + char *endptr; /* end of space in this block */ +} AllocBlockData; + +/* + * AllocChunk + * The prefix of each piece of memory in an AllocBlock + * + * Note: to meet the memory context APIs, the payload area of the chunk must + * be maxaligned, and the "aset" link must be immediately adjacent to the + * payload area (cf. GetMemoryChunkContext). We simplify matters for this + * module by requiring sizeof(AllocChunkData) to be maxaligned, and then + * we can ensure things work by adding any required alignment padding before + * the "aset" field. There is a static assertion below that the alignment + * is done correctly. + */ +typedef struct AllocChunkData +{ + /* size is always the size of the usable space in the chunk */ + Size size; +#ifdef MEMORY_CONTEXT_CHECKING + /* when debugging memory usage, also store actual requested size */ + /* this is zero in a free chunk */ + Size requested_size; + +#define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T * 2 + SIZEOF_VOID_P) +#else +#define ALLOCCHUNK_RAWSIZE (SIZEOF_SIZE_T + SIZEOF_VOID_P) +#endif /* MEMORY_CONTEXT_CHECKING */ + + /* ensure proper alignment by adding padding if needed */ +#if (ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF) != 0 + char padding[MAXIMUM_ALIGNOF - ALLOCCHUNK_RAWSIZE % MAXIMUM_ALIGNOF]; +#endif + + /* aset is the owning aset if allocated, or the freelist link if free */ + void *aset; + /* there must not be any padding to reach a MAXALIGN boundary here! */ +} AllocChunkData; + +/* + * Only the "aset" field should be accessed outside this module. + * We keep the rest of an allocated chunk's header marked NOACCESS when using + * valgrind. But note that chunk headers that are in a freelist are kept + * accessible, for simplicity. + */ +#define ALLOCCHUNK_PRIVATE_LEN offsetof(AllocChunkData, aset) + +/* + * AllocPointerIsValid + * True iff pointer is valid allocation pointer. 
+ */ +#define AllocPointerIsValid(pointer) PointerIsValid(pointer) + +/* + * AllocSetIsValid + * True iff set is valid allocation set. + */ +#define AllocSetIsValid(set) PointerIsValid(set) + +#define AllocPointerGetChunk(ptr) \ + ((AllocChunk)(((char *)(ptr)) - ALLOC_CHUNKHDRSZ)) +#define AllocChunkGetPointer(chk) \ + ((AllocPointer)(((char *)(chk)) + ALLOC_CHUNKHDRSZ)) + +/* + * Rather than repeatedly creating and deleting memory contexts, we keep some + * freed contexts in freelists so that we can hand them out again with little + * work. Before putting a context in a freelist, we reset it so that it has + * only its initial malloc chunk and no others. To be a candidate for a + * freelist, a context must have the same minContextSize/initBlockSize as + * other contexts in the list; but its maxBlockSize is irrelevant since that + * doesn't affect the size of the initial chunk. + * + * We currently provide one freelist for ALLOCSET_DEFAULT_SIZES contexts + * and one for ALLOCSET_SMALL_SIZES contexts; the latter works for + * ALLOCSET_START_SMALL_SIZES too, since only the maxBlockSize differs. + * + * Ordinarily, we re-use freelist contexts in last-in-first-out order, in + * hopes of improving locality of reference. But if there get to be too + * many contexts in the list, we'd prefer to drop the most-recently-created + * contexts in hopes of keeping the process memory map compact. + * We approximate that by simply deleting all existing entries when the list + * overflows, on the assumption that queries that allocate a lot of contexts + * will probably free them in more or less reverse order of allocation. + * + * Contexts in a freelist are chained via their nextchild pointers. + */ +#define MAX_FREE_CONTEXTS 100 /* arbitrary limit on freelist length */ + +typedef struct AllocSetFreeList +{ + int num_free; /* current list length */ + AllocSetContext *first_free; /* list header */ +} AllocSetFreeList; + +/* context_freelists[0] is for default params, [1] for small params */ +static AllocSetFreeList context_freelists[2] = +{ + { + 0, NULL + }, + { + 0, NULL + } +}; + +/* + * These functions implement the MemoryContext API for AllocSet contexts. + */ +static void *AllocSetAlloc(MemoryContext context, Size size); +static void AllocSetFree(MemoryContext context, void *pointer); +static void *AllocSetRealloc(MemoryContext context, void *pointer, Size size); +static void AllocSetReset(MemoryContext context); +static void AllocSetDelete(MemoryContext context); +static Size AllocSetGetChunkSpace(MemoryContext context, void *pointer); +static bool AllocSetIsEmpty(MemoryContext context); +static void AllocSetStats(MemoryContext context, + MemoryStatsPrintFunc printfunc, void *passthru, + MemoryContextCounters *totals, + bool print_to_stderr); + +#ifdef MEMORY_CONTEXT_CHECKING +static void AllocSetCheck(MemoryContext context); +#endif + +/* + * This is the virtual function table for AllocSet contexts. + */ +static const MemoryContextMethods AllocSetMethods = { + AllocSetAlloc, + AllocSetFree, + AllocSetRealloc, + AllocSetReset, + AllocSetDelete, + AllocSetGetChunkSpace, + AllocSetIsEmpty, + AllocSetStats +#ifdef MEMORY_CONTEXT_CHECKING + ,AllocSetCheck +#endif +}; + + +/* ---------- + * AllocSetFreeIndex - + * + * Depending on the size of an allocation compute which freechunk + * list of the alloc set it belongs to. Caller must have verified + * that size <= ALLOC_CHUNK_LIMIT. 
+ * ---------- + */ +static inline int +AllocSetFreeIndex(Size size) +{ + int idx; + + if (size > (1 << ALLOC_MINBITS)) + { + /*---------- + * At this point we must compute ceil(log2(size >> ALLOC_MINBITS)). + * This is the same as + * pg_leftmost_one_pos32((size - 1) >> ALLOC_MINBITS) + 1 + * or equivalently + * pg_leftmost_one_pos32(size - 1) - ALLOC_MINBITS + 1 + * + * However, rather than just calling that function, we duplicate the + * logic here, allowing an additional optimization. It's reasonable + * to assume that ALLOC_CHUNK_LIMIT fits in 16 bits, so we can unroll + * the byte-at-a-time loop in pg_leftmost_one_pos32 and just handle + * the last two bytes. + * + * Yes, this function is enough of a hot-spot to make it worth this + * much trouble. + *---------- + */ +#ifdef HAVE__BUILTIN_CLZ + idx = 31 - __builtin_clz((uint32) size - 1) - ALLOC_MINBITS + 1; +#else + uint32 t, + tsize; + + /* Statically assert that we only have a 16-bit input value. */ + StaticAssertStmt(ALLOC_CHUNK_LIMIT < (1 << 16), + "ALLOC_CHUNK_LIMIT must be less than 64kB"); + + tsize = size - 1; + t = tsize >> 8; + idx = t ? pg_leftmost_one_pos[t] + 8 : pg_leftmost_one_pos[tsize]; + idx -= ALLOC_MINBITS - 1; +#endif + + Assert(idx < ALLOCSET_NUM_FREELISTS); + } + else + idx = 0; + + return idx; +} + + +/* + * Public routines + */ + + +/* + * AllocSetContextCreateInternal + * Create a new AllocSet context. + * + * parent: parent context, or NULL if top-level context + * name: name of context (must be statically allocated) + * minContextSize: minimum context size + * initBlockSize: initial allocation block size + * maxBlockSize: maximum allocation block size + * + * Most callers should abstract the context size parameters using a macro + * such as ALLOCSET_DEFAULT_SIZES. + * + * Note: don't call this directly; go through the wrapper macro + * AllocSetContextCreate. + */ +MemoryContext +AllocSetContextCreateInternal(MemoryContext parent, + const char *name, + Size minContextSize, + Size initBlockSize, + Size maxBlockSize) +{ + int freeListIndex; + Size firstBlockSize; + AllocSet set; + AllocBlock block; + + /* Assert we padded AllocChunkData properly */ + StaticAssertStmt(ALLOC_CHUNKHDRSZ == MAXALIGN(ALLOC_CHUNKHDRSZ), + "sizeof(AllocChunkData) is not maxaligned"); + StaticAssertStmt(offsetof(AllocChunkData, aset) + sizeof(MemoryContext) == + ALLOC_CHUNKHDRSZ, + "padding calculation in AllocChunkData is wrong"); + + /* + * First, validate allocation parameters. Once these were regular runtime + * test and elog's, but in practice Asserts seem sufficient because nobody + * varies their parameters at runtime. We somewhat arbitrarily enforce a + * minimum 1K block size. + */ + Assert(initBlockSize == MAXALIGN(initBlockSize) && + initBlockSize >= 1024); + Assert(maxBlockSize == MAXALIGN(maxBlockSize) && + maxBlockSize >= initBlockSize && + AllocHugeSizeIsValid(maxBlockSize)); /* must be safe to double */ + Assert(minContextSize == 0 || + (minContextSize == MAXALIGN(minContextSize) && + minContextSize >= 1024 && + minContextSize <= maxBlockSize)); + + /* + * Check whether the parameters match either available freelist. We do + * not need to demand a match of maxBlockSize. + */ + if (minContextSize == ALLOCSET_DEFAULT_MINSIZE && + initBlockSize == ALLOCSET_DEFAULT_INITSIZE) + freeListIndex = 0; + else if (minContextSize == ALLOCSET_SMALL_MINSIZE && + initBlockSize == ALLOCSET_SMALL_INITSIZE) + freeListIndex = 1; + else + freeListIndex = -1; + + /* + * If a suitable freelist entry exists, just recycle that context. 
+ */ + if (freeListIndex >= 0) + { + AllocSetFreeList *freelist = &context_freelists[freeListIndex]; + + if (freelist->first_free != NULL) + { + /* Remove entry from freelist */ + set = freelist->first_free; + freelist->first_free = (AllocSet) set->header.nextchild; + freelist->num_free--; + + /* Update its maxBlockSize; everything else should be OK */ + set->maxBlockSize = maxBlockSize; + + /* Reinitialize its header, installing correct name and parent */ + MemoryContextCreate((MemoryContext) set, + T_AllocSetContext, + &AllocSetMethods, + parent, + name); + + ((MemoryContext) set)->mem_allocated = + set->keeper->endptr - ((char *) set); + + return (MemoryContext) set; + } + } + + /* Determine size of initial block */ + firstBlockSize = MAXALIGN(sizeof(AllocSetContext)) + + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ; + if (minContextSize != 0) + firstBlockSize = Max(firstBlockSize, minContextSize); + else + firstBlockSize = Max(firstBlockSize, initBlockSize); + + /* + * Allocate the initial block. Unlike other aset.c blocks, it starts with + * the context header and its block header follows that. + */ + set = (AllocSet) malloc(firstBlockSize); + if (set == NULL) + { + if (TopMemoryContext) + MemoryContextStats(TopMemoryContext); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while creating memory context \"%s\".", + name))); + } + + /* + * Avoid writing code that can fail between here and MemoryContextCreate; + * we'd leak the header/initial block if we ereport in this stretch. + */ + + /* Fill in the initial block's block header */ + block = (AllocBlock) (((char *) set) + MAXALIGN(sizeof(AllocSetContext))); + block->aset = set; + block->freeptr = ((char *) block) + ALLOC_BLOCKHDRSZ; + block->endptr = ((char *) set) + firstBlockSize; + block->prev = NULL; + block->next = NULL; + + /* Mark unallocated space NOACCESS; leave the block header alone. */ + VALGRIND_MAKE_MEM_NOACCESS(block->freeptr, block->endptr - block->freeptr); + + /* Remember block as part of block list */ + set->blocks = block; + /* Mark block as not to be released at reset time */ + set->keeper = block; + + /* Finish filling in aset-specific parts of the context header */ + MemSetAligned(set->freelist, 0, sizeof(set->freelist)); + + set->initBlockSize = initBlockSize; + set->maxBlockSize = maxBlockSize; + set->nextBlockSize = initBlockSize; + set->freeListIndex = freeListIndex; + + /* + * Compute the allocation chunk size limit for this context. It can't be + * more than ALLOC_CHUNK_LIMIT because of the fixed number of freelists. + * If maxBlockSize is small then requests exceeding the maxBlockSize, or + * even a significant fraction of it, should be treated as large chunks + * too. For the typical case of maxBlockSize a power of 2, the chunk size + * limit will be at most 1/8th maxBlockSize, so that given a stream of + * requests that are all the maximum chunk size we will waste at most + * 1/8th of the allocated space. + * + * We have to have allocChunkLimit a power of two, because the requested + * and actually-allocated sizes of any chunk must be on the same side of + * the limit, else we get confused about whether the chunk is "big". + * + * Also, allocChunkLimit must not exceed ALLOCSET_SEPARATE_THRESHOLD. 
+ */ + StaticAssertStmt(ALLOC_CHUNK_LIMIT == ALLOCSET_SEPARATE_THRESHOLD, + "ALLOC_CHUNK_LIMIT != ALLOCSET_SEPARATE_THRESHOLD"); + + set->allocChunkLimit = ALLOC_CHUNK_LIMIT; + while ((Size) (set->allocChunkLimit + ALLOC_CHUNKHDRSZ) > + (Size) ((maxBlockSize - ALLOC_BLOCKHDRSZ) / ALLOC_CHUNK_FRACTION)) + set->allocChunkLimit >>= 1; + + /* Finally, do the type-independent part of context creation */ + MemoryContextCreate((MemoryContext) set, + T_AllocSetContext, + &AllocSetMethods, + parent, + name); + + ((MemoryContext) set)->mem_allocated = firstBlockSize; + + return (MemoryContext) set; +} + +/* + * AllocSetReset + * Frees all memory which is allocated in the given set. + * + * Actually, this routine has some discretion about what to do. + * It should mark all allocated chunks freed, but it need not necessarily + * give back all the resources the set owns. Our actual implementation is + * that we give back all but the "keeper" block (which we must keep, since + * it shares a malloc chunk with the context header). In this way, we don't + * thrash malloc() when a context is repeatedly reset after small allocations, + * which is typical behavior for per-tuple contexts. + */ +static void +AllocSetReset(MemoryContext context) +{ + AllocSet set = (AllocSet) context; + AllocBlock block; + Size keepersize PG_USED_FOR_ASSERTS_ONLY + = set->keeper->endptr - ((char *) set); + + AssertArg(AllocSetIsValid(set)); + +#ifdef MEMORY_CONTEXT_CHECKING + /* Check for corruption and leaks before freeing */ + AllocSetCheck(context); +#endif + + /* Clear chunk freelists */ + MemSetAligned(set->freelist, 0, sizeof(set->freelist)); + + block = set->blocks; + + /* New blocks list will be just the keeper block */ + set->blocks = set->keeper; + + while (block != NULL) + { + AllocBlock next = block->next; + + if (block == set->keeper) + { + /* Reset the block, but don't return it to malloc */ + char *datastart = ((char *) block) + ALLOC_BLOCKHDRSZ; + +#ifdef CLOBBER_FREED_MEMORY + wipe_mem(datastart, block->freeptr - datastart); +#else + /* wipe_mem() would have done this */ + VALGRIND_MAKE_MEM_NOACCESS(datastart, block->freeptr - datastart); +#endif + block->freeptr = datastart; + block->prev = NULL; + block->next = NULL; + } + else + { + /* Normal case, release the block */ + context->mem_allocated -= block->endptr - ((char *) block); + +#ifdef CLOBBER_FREED_MEMORY + wipe_mem(block, block->freeptr - ((char *) block)); +#endif + free(block); + } + block = next; + } + + Assert(context->mem_allocated == keepersize); + + /* Reset block size allocation sequence, too */ + set->nextBlockSize = set->initBlockSize; +} + +/* + * AllocSetDelete + * Frees all memory which is allocated in the given set, + * in preparation for deletion of the set. + * + * Unlike AllocSetReset, this *must* free all resources of the set. + */ +static void +AllocSetDelete(MemoryContext context) +{ + AllocSet set = (AllocSet) context; + AllocBlock block = set->blocks; + Size keepersize PG_USED_FOR_ASSERTS_ONLY + = set->keeper->endptr - ((char *) set); + + AssertArg(AllocSetIsValid(set)); + +#ifdef MEMORY_CONTEXT_CHECKING + /* Check for corruption and leaks before freeing */ + AllocSetCheck(context); +#endif + + /* + * If the context is a candidate for a freelist, put it into that freelist + * instead of destroying it. + */ + if (set->freeListIndex >= 0) + { + AllocSetFreeList *freelist = &context_freelists[set->freeListIndex]; + + /* + * Reset the context, if it needs it, so that we aren't hanging on to + * more than the initial malloc chunk. 
+ */ + if (!context->isReset) + MemoryContextResetOnly(context); + + /* + * If the freelist is full, just discard what's already in it. See + * comments with context_freelists[]. + */ + if (freelist->num_free >= MAX_FREE_CONTEXTS) + { + while (freelist->first_free != NULL) + { + AllocSetContext *oldset = freelist->first_free; + + freelist->first_free = (AllocSetContext *) oldset->header.nextchild; + freelist->num_free--; + + /* All that remains is to free the header/initial block */ + free(oldset); + } + Assert(freelist->num_free == 0); + } + + /* Now add the just-deleted context to the freelist. */ + set->header.nextchild = (MemoryContext) freelist->first_free; + freelist->first_free = set; + freelist->num_free++; + + return; + } + + /* Free all blocks, except the keeper which is part of context header */ + while (block != NULL) + { + AllocBlock next = block->next; + + if (block != set->keeper) + context->mem_allocated -= block->endptr - ((char *) block); + +#ifdef CLOBBER_FREED_MEMORY + wipe_mem(block, block->freeptr - ((char *) block)); +#endif + + if (block != set->keeper) + free(block); + + block = next; + } + + Assert(context->mem_allocated == keepersize); + + /* Finally, free the context header, including the keeper block */ + free(set); +} + +/* + * AllocSetAlloc + * Returns pointer to allocated memory of given size or NULL if + * request could not be completed; memory is added to the set. + * + * No request may exceed: + * MAXALIGN_DOWN(SIZE_MAX) - ALLOC_BLOCKHDRSZ - ALLOC_CHUNKHDRSZ + * All callers use a much-lower limit. + * + * Note: when using valgrind, it doesn't matter how the returned allocation + * is marked, as mcxt.c will set it to UNDEFINED. In some paths we will + * return space that is marked NOACCESS - AllocSetRealloc has to beware! + */ +static void * +AllocSetAlloc(MemoryContext context, Size size) +{ + AllocSet set = (AllocSet) context; + AllocBlock block; + AllocChunk chunk; + int fidx; + Size chunk_size; + Size blksize; + + AssertArg(AllocSetIsValid(set)); + + /* + * If requested size exceeds maximum for chunks, allocate an entire block + * for this request. + */ + if (size > set->allocChunkLimit) + { + chunk_size = MAXALIGN(size); + blksize = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ; + block = (AllocBlock) malloc(blksize); + if (block == NULL) + return NULL; + + context->mem_allocated += blksize; + + block->aset = set; + block->freeptr = block->endptr = ((char *) block) + blksize; + + chunk = (AllocChunk) (((char *) block) + ALLOC_BLOCKHDRSZ); + chunk->aset = set; + chunk->size = chunk_size; +#ifdef MEMORY_CONTEXT_CHECKING + chunk->requested_size = size; + /* set mark to catch clobber of "unused" space */ + if (size < chunk_size) + set_sentinel(AllocChunkGetPointer(chunk), size); +#endif +#ifdef RANDOMIZE_ALLOCATED_MEMORY + /* fill the allocated space with junk */ + randomize_mem((char *) AllocChunkGetPointer(chunk), size); +#endif + + /* + * Stick the new block underneath the active allocation block, if any, + * so that we don't lose the use of the space remaining therein. + */ + if (set->blocks != NULL) + { + block->prev = set->blocks; + block->next = set->blocks->next; + if (block->next) + block->next->prev = block; + set->blocks->next = block; + } + else + { + block->prev = NULL; + block->next = NULL; + set->blocks = block; + } + + /* Ensure any padding bytes are marked NOACCESS. */ + VALGRIND_MAKE_MEM_NOACCESS((char *) AllocChunkGetPointer(chunk) + size, + chunk_size - size); + + /* Disallow external access to private part of chunk header. 
*/ + VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN); + + return AllocChunkGetPointer(chunk); + } + + /* + * Request is small enough to be treated as a chunk. Look in the + * corresponding free list to see if there is a free chunk we could reuse. + * If one is found, remove it from the free list, make it again a member + * of the alloc set and return its data address. + */ + fidx = AllocSetFreeIndex(size); + chunk = set->freelist[fidx]; + if (chunk != NULL) + { + Assert(chunk->size >= size); + + set->freelist[fidx] = (AllocChunk) chunk->aset; + + chunk->aset = (void *) set; + +#ifdef MEMORY_CONTEXT_CHECKING + chunk->requested_size = size; + /* set mark to catch clobber of "unused" space */ + if (size < chunk->size) + set_sentinel(AllocChunkGetPointer(chunk), size); +#endif +#ifdef RANDOMIZE_ALLOCATED_MEMORY + /* fill the allocated space with junk */ + randomize_mem((char *) AllocChunkGetPointer(chunk), size); +#endif + + /* Ensure any padding bytes are marked NOACCESS. */ + VALGRIND_MAKE_MEM_NOACCESS((char *) AllocChunkGetPointer(chunk) + size, + chunk->size - size); + + /* Disallow external access to private part of chunk header. */ + VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN); + + return AllocChunkGetPointer(chunk); + } + + /* + * Choose the actual chunk size to allocate. + */ + chunk_size = (1 << ALLOC_MINBITS) << fidx; + Assert(chunk_size >= size); + + /* + * If there is enough room in the active allocation block, we will put the + * chunk into that block. Else must start a new one. + */ + if ((block = set->blocks) != NULL) + { + Size availspace = block->endptr - block->freeptr; + + if (availspace < (chunk_size + ALLOC_CHUNKHDRSZ)) + { + /* + * The existing active (top) block does not have enough room for + * the requested allocation, but it might still have a useful + * amount of space in it. Once we push it down in the block list, + * we'll never try to allocate more space from it. So, before we + * do that, carve up its free space into chunks that we can put on + * the set's freelists. + * + * Because we can only get here when there's less than + * ALLOC_CHUNK_LIMIT left in the block, this loop cannot iterate + * more than ALLOCSET_NUM_FREELISTS-1 times. + */ + while (availspace >= ((1 << ALLOC_MINBITS) + ALLOC_CHUNKHDRSZ)) + { + Size availchunk = availspace - ALLOC_CHUNKHDRSZ; + int a_fidx = AllocSetFreeIndex(availchunk); + + /* + * In most cases, we'll get back the index of the next larger + * freelist than the one we need to put this chunk on. The + * exception is when availchunk is exactly a power of 2. + */ + if (availchunk != ((Size) 1 << (a_fidx + ALLOC_MINBITS))) + { + a_fidx--; + Assert(a_fidx >= 0); + availchunk = ((Size) 1 << (a_fidx + ALLOC_MINBITS)); + } + + chunk = (AllocChunk) (block->freeptr); + + /* Prepare to initialize the chunk header. */ + VALGRIND_MAKE_MEM_UNDEFINED(chunk, ALLOC_CHUNKHDRSZ); + + block->freeptr += (availchunk + ALLOC_CHUNKHDRSZ); + availspace -= (availchunk + ALLOC_CHUNKHDRSZ); + + chunk->size = availchunk; +#ifdef MEMORY_CONTEXT_CHECKING + chunk->requested_size = 0; /* mark it free */ +#endif + chunk->aset = (void *) set->freelist[a_fidx]; + set->freelist[a_fidx] = chunk; + } + + /* Mark that we need to create a new block */ + block = NULL; + } + } + + /* + * Time to create a new regular (multi-chunk) block? + */ + if (block == NULL) + { + Size required_size; + + /* + * The first such block has size initBlockSize, and we double the + * space in each succeeding block, but not more than maxBlockSize. 
+ */ + blksize = set->nextBlockSize; + set->nextBlockSize <<= 1; + if (set->nextBlockSize > set->maxBlockSize) + set->nextBlockSize = set->maxBlockSize; + + /* + * If initBlockSize is less than ALLOC_CHUNK_LIMIT, we could need more + * space... but try to keep it a power of 2. + */ + required_size = chunk_size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ; + while (blksize < required_size) + blksize <<= 1; + + /* Try to allocate it */ + block = (AllocBlock) malloc(blksize); + + /* + * We could be asking for pretty big blocks here, so cope if malloc + * fails. But give up if there's less than 1 MB or so available... + */ + while (block == NULL && blksize > 1024 * 1024) + { + blksize >>= 1; + if (blksize < required_size) + break; + block = (AllocBlock) malloc(blksize); + } + + if (block == NULL) + return NULL; + + context->mem_allocated += blksize; + + block->aset = set; + block->freeptr = ((char *) block) + ALLOC_BLOCKHDRSZ; + block->endptr = ((char *) block) + blksize; + + /* Mark unallocated space NOACCESS. */ + VALGRIND_MAKE_MEM_NOACCESS(block->freeptr, + blksize - ALLOC_BLOCKHDRSZ); + + block->prev = NULL; + block->next = set->blocks; + if (block->next) + block->next->prev = block; + set->blocks = block; + } + + /* + * OK, do the allocation + */ + chunk = (AllocChunk) (block->freeptr); + + /* Prepare to initialize the chunk header. */ + VALGRIND_MAKE_MEM_UNDEFINED(chunk, ALLOC_CHUNKHDRSZ); + + block->freeptr += (chunk_size + ALLOC_CHUNKHDRSZ); + Assert(block->freeptr <= block->endptr); + + chunk->aset = (void *) set; + chunk->size = chunk_size; +#ifdef MEMORY_CONTEXT_CHECKING + chunk->requested_size = size; + /* set mark to catch clobber of "unused" space */ + if (size < chunk->size) + set_sentinel(AllocChunkGetPointer(chunk), size); +#endif +#ifdef RANDOMIZE_ALLOCATED_MEMORY + /* fill the allocated space with junk */ + randomize_mem((char *) AllocChunkGetPointer(chunk), size); +#endif + + /* Ensure any padding bytes are marked NOACCESS. */ + VALGRIND_MAKE_MEM_NOACCESS((char *) AllocChunkGetPointer(chunk) + size, + chunk_size - size); + + /* Disallow external access to private part of chunk header. */ + VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN); + + return AllocChunkGetPointer(chunk); +} + +/* + * AllocSetFree + * Frees allocated memory; memory is removed from the set. + */ +static void +AllocSetFree(MemoryContext context, void *pointer) +{ + AllocSet set = (AllocSet) context; + AllocChunk chunk = AllocPointerGetChunk(pointer); + + /* Allow access to private part of chunk header. */ + VALGRIND_MAKE_MEM_DEFINED(chunk, ALLOCCHUNK_PRIVATE_LEN); + +#ifdef MEMORY_CONTEXT_CHECKING + /* Test for someone scribbling on unused space in chunk */ + if (chunk->requested_size < chunk->size) + if (!sentinel_ok(pointer, chunk->requested_size)) + elog(WARNING, "detected write past chunk end in %s %p", + set->header.name, chunk); +#endif + + if (chunk->size > set->allocChunkLimit) + { + /* + * Big chunks are certain to have been allocated as single-chunk + * blocks. Just unlink that block and return it to malloc(). + */ + AllocBlock block = (AllocBlock) (((char *) chunk) - ALLOC_BLOCKHDRSZ); + + /* + * Try to verify that we have a sane block pointer: it should + * reference the correct aset, and freeptr and endptr should point + * just past the chunk. 
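+		 *
+		 * Put another way, a single-chunk block is laid out as just
+		 * [block header][chunk header][chunk data], so freeptr and endptr
+		 * should both equal
+		 * ((char *) block) + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ + chunk->size,
+		 * which is what the test below checks.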
+ */ + if (block->aset != set || + block->freeptr != block->endptr || + block->freeptr != ((char *) block) + + (chunk->size + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ)) + elog(ERROR, "could not find block containing chunk %p", chunk); + + /* OK, remove block from aset's list and free it */ + if (block->prev) + block->prev->next = block->next; + else + set->blocks = block->next; + if (block->next) + block->next->prev = block->prev; + + context->mem_allocated -= block->endptr - ((char *) block); + +#ifdef CLOBBER_FREED_MEMORY + wipe_mem(block, block->freeptr - ((char *) block)); +#endif + free(block); + } + else + { + /* Normal case, put the chunk into appropriate freelist */ + int fidx = AllocSetFreeIndex(chunk->size); + + chunk->aset = (void *) set->freelist[fidx]; + +#ifdef CLOBBER_FREED_MEMORY + wipe_mem(pointer, chunk->size); +#endif + +#ifdef MEMORY_CONTEXT_CHECKING + /* Reset requested_size to 0 in chunks that are on freelist */ + chunk->requested_size = 0; +#endif + set->freelist[fidx] = chunk; + } +} + +/* + * AllocSetRealloc + * Returns new pointer to allocated memory of given size or NULL if + * request could not be completed; this memory is added to the set. + * Memory associated with given pointer is copied into the new memory, + * and the old memory is freed. + * + * Without MEMORY_CONTEXT_CHECKING, we don't know the old request size. This + * makes our Valgrind client requests less-precise, hazarding false negatives. + * (In principle, we could use VALGRIND_GET_VBITS() to rediscover the old + * request size.) + */ +static void * +AllocSetRealloc(MemoryContext context, void *pointer, Size size) +{ + AllocSet set = (AllocSet) context; + AllocChunk chunk = AllocPointerGetChunk(pointer); + Size oldsize; + + /* Allow access to private part of chunk header. */ + VALGRIND_MAKE_MEM_DEFINED(chunk, ALLOCCHUNK_PRIVATE_LEN); + + oldsize = chunk->size; + +#ifdef MEMORY_CONTEXT_CHECKING + /* Test for someone scribbling on unused space in chunk */ + if (chunk->requested_size < oldsize) + if (!sentinel_ok(pointer, chunk->requested_size)) + elog(WARNING, "detected write past chunk end in %s %p", + set->header.name, chunk); +#endif + + if (oldsize > set->allocChunkLimit) + { + /* + * The chunk must have been allocated as a single-chunk block. Use + * realloc() to make the containing block bigger, or smaller, with + * minimum space wastage. + */ + AllocBlock block = (AllocBlock) (((char *) chunk) - ALLOC_BLOCKHDRSZ); + Size chksize; + Size blksize; + Size oldblksize; + + /* + * Try to verify that we have a sane block pointer: it should + * reference the correct aset, and freeptr and endptr should point + * just past the chunk. + */ + if (block->aset != set || + block->freeptr != block->endptr || + block->freeptr != ((char *) block) + + (oldsize + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ)) + elog(ERROR, "could not find block containing chunk %p", chunk); + + /* + * Even if the new request is less than set->allocChunkLimit, we stick + * with the single-chunk block approach. Therefore we need + * chunk->size to be bigger than set->allocChunkLimit, so we don't get + * confused about the chunk's status in future calls. + */ + chksize = Max(size, set->allocChunkLimit + 1); + chksize = MAXALIGN(chksize); + + /* Do the realloc */ + blksize = chksize + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ; + oldblksize = block->endptr - ((char *) block); + + block = (AllocBlock) realloc(block, blksize); + if (block == NULL) + { + /* Disallow external access to private part of chunk header. 
*/ + VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN); + return NULL; + } + + /* updated separately, not to underflow when (oldblksize > blksize) */ + context->mem_allocated -= oldblksize; + context->mem_allocated += blksize; + + block->freeptr = block->endptr = ((char *) block) + blksize; + + /* Update pointers since block has likely been moved */ + chunk = (AllocChunk) (((char *) block) + ALLOC_BLOCKHDRSZ); + pointer = AllocChunkGetPointer(chunk); + if (block->prev) + block->prev->next = block; + else + set->blocks = block; + if (block->next) + block->next->prev = block; + chunk->size = chksize; + +#ifdef MEMORY_CONTEXT_CHECKING +#ifdef RANDOMIZE_ALLOCATED_MEMORY + /* We can only fill the extra space if we know the prior request */ + if (size > chunk->requested_size) + randomize_mem((char *) pointer + chunk->requested_size, + size - chunk->requested_size); +#endif + + /* + * realloc() (or randomize_mem()) will have left any newly-allocated + * part UNDEFINED, but we may need to adjust trailing bytes from the + * old allocation. + */ +#ifdef USE_VALGRIND + if (oldsize > chunk->requested_size) + VALGRIND_MAKE_MEM_UNDEFINED((char *) pointer + chunk->requested_size, + oldsize - chunk->requested_size); +#endif + + chunk->requested_size = size; + + /* set mark to catch clobber of "unused" space */ + if (size < chunk->size) + set_sentinel(pointer, size); +#else /* !MEMORY_CONTEXT_CHECKING */ + + /* + * We don't know how much of the old chunk size was the actual + * allocation; it could have been as small as one byte. We have to be + * conservative and just mark the entire old portion DEFINED. + */ + VALGRIND_MAKE_MEM_DEFINED(pointer, oldsize); +#endif + + /* Ensure any padding bytes are marked NOACCESS. */ + VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size, chksize - size); + + /* Disallow external access to private part of chunk header. */ + VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN); + + return pointer; + } + + /* + * Chunk sizes are aligned to power of 2 in AllocSetAlloc(). Maybe the + * allocated area already is >= the new size. (In particular, we will + * fall out here if the requested size is a decrease.) + */ + else if (oldsize >= size) + { +#ifdef MEMORY_CONTEXT_CHECKING + Size oldrequest = chunk->requested_size; + +#ifdef RANDOMIZE_ALLOCATED_MEMORY + /* We can only fill the extra space if we know the prior request */ + if (size > oldrequest) + randomize_mem((char *) pointer + oldrequest, + size - oldrequest); +#endif + + chunk->requested_size = size; + + /* + * If this is an increase, mark any newly-available part UNDEFINED. + * Otherwise, mark the obsolete part NOACCESS. + */ + if (size > oldrequest) + VALGRIND_MAKE_MEM_UNDEFINED((char *) pointer + oldrequest, + size - oldrequest); + else + VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size, + oldsize - size); + + /* set mark to catch clobber of "unused" space */ + if (size < oldsize) + set_sentinel(pointer, size); +#else /* !MEMORY_CONTEXT_CHECKING */ + + /* + * We don't have the information to determine whether we're growing + * the old request or shrinking it, so we conservatively mark the + * entire new allocation DEFINED. + */ + VALGRIND_MAKE_MEM_NOACCESS(pointer, oldsize); + VALGRIND_MAKE_MEM_DEFINED(pointer, size); +#endif + + /* Disallow external access to private part of chunk header. */ + VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN); + + return pointer; + } + else + { + /* + * Enlarge-a-small-chunk case. We just do this by brute force, ie, + * allocate a new chunk and copy the data. 
Since we know the existing + * data isn't huge, this won't involve any great memcpy expense, so + * it's not worth being smarter. (At one time we tried to avoid + * memcpy when it was possible to enlarge the chunk in-place, but that + * turns out to misbehave unpleasantly for repeated cycles of + * palloc/repalloc/pfree: the eventually freed chunks go into the + * wrong freelist for the next initial palloc request, and so we leak + * memory indefinitely. See pgsql-hackers archives for 2007-08-11.) + */ + AllocPointer newPointer; + + /* allocate new chunk */ + newPointer = AllocSetAlloc((MemoryContext) set, size); + + /* leave immediately if request was not completed */ + if (newPointer == NULL) + { + /* Disallow external access to private part of chunk header. */ + VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN); + return NULL; + } + + /* + * AllocSetAlloc() may have returned a region that is still NOACCESS. + * Change it to UNDEFINED for the moment; memcpy() will then transfer + * definedness from the old allocation to the new. If we know the old + * allocation, copy just that much. Otherwise, make the entire old + * chunk defined to avoid errors as we copy the currently-NOACCESS + * trailing bytes. + */ + VALGRIND_MAKE_MEM_UNDEFINED(newPointer, size); +#ifdef MEMORY_CONTEXT_CHECKING + oldsize = chunk->requested_size; +#else + VALGRIND_MAKE_MEM_DEFINED(pointer, oldsize); +#endif + + /* transfer existing data (certain to fit) */ + memcpy(newPointer, pointer, oldsize); + + /* free old chunk */ + AllocSetFree((MemoryContext) set, pointer); + + return newPointer; + } +} + +/* + * AllocSetGetChunkSpace + * Given a currently-allocated chunk, determine the total space + * it occupies (including all memory-allocation overhead). + */ +static Size +AllocSetGetChunkSpace(MemoryContext context, void *pointer) +{ + AllocChunk chunk = AllocPointerGetChunk(pointer); + Size result; + + VALGRIND_MAKE_MEM_DEFINED(chunk, ALLOCCHUNK_PRIVATE_LEN); + result = chunk->size + ALLOC_CHUNKHDRSZ; + VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN); + return result; +} + +/* + * AllocSetIsEmpty + * Is an allocset empty of any allocated space? + */ +static bool +AllocSetIsEmpty(MemoryContext context) +{ + /* + * For now, we say "empty" only if the context is new or just reset. We + * could examine the freelists to determine if all space has been freed, + * but it's not really worth the trouble for present uses of this + * functionality. + */ + if (context->isReset) + return true; + return false; +} + +/* + * AllocSetStats + * Compute stats about memory consumption of an allocset. + * + * printfunc: if not NULL, pass a human-readable stats string to this. + * passthru: pass this pointer through to printfunc. + * totals: if not NULL, add stats about this context into *totals. + * print_to_stderr: print stats to stderr if true, elog otherwise. 
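+ *
+ * For example, the string handed to printfunc looks like
+ * "8192 total in 1 blocks; 7928 free (0 chunks); 264 used"
+ * (illustrative values; see the snprintf format below).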
+ */ +static void +AllocSetStats(MemoryContext context, + MemoryStatsPrintFunc printfunc, void *passthru, + MemoryContextCounters *totals, bool print_to_stderr) +{ + AllocSet set = (AllocSet) context; + Size nblocks = 0; + Size freechunks = 0; + Size totalspace; + Size freespace = 0; + AllocBlock block; + int fidx; + + /* Include context header in totalspace */ + totalspace = MAXALIGN(sizeof(AllocSetContext)); + + for (block = set->blocks; block != NULL; block = block->next) + { + nblocks++; + totalspace += block->endptr - ((char *) block); + freespace += block->endptr - block->freeptr; + } + for (fidx = 0; fidx < ALLOCSET_NUM_FREELISTS; fidx++) + { + AllocChunk chunk; + + for (chunk = set->freelist[fidx]; chunk != NULL; + chunk = (AllocChunk) chunk->aset) + { + freechunks++; + freespace += chunk->size + ALLOC_CHUNKHDRSZ; + } + } + + if (printfunc) + { + char stats_string[200]; + + snprintf(stats_string, sizeof(stats_string), + "%zu total in %zd blocks; %zu free (%zd chunks); %zu used", + totalspace, nblocks, freespace, freechunks, + totalspace - freespace); + printfunc(context, passthru, stats_string, print_to_stderr); + } + + if (totals) + { + totals->nblocks += nblocks; + totals->freechunks += freechunks; + totals->totalspace += totalspace; + totals->freespace += freespace; + } +} + + +#ifdef MEMORY_CONTEXT_CHECKING + +/* + * AllocSetCheck + * Walk through chunks and check consistency of memory. + * + * NOTE: report errors as WARNING, *not* ERROR or FATAL. Otherwise you'll + * find yourself in an infinite loop when trouble occurs, because this + * routine will be entered again when elog cleanup tries to release memory! + */ +static void +AllocSetCheck(MemoryContext context) +{ + AllocSet set = (AllocSet) context; + const char *name = set->header.name; + AllocBlock prevblock; + AllocBlock block; + Size total_allocated = 0; + + for (prevblock = NULL, block = set->blocks; + block != NULL; + prevblock = block, block = block->next) + { + char *bpoz = ((char *) block) + ALLOC_BLOCKHDRSZ; + long blk_used = block->freeptr - bpoz; + long blk_data = 0; + long nchunks = 0; + + if (set->keeper == block) + total_allocated += block->endptr - ((char *) set); + else + total_allocated += block->endptr - ((char *) block); + + /* + * Empty block - empty can be keeper-block only + */ + if (!blk_used) + { + if (set->keeper != block) + elog(WARNING, "problem in alloc set %s: empty block %p", + name, block); + } + + /* + * Check block header fields + */ + if (block->aset != set || + block->prev != prevblock || + block->freeptr < bpoz || + block->freeptr > block->endptr) + elog(WARNING, "problem in alloc set %s: corrupt header in block %p", + name, block); + + /* + * Chunk walker + */ + while (bpoz < block->freeptr) + { + AllocChunk chunk = (AllocChunk) bpoz; + Size chsize, + dsize; + + /* Allow access to private part of chunk header. */ + VALGRIND_MAKE_MEM_DEFINED(chunk, ALLOCCHUNK_PRIVATE_LEN); + + chsize = chunk->size; /* aligned chunk size */ + dsize = chunk->requested_size; /* real data */ + + /* + * Check chunk size + */ + if (dsize > chsize) + elog(WARNING, "problem in alloc set %s: req size > alloc size for chunk %p in block %p", + name, chunk, block); + if (chsize < (1 << ALLOC_MINBITS)) + elog(WARNING, "problem in alloc set %s: bad size %zu for chunk %p in block %p", + name, chsize, chunk, block); + + /* single-chunk block? 
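+			 * An oversized chunk must be the only chunk in its block, so the
+			 * chunk plus its header should account for all of the block's used
+			 * space; anything else indicates corruption.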
*/ + if (chsize > set->allocChunkLimit && + chsize + ALLOC_CHUNKHDRSZ != blk_used) + elog(WARNING, "problem in alloc set %s: bad single-chunk %p in block %p", + name, chunk, block); + + /* + * If chunk is allocated, check for correct aset pointer. (If it's + * free, the aset is the freelist pointer, which we can't check as + * easily...) Note this is an incomplete test, since palloc(0) + * produces an allocated chunk with requested_size == 0. + */ + if (dsize > 0 && chunk->aset != (void *) set) + elog(WARNING, "problem in alloc set %s: bogus aset link in block %p, chunk %p", + name, block, chunk); + + /* + * Check for overwrite of padding space in an allocated chunk. + */ + if (chunk->aset == (void *) set && dsize < chsize && + !sentinel_ok(chunk, ALLOC_CHUNKHDRSZ + dsize)) + elog(WARNING, "problem in alloc set %s: detected write past chunk end in block %p, chunk %p", + name, block, chunk); + + /* + * If chunk is allocated, disallow external access to private part + * of chunk header. + */ + if (chunk->aset == (void *) set) + VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOCCHUNK_PRIVATE_LEN); + + blk_data += chsize; + nchunks++; + + bpoz += ALLOC_CHUNKHDRSZ + chsize; + } + + if ((blk_data + (nchunks * ALLOC_CHUNKHDRSZ)) != blk_used) + elog(WARNING, "problem in alloc set %s: found inconsistent memory block %p", + name, block); + } + + Assert(total_allocated == context->mem_allocated); +} + +#endif /* MEMORY_CONTEXT_CHECKING */ diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c new file mode 100644 index 0000000..7e2a20b --- /dev/null +++ b/src/backend/utils/mmgr/dsa.c @@ -0,0 +1,2287 @@ +/*------------------------------------------------------------------------- + * + * dsa.c + * Dynamic shared memory areas. + * + * This module provides dynamic shared memory areas which are built on top of + * DSM segments. While dsm.c allows segments of memory of shared memory to be + * created and shared between backends, it isn't designed to deal with small + * objects. A DSA area is a shared memory heap usually backed by one or more + * DSM segments which can allocate memory using dsa_allocate() and dsa_free(). + * Alternatively, it can be created in pre-existing shared memory, including a + * DSM segment, and then create extra DSM segments as required. Unlike the + * regular system heap, it deals in pseudo-pointers which must be converted to + * backend-local pointers before they are dereferenced. These pseudo-pointers + * can however be shared with other backends, and can be used to construct + * shared data structures. + * + * Each DSA area manages a set of DSM segments, adding new segments as + * required and detaching them when they are no longer needed. Each segment + * contains a number of 4KB pages, a free page manager for tracking + * consecutive runs of free pages, and a page map for tracking the source of + * objects allocated on each page. Allocation requests above 8KB are handled + * by choosing a segment and finding consecutive free pages in its free page + * manager. Allocation requests for smaller sizes are handled using pools of + * objects of a selection of sizes. Each pool consists of a number of 16 page + * (64KB) superblocks allocated in the same way as large objects. Allocation + * of large objects and new superblocks is serialized by a single LWLock, but + * allocation of small objects from pre-existing superblocks uses one LWLock + * per pool. Currently there is one pool, and therefore one lock, per size + * class. 
Per-core pools to increase concurrency and strategies for reducing + * the resulting fragmentation are areas for future research. Each superblock + * is managed with a 'span', which tracks the superblock's freelist. Free + * requests are handled by looking in the page map to find which span an + * address was allocated from, so that small objects can be returned to the + * appropriate free list, and large object pages can be returned directly to + * the free page map. When allocating, simple heuristics for selecting + * segments and superblocks try to encourage occupied memory to be + * concentrated, increasing the likelihood that whole superblocks can become + * empty and be returned to the free page manager, and whole segments can + * become empty and be returned to the operating system. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/mmgr/dsa.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "port/atomics.h" +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" +#include "utils/dsa.h" +#include "utils/freepage.h" +#include "utils/memutils.h" + +/* + * The size of the initial DSM segment that backs a dsa_area created by + * dsa_create. After creating some number of segments of this size we'll + * double this size, and so on. Larger segments may be created if necessary + * to satisfy large requests. + */ +#define DSA_INITIAL_SEGMENT_SIZE ((size_t) (1 * 1024 * 1024)) + +/* + * How many segments to create before we double the segment size. If this is + * low, then there is likely to be a lot of wasted space in the largest + * segment. If it is high, then we risk running out of segment slots (see + * dsm.c's limits on total number of segments), or limiting the total size + * an area can manage when using small pointers. + */ +#define DSA_NUM_SEGMENTS_AT_EACH_SIZE 2 + +/* + * The number of bits used to represent the offset part of a dsa_pointer. + * This controls the maximum size of a segment, the maximum possible + * allocation size and also the maximum number of segments per area. + */ +#if SIZEOF_DSA_POINTER == 4 +#define DSA_OFFSET_WIDTH 27 /* 32 segments of size up to 128MB */ +#else +#define DSA_OFFSET_WIDTH 40 /* 1024 segments of size up to 1TB */ +#endif + +/* + * The maximum number of DSM segments that an area can own, determined by + * the number of bits remaining (but capped at 1024). + */ +#define DSA_MAX_SEGMENTS \ + Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH))) + +/* The bitmask for extracting the offset from a dsa_pointer. */ +#define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1) + +/* The maximum size of a DSM segment. */ +#define DSA_MAX_SEGMENT_SIZE ((size_t) 1 << DSA_OFFSET_WIDTH) + +/* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */ +#define DSA_PAGES_PER_SUPERBLOCK 16 + +/* + * A magic number used as a sanity check for following DSM segments belonging + * to a DSA area (this number will be XORed with the area handle and + * the segment index). + */ +#define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608 + +/* Build a dsa_pointer given a segment number and offset. */ +#define DSA_MAKE_POINTER(segment_number, offset) \ + (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset)) + +/* Extract the segment number from a dsa_pointer. 
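+ * (A dsa_pointer is (segment_number << DSA_OFFSET_WIDTH) | offset, as
+ * built by DSA_MAKE_POINTER above, so shifting right by DSA_OFFSET_WIDTH
+ * recovers the segment number.)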
*/ +#define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH) + +/* Extract the offset from a dsa_pointer. */ +#define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK) + +/* The type used for index segment indexes (zero based). */ +typedef size_t dsa_segment_index; + +/* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */ +#define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0) + +/* + * How many bins of segments do we have? The bins are used to categorize + * segments by their largest contiguous run of free pages. + */ +#define DSA_NUM_SEGMENT_BINS 16 + +/* + * What is the lowest bin that holds segments that *might* have n contiguous + * free pages? There is no point in looking in segments in lower bins; they + * definitely can't service a request for n free pages. + */ +#define contiguous_pages_to_segment_bin(n) Min(fls(n), DSA_NUM_SEGMENT_BINS - 1) + +/* Macros for access to locks. */ +#define DSA_AREA_LOCK(area) (&area->control->lock) +#define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock) + +/* + * The header for an individual segment. This lives at the start of each DSM + * segment owned by a DSA area including the first segment (where it appears + * as part of the dsa_area_control struct). + */ +typedef struct +{ + /* Sanity check magic value. */ + uint32 magic; + /* Total number of pages in this segment (excluding metadata area). */ + size_t usable_pages; + /* Total size of this segment in bytes. */ + size_t size; + + /* + * Index of the segment that precedes this one in the same segment bin, or + * DSA_SEGMENT_INDEX_NONE if this is the first one. + */ + dsa_segment_index prev; + + /* + * Index of the segment that follows this one in the same segment bin, or + * DSA_SEGMENT_INDEX_NONE if this is the last one. + */ + dsa_segment_index next; + /* The index of the bin that contains this segment. */ + size_t bin; + + /* + * A flag raised to indicate that this segment is being returned to the + * operating system and has been unpinned. + */ + bool freed; +} dsa_segment_header; + +/* + * Metadata for one superblock. + * + * For most blocks, span objects are stored out-of-line; that is, the span + * object is not stored within the block itself. But, as an exception, for a + * "span of spans", the span object is stored "inline". The allocation is + * always exactly one page, and the dsa_area_span object is located at + * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS, + * and the remaining fields are used just as they would be in an ordinary + * block. We can't allocate spans out of ordinary superblocks because + * creating an ordinary superblock requires us to be able to allocate a span + * *first*. Doing it this way avoids that circularity. + */ +typedef struct +{ + dsa_pointer pool; /* Containing pool. */ + dsa_pointer prevspan; /* Previous span. */ + dsa_pointer nextspan; /* Next span. */ + dsa_pointer start; /* Starting address. */ + size_t npages; /* Length of span in pages. */ + uint16 size_class; /* Size class. */ + uint16 ninitialized; /* Maximum number of objects ever allocated. */ + uint16 nallocatable; /* Number of objects currently allocatable. */ + uint16 firstfree; /* First object on free list. */ + uint16 nmax; /* Maximum number of objects ever possible. */ + uint16 fclass; /* Current fullness class. */ +} dsa_area_span; + +/* + * Given a pointer to an object in a span, access the index of the next free + * object in the same span (ie in the span's freelist) as an L-value. 
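+ * That is, a free object's first two bytes hold the index of the next
+ * free object, forming a freelist threaded through the superblock itself;
+ * dsa_free pushes an object by setting
+ * NextFreeObjectIndex(object) = span->firstfree and then storing the
+ * object's own index in span->firstfree.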
+ */ +#define NextFreeObjectIndex(object) (* (uint16 *) (object)) + +/* + * Small allocations are handled by dividing a single block of memory into + * many small objects of equal size. The possible allocation sizes are + * defined by the following array. Larger size classes are spaced more widely + * than smaller size classes. We fudge the spacing for size classes >1kB to + * avoid space wastage: based on the knowledge that we plan to allocate 64kB + * blocks, we bump the maximum object size up to the largest multiple of + * 8 bytes that still lets us fit the same number of objects into one block. + * + * NB: Because of this fudging, if we were ever to use differently-sized blocks + * for small allocations, these size classes would need to be reworked to be + * optimal for the new size. + * + * NB: The optimal spacing for size classes, as well as the size of the blocks + * out of which small objects are allocated, is not a question that has one + * right answer. Some allocators (such as tcmalloc) use more closely-spaced + * size classes than we do here, while others (like aset.c) use more + * widely-spaced classes. Spacing the classes more closely avoids wasting + * memory within individual chunks, but also means a larger number of + * potentially-unfilled blocks. + */ +static const uint16 dsa_size_classes[] = { + sizeof(dsa_area_span), 0, /* special size classes */ + 8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */ + 80, 96, 112, 128, /* 4 classes separated by 16 bytes */ + 160, 192, 224, 256, /* 4 classes separated by 32 bytes */ + 320, 384, 448, 512, /* 4 classes separated by 64 bytes */ + 640, 768, 896, 1024, /* 4 classes separated by 128 bytes */ + 1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */ + 2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */ + 5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */ +}; +#define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes) + +/* Special size classes. */ +#define DSA_SCLASS_BLOCK_OF_SPANS 0 +#define DSA_SCLASS_SPAN_LARGE 1 + +/* + * The following lookup table is used to map the size of small objects + * (less than 1kB) onto the corresponding size class. To use this table, + * round the size of the object up to the next multiple of 8 bytes, and then + * index into this array. + */ +static const uint8 dsa_size_class_map[] = { + 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13, + 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, + 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, + 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25 +}; +#define DSA_SIZE_CLASS_MAP_QUANTUM 8 + +/* + * Superblocks are binned by how full they are. Generally, each fullness + * class corresponds to one quartile, but the block being used for + * allocations is always at the head of the list for fullness class 1, + * regardless of how full it really is. + */ +#define DSA_FULLNESS_CLASSES 4 + +/* + * A dsa_area_pool represents a set of objects of a given size class. + * + * Perhaps there should be multiple pools for the same size class for + * contention avoidance, but for now there is just one! + */ +typedef struct +{ + /* A lock protecting access to this pool. 
*/ + LWLock lock; + /* A set of linked lists of spans, arranged by fullness. */ + dsa_pointer spans[DSA_FULLNESS_CLASSES]; + /* Should we pad this out to a cacheline boundary? */ +} dsa_area_pool; + +/* + * The control block for an area. This lives in shared memory, at the start of + * the first DSM segment controlled by this area. + */ +typedef struct +{ + /* The segment header for the first segment. */ + dsa_segment_header segment_header; + /* The handle for this area. */ + dsa_handle handle; + /* The handles of the segments owned by this area. */ + dsm_handle segment_handles[DSA_MAX_SEGMENTS]; + /* Lists of segments, binned by maximum contiguous run of free pages. */ + dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS]; + /* The object pools for each size class. */ + dsa_area_pool pools[DSA_NUM_SIZE_CLASSES]; + /* The total size of all active segments. */ + size_t total_segment_size; + /* The maximum total size of backing storage we are allowed. */ + size_t max_total_segment_size; + /* Highest used segment index in the history of this area. */ + dsa_segment_index high_segment_index; + /* The reference count for this area. */ + int refcnt; + /* A flag indicating that this area has been pinned. */ + bool pinned; + /* The number of times that segments have been freed. */ + size_t freed_segment_counter; + /* The LWLock tranche ID. */ + int lwlock_tranche_id; + /* The general lock (protects everything except object pools). */ + LWLock lock; +} dsa_area_control; + +/* Given a pointer to a pool, find a dsa_pointer. */ +#define DsaAreaPoolToDsaPointer(area, p) \ + DSA_MAKE_POINTER(0, (char *) p - (char *) area->control) + +/* + * A dsa_segment_map is stored within the backend-private memory of each + * individual backend. It holds the base address of the segment within that + * backend, plus the addresses of key objects within the segment. Those + * could instead be derived from the base address but it's handy to have them + * around. + */ +typedef struct +{ + dsm_segment *segment; /* DSM segment */ + char *mapped_address; /* Address at which segment is mapped */ + dsa_segment_header *header; /* Header (same as mapped_address) */ + FreePageManager *fpm; /* Free page manager within segment. */ + dsa_pointer *pagemap; /* Page map within segment. */ +} dsa_segment_map; + +/* + * Per-backend state for a storage area. Backends obtain one of these by + * creating an area or attaching to an existing one using a handle. Each + * process that needs to use an area uses its own object to track where the + * segments are mapped. + */ +struct dsa_area +{ + /* Pointer to the control object in shared memory. */ + dsa_area_control *control; + + /* Has the mapping been pinned? */ + bool mapping_pinned; + + /* + * This backend's array of segment maps, ordered by segment index + * corresponding to control->segment_handles. Some of the area's segments + * may not be mapped in this backend yet, and some slots may have been + * freed and need to be detached; these operations happen on demand. + */ + dsa_segment_map segment_maps[DSA_MAX_SEGMENTS]; + + /* The highest segment index this backend has ever mapped. */ + dsa_segment_index high_segment_index; + + /* The last observed freed_segment_counter. */ + size_t freed_segment_counter; +}; + +#define DSA_SPAN_NOTHING_FREE ((uint16) -1) +#define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE) + +/* Given a pointer to a segment_map, obtain a segment index number. 
*/ +#define get_segment_index(area, segment_map_ptr) \ + (segment_map_ptr - &area->segment_maps[0]) + +static void init_span(dsa_area *area, dsa_pointer span_pointer, + dsa_area_pool *pool, dsa_pointer start, size_t npages, + uint16 size_class); +static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool, + int fromclass, int toclass); +static inline dsa_pointer alloc_object(dsa_area *area, int size_class); +static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool, + int size_class); +static dsa_segment_map *get_segment_by_index(dsa_area *area, + dsa_segment_index index); +static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer); +static void unlink_span(dsa_area *area, dsa_area_span *span); +static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span, + dsa_pointer span_pointer, int fclass); +static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map); +static dsa_segment_map *get_best_segment(dsa_area *area, size_t npages); +static dsa_segment_map *make_new_segment(dsa_area *area, size_t requested_pages); +static dsa_area *create_internal(void *place, size_t size, + int tranche_id, + dsm_handle control_handle, + dsm_segment *control_segment); +static dsa_area *attach_internal(void *place, dsm_segment *segment, + dsa_handle handle); +static void check_for_freed_segments(dsa_area *area); +static void check_for_freed_segments_locked(dsa_area *area); + +/* + * Create a new shared area in a new DSM segment. Further DSM segments will + * be allocated as required to extend the available space. + * + * We can't allocate a LWLock tranche_id within this function, because tranche + * IDs are a scarce resource; there are only 64k available, using low numbers + * when possible matters, and we have no provision for recycling them. So, + * we require the caller to provide one. + */ +dsa_area * +dsa_create(int tranche_id) +{ + dsm_segment *segment; + dsa_area *area; + + /* + * Create the DSM segment that will hold the shared control object and the + * first segment of usable space. + */ + segment = dsm_create(DSA_INITIAL_SEGMENT_SIZE, 0); + + /* + * All segments backing this area are pinned, so that DSA can explicitly + * control their lifetime (otherwise a newly created segment belonging to + * this area might be freed when the only backend that happens to have it + * mapped in ends, corrupting the area). + */ + dsm_pin_segment(segment); + + /* Create a new DSA area with the control object in this segment. */ + area = create_internal(dsm_segment_address(segment), + DSA_INITIAL_SEGMENT_SIZE, + tranche_id, + dsm_segment_handle(segment), segment); + + /* Clean up when the control segment detaches. */ + on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place, + PointerGetDatum(dsm_segment_address(segment))); + + return area; +} + +/* + * Create a new shared area in an existing shared memory space, which may be + * either DSM or Postmaster-initialized memory. DSM segments will be + * allocated as required to extend the available space, though that can be + * prevented with dsa_set_size_limit(area, size) using the same size provided + * to dsa_create_in_place. + * + * Areas created in-place must eventually be released by the backend that + * created them and all backends that attach to them. 
This can be done + * explicitly with dsa_release_in_place, or, in the special case that 'place' + * happens to be in a pre-existing DSM segment, by passing in a pointer to the + * segment so that a detach hook can be registered with the containing DSM + * segment. + * + * See dsa_create() for a note about the tranche arguments. + */ +dsa_area * +dsa_create_in_place(void *place, size_t size, + int tranche_id, dsm_segment *segment) +{ + dsa_area *area; + + area = create_internal(place, size, tranche_id, + DSM_HANDLE_INVALID, NULL); + + /* + * Clean up when the control segment detaches, if a containing DSM segment + * was provided. + */ + if (segment != NULL) + on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place, + PointerGetDatum(place)); + + return area; +} + +/* + * Obtain a handle that can be passed to other processes so that they can + * attach to the given area. Cannot be called for areas created with + * dsa_create_in_place. + */ +dsa_handle +dsa_get_handle(dsa_area *area) +{ + Assert(area->control->handle != DSM_HANDLE_INVALID); + return area->control->handle; +} + +/* + * Attach to an area given a handle generated (possibly in another process) by + * dsa_get_handle. The area must have been created with dsa_create (not + * dsa_create_in_place). + */ +dsa_area * +dsa_attach(dsa_handle handle) +{ + dsm_segment *segment; + dsa_area *area; + + /* + * An area handle is really a DSM segment handle for the first segment, so + * we go ahead and attach to that. + */ + segment = dsm_attach(handle); + if (segment == NULL) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not attach to dynamic shared area"))); + + area = attach_internal(dsm_segment_address(segment), segment, handle); + + /* Clean up when the control segment detaches. */ + on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place, + PointerGetDatum(dsm_segment_address(segment))); + + return area; +} + +/* + * Attach to an area that was created with dsa_create_in_place. The caller + * must somehow know the location in memory that was used when the area was + * created, though it may be mapped at a different virtual address in this + * process. + * + * See dsa_create_in_place for note about releasing in-place areas, and the + * optional 'segment' argument which can be provided to allow automatic + * release if the containing memory happens to be a DSM segment. + */ +dsa_area * +dsa_attach_in_place(void *place, dsm_segment *segment) +{ + dsa_area *area; + + area = attach_internal(place, NULL, DSM_HANDLE_INVALID); + + /* + * Clean up when the control segment detaches, if a containing DSM segment + * was provided. + */ + if (segment != NULL) + on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place, + PointerGetDatum(place)); + + return area; +} + +/* + * Release a DSA area that was produced by dsa_create_in_place or + * dsa_attach_in_place. The 'segment' argument is ignored but provides an + * interface suitable for on_dsm_detach, for the convenience of users who want + * to create a DSA segment inside an existing DSM segment and have it + * automatically released when the containing DSM segment is detached. + * 'place' should be the address of the place where the area was created. + * + * This callback is automatically registered for the DSM segment containing + * the control object of in-place areas when a segment is provided to + * dsa_create_in_place or dsa_attach_in_place, and also for all areas created + * with dsa_create. 
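+ *
+ * A minimal usage sketch (the caller-side variable names here are only
+ * illustrative): the creating backend calls
+ *     area = dsa_create_in_place(place, size, tranche_id, segment);
+ * and other backends call
+ *     area = dsa_attach_in_place(place, segment);
+ * Because a containing 'segment' was supplied, this callback is registered
+ * automatically and the area is released when that DSM segment detaches;
+ * with no containing segment, callers must invoke
+ * dsa_release_in_place(place) themselves.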
+ */ +void +dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place) +{ + dsa_release_in_place(DatumGetPointer(place)); +} + +/* + * Release a DSA area that was produced by dsa_create_in_place or + * dsa_attach_in_place. The 'code' argument is ignored but provides an + * interface suitable for on_shmem_exit or before_shmem_exit, for the + * convenience of users who want to create a DSA segment inside shared memory + * other than a DSM segment and have it automatically release at backend exit. + * 'place' should be the address of the place where the area was created. + */ +void +dsa_on_shmem_exit_release_in_place(int code, Datum place) +{ + dsa_release_in_place(DatumGetPointer(place)); +} + +/* + * Release a DSA area that was produced by dsa_create_in_place or + * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX' + * callbacks so that this is managed automatically, because failure to release + * an area created in-place leaks its segments permanently. + * + * This is also called automatically for areas produced by dsa_create or + * dsa_attach as an implementation detail. + */ +void +dsa_release_in_place(void *place) +{ + dsa_area_control *control = (dsa_area_control *) place; + int i; + + LWLockAcquire(&control->lock, LW_EXCLUSIVE); + Assert(control->segment_header.magic == + (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0)); + Assert(control->refcnt > 0); + if (--control->refcnt == 0) + { + for (i = 0; i <= control->high_segment_index; ++i) + { + dsm_handle handle; + + handle = control->segment_handles[i]; + if (handle != DSM_HANDLE_INVALID) + dsm_unpin_segment(handle); + } + } + LWLockRelease(&control->lock); +} + +/* + * Keep a DSA area attached until end of session or explicit detach. + * + * By default, areas are owned by the current resource owner, which means they + * are detached automatically when that scope ends. + */ +void +dsa_pin_mapping(dsa_area *area) +{ + int i; + + Assert(!area->mapping_pinned); + area->mapping_pinned = true; + + for (i = 0; i <= area->high_segment_index; ++i) + if (area->segment_maps[i].segment != NULL) + dsm_pin_mapping(area->segment_maps[i].segment); +} + +/* + * Allocate memory in this storage area. The return value is a dsa_pointer + * that can be passed to other processes, and converted to a local pointer + * with dsa_get_address. 'flags' is a bitmap which should be constructed + * from the following values: + * + * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations + * will result in an ERROR. + * + * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when + * no memory is available or a size limit established by dsa_set_size_limit + * would be exceeded. Otherwise, such allocations will result in an ERROR. + * + * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the + * contents of newly-allocated memory are indeterminate. + * + * These flags correspond to similarly named flags used by + * MemoryContextAllocExtended(). See also the macros dsa_allocate and + * dsa_allocate0 which expand to a call to this function with commonly used + * flags. + */ +dsa_pointer +dsa_allocate_extended(dsa_area *area, size_t size, int flags) +{ + uint16 size_class; + dsa_pointer start_pointer; + dsa_segment_map *segment_map; + dsa_pointer result; + + Assert(size > 0); + + /* Sanity check on huge individual allocation size. 
*/ + if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) || + ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size))) + elog(ERROR, "invalid DSA memory alloc request size %zu", size); + + /* + * If bigger than the largest size class, just grab a run of pages from + * the free page manager, instead of allocating an object from a pool. + * There will still be a span, but it's a special class of span that + * manages this whole allocation and simply gives all pages back to the + * free page manager when dsa_free is called. + */ + if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1]) + { + size_t npages = fpm_size_to_pages(size); + size_t first_page; + dsa_pointer span_pointer; + dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE]; + + /* Obtain a span object. */ + span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS); + if (!DsaPointerIsValid(span_pointer)) + { + /* Raise error unless asked not to. */ + if ((flags & DSA_ALLOC_NO_OOM) == 0) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed on DSA request of size %zu.", + size))); + return InvalidDsaPointer; + } + + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + + /* Find a segment from which to allocate. */ + segment_map = get_best_segment(area, npages); + if (segment_map == NULL) + segment_map = make_new_segment(area, npages); + if (segment_map == NULL) + { + /* Can't make any more segments: game over. */ + LWLockRelease(DSA_AREA_LOCK(area)); + dsa_free(area, span_pointer); + + /* Raise error unless asked not to. */ + if ((flags & DSA_ALLOC_NO_OOM) == 0) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed on DSA request of size %zu.", + size))); + return InvalidDsaPointer; + } + + /* + * Ask the free page manager for a run of pages. This should always + * succeed, since both get_best_segment and make_new_segment should + * only return a non-NULL pointer if it actually contains enough + * contiguous freespace. If it does fail, something in our backend + * private state is out of whack, so use FATAL to kill the process. + */ + if (!FreePageManagerGet(segment_map->fpm, npages, &first_page)) + elog(FATAL, + "dsa_allocate could not find %zu free pages", npages); + LWLockRelease(DSA_AREA_LOCK(area)); + + start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map), + first_page * FPM_PAGE_SIZE); + + /* Initialize span and pagemap. */ + LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE), + LW_EXCLUSIVE); + init_span(area, span_pointer, pool, start_pointer, npages, + DSA_SCLASS_SPAN_LARGE); + segment_map->pagemap[first_page] = span_pointer; + LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE)); + + /* Zero-initialize the memory if requested. */ + if ((flags & DSA_ALLOC_ZERO) != 0) + memset(dsa_get_address(area, start_pointer), 0, size); + + return start_pointer; + } + + /* Map allocation to a size class. */ + if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM) + { + int mapidx; + + /* For smaller sizes we have a lookup table... */ + mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) / + DSA_SIZE_CLASS_MAP_QUANTUM) - 1; + size_class = dsa_size_class_map[mapidx]; + } + else + { + uint16 min; + uint16 max; + + /* ... and for the rest we search by binary chop. 
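+		 * For example, a 1100-byte request exceeds the 1024-byte class, so
+		 * the search settles on the next class up, 1280 bytes (see
+		 * dsa_size_classes above).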
*/ + min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1]; + max = lengthof(dsa_size_classes) - 1; + + while (min < max) + { + uint16 mid = (min + max) / 2; + uint16 class_size = dsa_size_classes[mid]; + + if (class_size < size) + min = mid + 1; + else + max = mid; + } + + size_class = min; + } + Assert(size <= dsa_size_classes[size_class]); + Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]); + + /* Attempt to allocate an object from the appropriate pool. */ + result = alloc_object(area, size_class); + + /* Check for failure to allocate. */ + if (!DsaPointerIsValid(result)) + { + /* Raise error unless asked not to. */ + if ((flags & DSA_ALLOC_NO_OOM) == 0) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed on DSA request of size %zu.", size))); + return InvalidDsaPointer; + } + + /* Zero-initialize the memory if requested. */ + if ((flags & DSA_ALLOC_ZERO) != 0) + memset(dsa_get_address(area, result), 0, size); + + return result; +} + +/* + * Free memory obtained with dsa_allocate. + */ +void +dsa_free(dsa_area *area, dsa_pointer dp) +{ + dsa_segment_map *segment_map; + int pageno; + dsa_pointer span_pointer; + dsa_area_span *span; + char *superblock; + char *object; + size_t size; + int size_class; + + /* Make sure we don't have a stale segment in the slot 'dp' refers to. */ + check_for_freed_segments(area); + + /* Locate the object, span and pool. */ + segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp)); + pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE; + span_pointer = segment_map->pagemap[pageno]; + span = dsa_get_address(area, span_pointer); + superblock = dsa_get_address(area, span->start); + object = dsa_get_address(area, dp); + size_class = span->size_class; + size = dsa_size_classes[size_class]; + + /* + * Special case for large objects that live in a special span: we return + * those pages directly to the free page manager and free the span. + */ + if (span->size_class == DSA_SCLASS_SPAN_LARGE) + { + +#ifdef CLOBBER_FREED_MEMORY + memset(object, 0x7f, span->npages * FPM_PAGE_SIZE); +#endif + + /* Give pages back to free page manager. */ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + FreePageManagerPut(segment_map->fpm, + DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE, + span->npages); + LWLockRelease(DSA_AREA_LOCK(area)); + /* Unlink span. */ + LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE), + LW_EXCLUSIVE); + unlink_span(area, span); + LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE)); + /* Free the span object so it can be reused. */ + dsa_free(area, span_pointer); + return; + } + +#ifdef CLOBBER_FREED_MEMORY + memset(object, 0x7f, size); +#endif + + LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE); + + /* Put the object on the span's freelist. */ + Assert(object >= superblock); + Assert(object < superblock + DSA_SUPERBLOCK_SIZE); + Assert((object - superblock) % size == 0); + NextFreeObjectIndex(object) = span->firstfree; + span->firstfree = (object - superblock) / size; + ++span->nallocatable; + + /* + * See if the span needs to moved to a different fullness class, or be + * freed so its pages can be given back to the segment. + */ + if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1) + { + /* + * The block was completely full and is located in the + * highest-numbered fullness class, which is never scanned for free + * chunks. We must move it to the next-lower fullness class. 
+ */ + unlink_span(area, span); + add_span_to_fullness_class(area, span, span_pointer, + DSA_FULLNESS_CLASSES - 2); + + /* + * If this is the only span, and there is no active span, then we + * should probably move this span to fullness class 1. (Otherwise if + * you allocate exactly all the objects in the only span, it moves to + * class 3, then you free them all, it moves to 2, and then is given + * back, leaving no active span). + */ + } + else if (span->nallocatable == span->nmax && + (span->fclass != 1 || span->prevspan != InvalidDsaPointer)) + { + /* + * This entire block is free, and it's not the active block for this + * size class. Return the memory to the free page manager. We don't + * do this for the active block to prevent hysteresis: if we + * repeatedly allocate and free the only chunk in the active block, it + * will be very inefficient if we deallocate and reallocate the block + * every time. + */ + destroy_superblock(area, span_pointer); + } + + LWLockRelease(DSA_SCLASS_LOCK(area, size_class)); +} + +/* + * Obtain a backend-local address for a dsa_pointer. 'dp' must point to + * memory allocated by the given area (possibly in another process) that + * hasn't yet been freed. This may cause a segment to be mapped into the + * current process if required, and may cause freed segments to be unmapped. + */ +void * +dsa_get_address(dsa_area *area, dsa_pointer dp) +{ + dsa_segment_index index; + size_t offset; + + /* Convert InvalidDsaPointer to NULL. */ + if (!DsaPointerIsValid(dp)) + return NULL; + + /* Process any requests to detach from freed segments. */ + check_for_freed_segments(area); + + /* Break the dsa_pointer into its components. */ + index = DSA_EXTRACT_SEGMENT_NUMBER(dp); + offset = DSA_EXTRACT_OFFSET(dp); + Assert(index < DSA_MAX_SEGMENTS); + + /* Check if we need to cause this segment to be mapped in. */ + if (unlikely(area->segment_maps[index].mapped_address == NULL)) + { + /* Call for effect (we don't need the result). */ + get_segment_by_index(area, index); + } + + return area->segment_maps[index].mapped_address + offset; +} + +/* + * Pin this area, so that it will continue to exist even if all backends + * detach from it. In that case, the area can still be reattached to if a + * handle has been recorded somewhere. + */ +void +dsa_pin(dsa_area *area) +{ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + if (area->control->pinned) + { + LWLockRelease(DSA_AREA_LOCK(area)); + elog(ERROR, "dsa_area already pinned"); + } + area->control->pinned = true; + ++area->control->refcnt; + LWLockRelease(DSA_AREA_LOCK(area)); +} + +/* + * Undo the effects of dsa_pin, so that the given area can be freed when no + * backends are attached to it. May be called only if dsa_pin has been + * called. + */ +void +dsa_unpin(dsa_area *area) +{ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + Assert(area->control->refcnt > 1); + if (!area->control->pinned) + { + LWLockRelease(DSA_AREA_LOCK(area)); + elog(ERROR, "dsa_area not pinned"); + } + area->control->pinned = false; + --area->control->refcnt; + LWLockRelease(DSA_AREA_LOCK(area)); +} + +/* + * Set the total size limit for this area. This limit is checked whenever new + * segments need to be allocated from the operating system. If the new size + * limit is already exceeded, this has no immediate effect. + * + * Note that the total virtual memory usage may be temporarily larger than + * this limit when segments have been freed, but not yet detached by all + * backends that have attached to them. 
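+ *
+ * A typical pattern, per the dsa_create_in_place comments above, is to
+ * call dsa_set_size_limit(area, size) with the same size that was given
+ * to dsa_create_in_place, which keeps the area within its original space
+ * and prevents extra DSM segments from being created.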
+ */ +void +dsa_set_size_limit(dsa_area *area, size_t limit) +{ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + area->control->max_total_segment_size = limit; + LWLockRelease(DSA_AREA_LOCK(area)); +} + +/* + * Aggressively free all spare memory in the hope of returning DSM segments to + * the operating system. + */ +void +dsa_trim(dsa_area *area) +{ + int size_class; + + /* + * Trim in reverse pool order so we get to the spans-of-spans last, just + * in case any become entirely free while processing all the other pools. + */ + for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class) + { + dsa_area_pool *pool = &area->control->pools[size_class]; + dsa_pointer span_pointer; + + if (size_class == DSA_SCLASS_SPAN_LARGE) + { + /* Large object frees give back segments aggressively already. */ + continue; + } + + /* + * Search fullness class 1 only. That is where we expect to find an + * entirely empty superblock (entirely empty superblocks in other + * fullness classes are returned to the free page map by dsa_free). + */ + LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE); + span_pointer = pool->spans[1]; + while (DsaPointerIsValid(span_pointer)) + { + dsa_area_span *span = dsa_get_address(area, span_pointer); + dsa_pointer next = span->nextspan; + + if (span->nallocatable == span->nmax) + destroy_superblock(area, span_pointer); + + span_pointer = next; + } + LWLockRelease(DSA_SCLASS_LOCK(area, size_class)); + } +} + +/* + * Print out debugging information about the internal state of the shared + * memory area. + */ +void +dsa_dump(dsa_area *area) +{ + size_t i, + j; + + /* + * Note: This gives an inconsistent snapshot as it acquires and releases + * individual locks as it goes... + */ + + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + check_for_freed_segments_locked(area); + fprintf(stderr, "dsa_area handle %x:\n", area->control->handle); + fprintf(stderr, " max_total_segment_size: %zu\n", + area->control->max_total_segment_size); + fprintf(stderr, " total_segment_size: %zu\n", + area->control->total_segment_size); + fprintf(stderr, " refcnt: %d\n", area->control->refcnt); + fprintf(stderr, " pinned: %c\n", area->control->pinned ? 
't' : 'f'); + fprintf(stderr, " segment bins:\n"); + for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i) + { + if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE) + { + dsa_segment_index segment_index; + + fprintf(stderr, + " segment bin %zu (at least %d contiguous pages free):\n", + i, 1 << (i - 1)); + segment_index = area->control->segment_bins[i]; + while (segment_index != DSA_SEGMENT_INDEX_NONE) + { + dsa_segment_map *segment_map; + + segment_map = + get_segment_by_index(area, segment_index); + + fprintf(stderr, + " segment index %zu, usable_pages = %zu, " + "contiguous_pages = %zu, mapped at %p\n", + segment_index, + segment_map->header->usable_pages, + fpm_largest(segment_map->fpm), + segment_map->mapped_address); + segment_index = segment_map->header->next; + } + } + } + LWLockRelease(DSA_AREA_LOCK(area)); + + fprintf(stderr, " pools:\n"); + for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i) + { + bool found = false; + + LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE); + for (j = 0; j < DSA_FULLNESS_CLASSES; ++j) + if (DsaPointerIsValid(area->control->pools[i].spans[j])) + found = true; + if (found) + { + if (i == DSA_SCLASS_BLOCK_OF_SPANS) + fprintf(stderr, " pool for blocks of span objects:\n"); + else if (i == DSA_SCLASS_SPAN_LARGE) + fprintf(stderr, " pool for large object spans:\n"); + else + fprintf(stderr, + " pool for size class %zu (object size %hu bytes):\n", + i, dsa_size_classes[i]); + for (j = 0; j < DSA_FULLNESS_CLASSES; ++j) + { + if (!DsaPointerIsValid(area->control->pools[i].spans[j])) + fprintf(stderr, " fullness class %zu is empty\n", j); + else + { + dsa_pointer span_pointer = area->control->pools[i].spans[j]; + + fprintf(stderr, " fullness class %zu:\n", j); + while (DsaPointerIsValid(span_pointer)) + { + dsa_area_span *span; + + span = dsa_get_address(area, span_pointer); + fprintf(stderr, + " span descriptor at " + DSA_POINTER_FORMAT ", superblock at " + DSA_POINTER_FORMAT + ", pages = %zu, objects free = %hu/%hu\n", + span_pointer, span->start, span->npages, + span->nallocatable, span->nmax); + span_pointer = span->nextspan; + } + } + } + } + LWLockRelease(DSA_SCLASS_LOCK(area, i)); + } +} + +/* + * Return the smallest size that you can successfully provide to + * dsa_create_in_place. + */ +size_t +dsa_minimum_size(void) +{ + size_t size; + int pages = 0; + + size = MAXALIGN(sizeof(dsa_area_control)) + + MAXALIGN(sizeof(FreePageManager)); + + /* Figure out how many pages we need, including the page map... */ + while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages) + { + ++pages; + size += sizeof(dsa_pointer); + } + + return pages * FPM_PAGE_SIZE; +} + +/* + * Workhorse function for dsa_create and dsa_create_in_place. + */ +static dsa_area * +create_internal(void *place, size_t size, + int tranche_id, + dsm_handle control_handle, + dsm_segment *control_segment) +{ + dsa_area_control *control; + dsa_area *area; + dsa_segment_map *segment_map; + size_t usable_pages; + size_t total_pages; + size_t metadata_bytes; + int i; + + /* Sanity check on the space we have to work in. */ + if (size < dsa_minimum_size()) + elog(ERROR, "dsa_area space must be at least %zu, but %zu provided", + dsa_minimum_size(), size); + + /* Now figure out how much space is usable */ + total_pages = size / FPM_PAGE_SIZE; + metadata_bytes = + MAXALIGN(sizeof(dsa_area_control)) + + MAXALIGN(sizeof(FreePageManager)) + + total_pages * sizeof(dsa_pointer); + /* Add padding up to next page boundary. 
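+	 * For instance (illustrative figures only): with 4kB pages and, say,
+	 * 5000 bytes of metadata, the two lines below add 3192 bytes of padding,
+	 * rounding metadata_bytes up to 8192 so that the usable pages begin on a
+	 * page boundary.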
*/ + if (metadata_bytes % FPM_PAGE_SIZE != 0) + metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE); + Assert(metadata_bytes <= size); + usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE; + + /* + * Initialize the dsa_area_control object located at the start of the + * space. + */ + control = (dsa_area_control *) place; + memset(place, 0, sizeof(*control)); + control->segment_header.magic = + DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0; + control->segment_header.next = DSA_SEGMENT_INDEX_NONE; + control->segment_header.prev = DSA_SEGMENT_INDEX_NONE; + control->segment_header.usable_pages = usable_pages; + control->segment_header.freed = false; + control->segment_header.size = DSA_INITIAL_SEGMENT_SIZE; + control->handle = control_handle; + control->max_total_segment_size = (size_t) -1; + control->total_segment_size = size; + control->segment_handles[0] = control_handle; + for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i) + control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE; + control->refcnt = 1; + control->lwlock_tranche_id = tranche_id; + + /* + * Create the dsa_area object that this backend will use to access the + * area. Other backends will need to obtain their own dsa_area object by + * attaching. + */ + area = palloc(sizeof(dsa_area)); + area->control = control; + area->mapping_pinned = false; + memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS); + area->high_segment_index = 0; + area->freed_segment_counter = 0; + LWLockInitialize(&control->lock, control->lwlock_tranche_id); + for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i) + LWLockInitialize(DSA_SCLASS_LOCK(area, i), + control->lwlock_tranche_id); + + /* Set up the segment map for this process's mapping. */ + segment_map = &area->segment_maps[0]; + segment_map->segment = control_segment; + segment_map->mapped_address = place; + segment_map->header = (dsa_segment_header *) place; + segment_map->fpm = (FreePageManager *) + (segment_map->mapped_address + + MAXALIGN(sizeof(dsa_area_control))); + segment_map->pagemap = (dsa_pointer *) + (segment_map->mapped_address + + MAXALIGN(sizeof(dsa_area_control)) + + MAXALIGN(sizeof(FreePageManager))); + + /* Set up the free page map. */ + FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address); + /* There can be 0 usable pages if size is dsa_minimum_size(). */ + + if (usable_pages > 0) + FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE, + usable_pages); + + /* Put this segment into the appropriate bin. */ + control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0; + segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages); + + return area; +} + +/* + * Workhorse function for dsa_attach and dsa_attach_in_place. + */ +static dsa_area * +attach_internal(void *place, dsm_segment *segment, dsa_handle handle) +{ + dsa_area_control *control; + dsa_area *area; + dsa_segment_map *segment_map; + + control = (dsa_area_control *) place; + Assert(control->handle == handle); + Assert(control->segment_handles[0] == handle); + Assert(control->segment_header.magic == + (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0)); + + /* Build the backend-local area object. */ + area = palloc(sizeof(dsa_area)); + area->control = control; + area->mapping_pinned = false; + memset(&area->segment_maps[0], 0, + sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS); + area->high_segment_index = 0; + + /* Set up the segment map for this process's mapping. 
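+	 * The offsets computed below must match the layout established by
+	 * create_internal: the dsa_area_control first, then the FreePageManager,
+	 * then the page map, each at a MAXALIGN'd offset from the mapped
+	 * address.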
*/ + segment_map = &area->segment_maps[0]; + segment_map->segment = segment; /* NULL for in-place */ + segment_map->mapped_address = place; + segment_map->header = (dsa_segment_header *) segment_map->mapped_address; + segment_map->fpm = (FreePageManager *) + (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control))); + segment_map->pagemap = (dsa_pointer *) + (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) + + MAXALIGN(sizeof(FreePageManager))); + + /* Bump the reference count. */ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + if (control->refcnt == 0) + { + /* We can't attach to a DSA area that has already been destroyed. */ + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not attach to dynamic shared area"))); + } + ++control->refcnt; + area->freed_segment_counter = area->control->freed_segment_counter; + LWLockRelease(DSA_AREA_LOCK(area)); + + return area; +} + +/* + * Add a new span to fullness class 1 of the indicated pool. + */ +static void +init_span(dsa_area *area, + dsa_pointer span_pointer, + dsa_area_pool *pool, dsa_pointer start, size_t npages, + uint16 size_class) +{ + dsa_area_span *span = dsa_get_address(area, span_pointer); + size_t obsize = dsa_size_classes[size_class]; + + /* + * The per-pool lock must be held because we manipulate the span list for + * this pool. + */ + Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class))); + + /* Push this span onto the front of the span list for fullness class 1. */ + if (DsaPointerIsValid(pool->spans[1])) + { + dsa_area_span *head = (dsa_area_span *) + dsa_get_address(area, pool->spans[1]); + + head->prevspan = span_pointer; + } + span->pool = DsaAreaPoolToDsaPointer(area, pool); + span->nextspan = pool->spans[1]; + span->prevspan = InvalidDsaPointer; + pool->spans[1] = span_pointer; + + span->start = start; + span->npages = npages; + span->size_class = size_class; + span->ninitialized = 0; + if (size_class == DSA_SCLASS_BLOCK_OF_SPANS) + { + /* + * A block-of-spans contains its own descriptor, so mark one object as + * initialized and reduce the count of allocatable objects by one. + * Doing this here has the side effect of also reducing nmax by one, + * which is important to make sure we free this object at the correct + * time. + */ + span->ninitialized = 1; + span->nallocatable = FPM_PAGE_SIZE / obsize - 1; + } + else if (size_class != DSA_SCLASS_SPAN_LARGE) + span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize; + span->firstfree = DSA_SPAN_NOTHING_FREE; + span->nmax = span->nallocatable; + span->fclass = 1; +} + +/* + * Transfer the first span in one fullness class to the head of another + * fullness class. + */ +static bool +transfer_first_span(dsa_area *area, + dsa_area_pool *pool, int fromclass, int toclass) +{ + dsa_pointer span_pointer; + dsa_area_span *span; + dsa_area_span *nextspan; + + /* Can't do it if source list is empty. */ + span_pointer = pool->spans[fromclass]; + if (!DsaPointerIsValid(span_pointer)) + return false; + + /* Remove span from head of source list. */ + span = dsa_get_address(area, span_pointer); + pool->spans[fromclass] = span->nextspan; + if (DsaPointerIsValid(span->nextspan)) + { + nextspan = (dsa_area_span *) + dsa_get_address(area, span->nextspan); + nextspan->prevspan = InvalidDsaPointer; + } + + /* Add span to head of target list. 
*/ + span->nextspan = pool->spans[toclass]; + pool->spans[toclass] = span_pointer; + if (DsaPointerIsValid(span->nextspan)) + { + nextspan = (dsa_area_span *) + dsa_get_address(area, span->nextspan); + nextspan->prevspan = span_pointer; + } + span->fclass = toclass; + + return true; +} + +/* + * Allocate one object of the requested size class from the given area. + */ +static inline dsa_pointer +alloc_object(dsa_area *area, int size_class) +{ + dsa_area_pool *pool = &area->control->pools[size_class]; + dsa_area_span *span; + dsa_pointer block; + dsa_pointer result; + char *object; + size_t size; + + /* + * Even though ensure_active_superblock can in turn call alloc_object if + * it needs to allocate a new span, that's always from a different pool, + * and the order of lock acquisition is always the same, so it's OK that + * we hold this lock for the duration of this function. + */ + Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class))); + LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE); + + /* + * If there's no active superblock, we must successfully obtain one or + * fail the request. + */ + if (!DsaPointerIsValid(pool->spans[1]) && + !ensure_active_superblock(area, pool, size_class)) + { + result = InvalidDsaPointer; + } + else + { + /* + * There should be a block in fullness class 1 at this point, and it + * should never be completely full. Thus we can either pop an object + * from the free list or, failing that, initialize a new object. + */ + Assert(DsaPointerIsValid(pool->spans[1])); + span = (dsa_area_span *) + dsa_get_address(area, pool->spans[1]); + Assert(span->nallocatable > 0); + block = span->start; + Assert(size_class < DSA_NUM_SIZE_CLASSES); + size = dsa_size_classes[size_class]; + if (span->firstfree != DSA_SPAN_NOTHING_FREE) + { + result = block + span->firstfree * size; + object = dsa_get_address(area, result); + span->firstfree = NextFreeObjectIndex(object); + } + else + { + result = block + span->ninitialized * size; + ++span->ninitialized; + } + --span->nallocatable; + + /* If it's now full, move it to the highest-numbered fullness class. */ + if (span->nallocatable == 0) + transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1); + } + + Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class))); + LWLockRelease(DSA_SCLASS_LOCK(area, size_class)); + + return result; +} + +/* + * Ensure an active (i.e. fullness class 1) superblock, unless all existing + * superblocks are completely full and no more can be allocated. + * + * Fullness classes K of 0..N are loosely intended to represent blocks whose + * utilization percentage is at least K/N, but we only enforce this rigorously + * for the highest-numbered fullness class, which always contains exactly + * those blocks that are completely full. It's otherwise acceptable for a + * block to be in a higher-numbered fullness class than the one to which it + * logically belongs. In addition, the active block, which is always the + * first block in fullness class 1, is permitted to have a higher allocation + * percentage than would normally be allowable for that fullness class; we + * don't move it until it's completely full, and then it goes to the + * highest-numbered fullness class. + * + * It might seem odd that the active block is the head of fullness class 1 + * rather than fullness class 0, but experience with other allocators has + * shown that it's usually better to allocate from a block that's moderately + * full rather than one that's nearly empty. 
Insofar as is reasonably + * possible, we want to avoid performing new allocations in a block that would + * otherwise become empty soon. + */ +static bool +ensure_active_superblock(dsa_area *area, dsa_area_pool *pool, + int size_class) +{ + dsa_pointer span_pointer; + dsa_pointer start_pointer; + size_t obsize = dsa_size_classes[size_class]; + size_t nmax; + int fclass; + size_t npages = 1; + size_t first_page; + size_t i; + dsa_segment_map *segment_map; + + Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class))); + + /* + * Compute the number of objects that will fit in a block of this size + * class. Span-of-spans blocks are just a single page, and the first + * object isn't available for use because it describes the block-of-spans + * itself. + */ + if (size_class == DSA_SCLASS_BLOCK_OF_SPANS) + nmax = FPM_PAGE_SIZE / obsize - 1; + else + nmax = DSA_SUPERBLOCK_SIZE / obsize; + + /* + * If fullness class 1 is empty, try to find a span to put in it by + * scanning higher-numbered fullness classes (excluding the last one, + * whose blocks are certain to all be completely full). + */ + for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass) + { + span_pointer = pool->spans[fclass]; + + while (DsaPointerIsValid(span_pointer)) + { + int tfclass; + dsa_area_span *span; + dsa_area_span *nextspan; + dsa_area_span *prevspan; + dsa_pointer next_span_pointer; + + span = (dsa_area_span *) + dsa_get_address(area, span_pointer); + next_span_pointer = span->nextspan; + + /* Figure out what fullness class should contain this span. */ + tfclass = (nmax - span->nallocatable) + * (DSA_FULLNESS_CLASSES - 1) / nmax; + + /* Look up next span. */ + if (DsaPointerIsValid(span->nextspan)) + nextspan = (dsa_area_span *) + dsa_get_address(area, span->nextspan); + else + nextspan = NULL; + + /* + * If utilization has dropped enough that this now belongs in some + * other fullness class, move it there. + */ + if (tfclass < fclass) + { + /* Remove from the current fullness class list. */ + if (pool->spans[fclass] == span_pointer) + { + /* It was the head; remove it. */ + Assert(!DsaPointerIsValid(span->prevspan)); + pool->spans[fclass] = span->nextspan; + if (nextspan != NULL) + nextspan->prevspan = InvalidDsaPointer; + } + else + { + /* It was not the head. */ + Assert(DsaPointerIsValid(span->prevspan)); + prevspan = (dsa_area_span *) + dsa_get_address(area, span->prevspan); + prevspan->nextspan = span->nextspan; + } + if (nextspan != NULL) + nextspan->prevspan = span->prevspan; + + /* Push onto the head of the new fullness class list. */ + span->nextspan = pool->spans[tfclass]; + pool->spans[tfclass] = span_pointer; + span->prevspan = InvalidDsaPointer; + if (DsaPointerIsValid(span->nextspan)) + { + nextspan = (dsa_area_span *) + dsa_get_address(area, span->nextspan); + nextspan->prevspan = span_pointer; + } + span->fclass = tfclass; + } + + /* Advance to next span on list. */ + span_pointer = next_span_pointer; + } + + /* Stop now if we found a suitable block. */ + if (DsaPointerIsValid(pool->spans[1])) + return true; + } + + /* + * If there are no blocks that properly belong in fullness class 1, pick + * one from some other fullness class and move it there anyway, so that we + * have an allocation target. Our last choice is to transfer a block + * that's almost empty (and might become completely empty soon if left + * alone), but even that is better than failing, which is what we must do + * if there are no blocks at all with freespace. 
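+ *
+ * To make the fullness-class arithmetic used in the loop above concrete
+ * (with illustrative values DSA_FULLNESS_CLASSES = 4 and nmax = 64, not
+ * necessarily the real ones): a span with 16 of its 64 objects allocated
+ * computes tfclass = (16 * 3) / 64 = 0, one with 48 allocated computes
+ * (48 * 3) / 64 = 2, and only spans with roughly one third to two thirds
+ * of their objects in use land in class 1.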
+ */ + Assert(!DsaPointerIsValid(pool->spans[1])); + for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass) + if (transfer_first_span(area, pool, fclass, 1)) + return true; + if (!DsaPointerIsValid(pool->spans[1]) && + transfer_first_span(area, pool, 0, 1)) + return true; + + /* + * We failed to find an existing span with free objects, so we need to + * allocate a new superblock and construct a new span to manage it. + * + * First, get a dsa_area_span object to describe the new superblock block + * ... unless this allocation is for a dsa_area_span object, in which case + * that's surely not going to work. We handle that case by storing the + * span describing a block-of-spans inline. + */ + if (size_class != DSA_SCLASS_BLOCK_OF_SPANS) + { + span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS); + if (!DsaPointerIsValid(span_pointer)) + return false; + npages = DSA_PAGES_PER_SUPERBLOCK; + } + + /* Find or create a segment and allocate the superblock. */ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + segment_map = get_best_segment(area, npages); + if (segment_map == NULL) + { + segment_map = make_new_segment(area, npages); + if (segment_map == NULL) + { + LWLockRelease(DSA_AREA_LOCK(area)); + return false; + } + } + + /* + * This shouldn't happen: get_best_segment() or make_new_segment() + * promised that we can successfully allocate npages. + */ + if (!FreePageManagerGet(segment_map->fpm, npages, &first_page)) + elog(FATAL, + "dsa_allocate could not find %zu free pages for superblock", + npages); + LWLockRelease(DSA_AREA_LOCK(area)); + + /* Compute the start of the superblock. */ + start_pointer = + DSA_MAKE_POINTER(get_segment_index(area, segment_map), + first_page * FPM_PAGE_SIZE); + + /* + * If this is a block-of-spans, carve the descriptor right out of the + * allocated space. + */ + if (size_class == DSA_SCLASS_BLOCK_OF_SPANS) + { + /* + * We have a pointer into the segment. We need to build a dsa_pointer + * from the segment index and offset into the segment. + */ + span_pointer = start_pointer; + } + + /* Initialize span and pagemap. */ + init_span(area, span_pointer, pool, start_pointer, npages, size_class); + for (i = 0; i < npages; ++i) + segment_map->pagemap[first_page + i] = span_pointer; + + return true; +} + +/* + * Return the segment map corresponding to a given segment index, mapping the + * segment in if necessary. For internal segment book-keeping, this is called + * with the area lock held. It is also called by dsa_free and dsa_get_address + * without any locking, relying on the fact they have a known live segment + * index and they always call check_for_freed_segments to ensures that any + * freed segment occupying the same slot is detached first. + */ +static dsa_segment_map * +get_segment_by_index(dsa_area *area, dsa_segment_index index) +{ + if (unlikely(area->segment_maps[index].mapped_address == NULL)) + { + dsm_handle handle; + dsm_segment *segment; + dsa_segment_map *segment_map; + + /* + * If we are reached by dsa_free or dsa_get_address, there must be at + * least one object allocated in the referenced segment. Otherwise, + * their caller has a double-free or access-after-free bug, which we + * have no hope of detecting. So we know it's safe to access this + * array slot without holding a lock; it won't change underneath us. + * Furthermore, we know that we can see the latest contents of the + * slot, as explained in check_for_freed_segments, which those + * functions call before arriving here. 
+ */ + handle = area->control->segment_handles[index]; + + /* It's an error to try to access an unused slot. */ + if (handle == DSM_HANDLE_INVALID) + elog(ERROR, + "dsa_area could not attach to a segment that has been freed"); + + segment = dsm_attach(handle); + if (segment == NULL) + elog(ERROR, "dsa_area could not attach to segment"); + if (area->mapping_pinned) + dsm_pin_mapping(segment); + segment_map = &area->segment_maps[index]; + segment_map->segment = segment; + segment_map->mapped_address = dsm_segment_address(segment); + segment_map->header = + (dsa_segment_header *) segment_map->mapped_address; + segment_map->fpm = (FreePageManager *) + (segment_map->mapped_address + + MAXALIGN(sizeof(dsa_segment_header))); + segment_map->pagemap = (dsa_pointer *) + (segment_map->mapped_address + + MAXALIGN(sizeof(dsa_segment_header)) + + MAXALIGN(sizeof(FreePageManager))); + + /* Remember the highest index this backend has ever mapped. */ + if (area->high_segment_index < index) + area->high_segment_index = index; + + Assert(segment_map->header->magic == + (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index)); + } + + /* + * Callers of dsa_get_address() and dsa_free() don't hold the area lock, + * but it's a bug in the calling code and undefined behavior if the + * address is not live (ie if the segment might possibly have been freed, + * they're trying to use a dangling pointer). + * + * For dsa.c code that holds the area lock to manipulate segment_bins + * lists, it would be a bug if we ever reach a freed segment here. After + * it's marked as freed, the only thing any backend should do with it is + * unmap it, and it should always have done that in + * check_for_freed_segments_locked() before arriving here to resolve an + * index to a segment_map. + * + * Either way we can assert that we aren't returning a freed segment. + */ + Assert(!area->segment_maps[index].header->freed); + + return &area->segment_maps[index]; +} + +/* + * Return a superblock to the free page manager. If the underlying segment + * has become entirely free, then return it to the operating system. + * + * The appropriate pool lock must be held. + */ +static void +destroy_superblock(dsa_area *area, dsa_pointer span_pointer) +{ + dsa_area_span *span = dsa_get_address(area, span_pointer); + int size_class = span->size_class; + dsa_segment_map *segment_map; + + + /* Remove it from its fullness class list. */ + unlink_span(area, span); + + /* + * Note: Here we acquire the area lock while we already hold a per-pool + * lock. We never hold the area lock and then take a pool lock, or we + * could deadlock. + */ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + check_for_freed_segments_locked(area); + segment_map = + get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start)); + FreePageManagerPut(segment_map->fpm, + DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE, + span->npages); + /* Check if the segment is now entirely free. */ + if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages) + { + dsa_segment_index index = get_segment_index(area, segment_map); + + /* If it's not the segment with extra control data, free it. */ + if (index != 0) + { + /* + * Give it back to the OS, and allow other backends to detect that + * they need to detach. 
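+			 * (They detect it because the segment header's 'freed' flag is
+			 * set and freed_segment_counter is advanced below;
+			 * check_for_freed_segments_locked() then unmaps the stale
+			 * mapping in each backend before it resolves a segment index
+			 * that might now refer to a new segment in the same slot.)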
+ */ + unlink_segment(area, segment_map); + segment_map->header->freed = true; + Assert(area->control->total_segment_size >= + segment_map->header->size); + area->control->total_segment_size -= + segment_map->header->size; + dsm_unpin_segment(dsm_segment_handle(segment_map->segment)); + dsm_detach(segment_map->segment); + area->control->segment_handles[index] = DSM_HANDLE_INVALID; + ++area->control->freed_segment_counter; + segment_map->segment = NULL; + segment_map->header = NULL; + segment_map->mapped_address = NULL; + } + } + LWLockRelease(DSA_AREA_LOCK(area)); + + /* + * Span-of-spans blocks store the span which describes them within the + * block itself, so freeing the storage implicitly frees the descriptor + * also. If this is a block of any other type, we need to separately free + * the span object also. This recursive call to dsa_free will acquire the + * span pool's lock. We can't deadlock because the acquisition order is + * always some other pool and then the span pool. + */ + if (size_class != DSA_SCLASS_BLOCK_OF_SPANS) + dsa_free(area, span_pointer); +} + +static void +unlink_span(dsa_area *area, dsa_area_span *span) +{ + if (DsaPointerIsValid(span->nextspan)) + { + dsa_area_span *next = dsa_get_address(area, span->nextspan); + + next->prevspan = span->prevspan; + } + if (DsaPointerIsValid(span->prevspan)) + { + dsa_area_span *prev = dsa_get_address(area, span->prevspan); + + prev->nextspan = span->nextspan; + } + else + { + dsa_area_pool *pool = dsa_get_address(area, span->pool); + + pool->spans[span->fclass] = span->nextspan; + } +} + +static void +add_span_to_fullness_class(dsa_area *area, dsa_area_span *span, + dsa_pointer span_pointer, + int fclass) +{ + dsa_area_pool *pool = dsa_get_address(area, span->pool); + + if (DsaPointerIsValid(pool->spans[fclass])) + { + dsa_area_span *head = dsa_get_address(area, + pool->spans[fclass]); + + head->prevspan = span_pointer; + } + span->prevspan = InvalidDsaPointer; + span->nextspan = pool->spans[fclass]; + pool->spans[fclass] = span_pointer; + span->fclass = fclass; +} + +/* + * Detach from an area that was either created or attached to by this process. + */ +void +dsa_detach(dsa_area *area) +{ + int i; + + /* Detach from all segments. */ + for (i = 0; i <= area->high_segment_index; ++i) + if (area->segment_maps[i].segment != NULL) + dsm_detach(area->segment_maps[i].segment); + + /* + * Note that 'detaching' (= detaching from DSM segments) doesn't include + * 'releasing' (= adjusting the reference count). It would be nice to + * combine these operations, but client code might never get around to + * calling dsa_detach because of an error path, and a detach hook on any + * particular segment is too late to detach other segments in the area + * without risking a 'leak' warning in the non-error path. + */ + + /* Free the backend-local area object. */ + pfree(area); +} + +/* + * Unlink a segment from the bin that contains it. 
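+ *
+ * Each bin is a doubly linked list threaded through the segment headers:
+ * 'prev' and 'next' hold dsa_segment_index values (not raw pointers),
+ * DSA_SEGMENT_INDEX_NONE marks the ends, and the head of the list is
+ * stored in control->segment_bins[bin].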
+ */ +static void +unlink_segment(dsa_area *area, dsa_segment_map *segment_map) +{ + if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE) + { + dsa_segment_map *prev; + + prev = get_segment_by_index(area, segment_map->header->prev); + prev->header->next = segment_map->header->next; + } + else + { + Assert(area->control->segment_bins[segment_map->header->bin] == + get_segment_index(area, segment_map)); + area->control->segment_bins[segment_map->header->bin] = + segment_map->header->next; + } + if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE) + { + dsa_segment_map *next; + + next = get_segment_by_index(area, segment_map->header->next); + next->header->prev = segment_map->header->prev; + } +} + +/* + * Find a segment that could satisfy a request for 'npages' of contiguous + * memory, or return NULL if none can be found. This may involve attaching to + * segments that weren't previously attached so that we can query their free + * pages map. + */ +static dsa_segment_map * +get_best_segment(dsa_area *area, size_t npages) +{ + size_t bin; + + Assert(LWLockHeldByMe(DSA_AREA_LOCK(area))); + check_for_freed_segments_locked(area); + + /* + * Start searching from the first bin that *might* have enough contiguous + * pages. + */ + for (bin = contiguous_pages_to_segment_bin(npages); + bin < DSA_NUM_SEGMENT_BINS; + ++bin) + { + /* + * The minimum contiguous size that any segment in this bin should + * have. We'll re-bin if we see segments with fewer. + */ + size_t threshold = (size_t) 1 << (bin - 1); + dsa_segment_index segment_index; + + /* Search this bin for a segment with enough contiguous space. */ + segment_index = area->control->segment_bins[bin]; + while (segment_index != DSA_SEGMENT_INDEX_NONE) + { + dsa_segment_map *segment_map; + dsa_segment_index next_segment_index; + size_t contiguous_pages; + + segment_map = get_segment_by_index(area, segment_index); + next_segment_index = segment_map->header->next; + contiguous_pages = fpm_largest(segment_map->fpm); + + /* Not enough for the request, still enough for this bin. */ + if (contiguous_pages >= threshold && contiguous_pages < npages) + { + segment_index = next_segment_index; + continue; + } + + /* Re-bin it if it's no longer in the appropriate bin. */ + if (contiguous_pages < threshold) + { + size_t new_bin; + + new_bin = contiguous_pages_to_segment_bin(contiguous_pages); + + /* Remove it from its current bin. */ + unlink_segment(area, segment_map); + + /* Push it onto the front of its new bin. */ + segment_map->header->prev = DSA_SEGMENT_INDEX_NONE; + segment_map->header->next = + area->control->segment_bins[new_bin]; + segment_map->header->bin = new_bin; + area->control->segment_bins[new_bin] = segment_index; + if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE) + { + dsa_segment_map *next; + + next = get_segment_by_index(area, + segment_map->header->next); + Assert(next->header->bin == new_bin); + next->header->prev = segment_index; + } + + /* + * But fall through to see if it's enough to satisfy this + * request anyway.... + */ + } + + /* Check if we are done. */ + if (contiguous_pages >= npages) + return segment_map; + + /* Continue searching the same bin. */ + segment_index = next_segment_index; + } + } + + /* Not found. */ + return NULL; +} + +/* + * Create a new segment that can handle at least requested_pages. Returns + * NULL if the requested total size limit or maximum allowed number of + * segments would be exceeded. 
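+ *
+ * To illustrate the geometric sizing policy implemented below (with made-up
+ * constants, not necessarily the real ones): if DSA_INITIAL_SEGMENT_SIZE
+ * were 1MB and DSA_NUM_SEGMENTS_AT_EACH_SIZE were 2, then segment index 1
+ * would get a 1MB segment, indexes 2 and 3 would get 2MB each, indexes 4
+ * and 5 would get 4MB each, and so on, always capped by
+ * DSA_MAX_SEGMENT_SIZE and by whatever room remains under the size limit.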
+ */ +static dsa_segment_map * +make_new_segment(dsa_area *area, size_t requested_pages) +{ + dsa_segment_index new_index; + size_t metadata_bytes; + size_t total_size; + size_t total_pages; + size_t usable_pages; + dsa_segment_map *segment_map; + dsm_segment *segment; + + Assert(LWLockHeldByMe(DSA_AREA_LOCK(area))); + + /* Find a segment slot that is not in use (linearly for now). */ + for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index) + { + if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID) + break; + } + if (new_index == DSA_MAX_SEGMENTS) + return NULL; + + /* + * If the total size limit is already exceeded, then we exit early and + * avoid arithmetic wraparound in the unsigned expressions below. + */ + if (area->control->total_segment_size >= + area->control->max_total_segment_size) + return NULL; + + /* + * The size should be at least as big as requested, and at least big + * enough to follow a geometric series that approximately doubles the + * total storage each time we create a new segment. We use geometric + * growth because the underlying DSM system isn't designed for large + * numbers of segments (otherwise we might even consider just using one + * DSM segment for each large allocation and for each superblock, and then + * we wouldn't need to use FreePageManager). + * + * We decide on a total segment size first, so that we produce tidy + * power-of-two sized segments. This is a good property to have if we + * move to huge pages in the future. Then we work back to the number of + * pages we can fit. + */ + total_size = DSA_INITIAL_SEGMENT_SIZE * + ((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE)); + total_size = Min(total_size, DSA_MAX_SEGMENT_SIZE); + total_size = Min(total_size, + area->control->max_total_segment_size - + area->control->total_segment_size); + + total_pages = total_size / FPM_PAGE_SIZE; + metadata_bytes = + MAXALIGN(sizeof(dsa_segment_header)) + + MAXALIGN(sizeof(FreePageManager)) + + sizeof(dsa_pointer) * total_pages; + + /* Add padding up to next page boundary. */ + if (metadata_bytes % FPM_PAGE_SIZE != 0) + metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE); + if (total_size <= metadata_bytes) + return NULL; + usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE; + Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size); + + /* See if that is enough... */ + if (requested_pages > usable_pages) + { + /* + * We'll make an odd-sized segment, working forward from the requested + * number of pages. + */ + usable_pages = requested_pages; + metadata_bytes = + MAXALIGN(sizeof(dsa_segment_header)) + + MAXALIGN(sizeof(FreePageManager)) + + usable_pages * sizeof(dsa_pointer); + + /* Add padding up to next page boundary. */ + if (metadata_bytes % FPM_PAGE_SIZE != 0) + metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE); + total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE; + + /* Is that too large for dsa_pointer's addressing scheme? */ + if (total_size > DSA_MAX_SEGMENT_SIZE) + return NULL; + + /* Would that exceed the limit? */ + if (total_size > area->control->max_total_segment_size - + area->control->total_segment_size) + return NULL; + } + + /* Create the segment. */ + segment = dsm_create(total_size, 0); + if (segment == NULL) + return NULL; + dsm_pin_segment(segment); + if (area->mapping_pinned) + dsm_pin_mapping(segment); + + /* Store the handle in shared memory to be found by index. 
*/ + area->control->segment_handles[new_index] = + dsm_segment_handle(segment); + /* Track the highest segment index in the history of the area. */ + if (area->control->high_segment_index < new_index) + area->control->high_segment_index = new_index; + /* Track the highest segment index this backend has ever mapped. */ + if (area->high_segment_index < new_index) + area->high_segment_index = new_index; + /* Track total size of all segments. */ + area->control->total_segment_size += total_size; + Assert(area->control->total_segment_size <= + area->control->max_total_segment_size); + + /* Build a segment map for this segment in this backend. */ + segment_map = &area->segment_maps[new_index]; + segment_map->segment = segment; + segment_map->mapped_address = dsm_segment_address(segment); + segment_map->header = (dsa_segment_header *) segment_map->mapped_address; + segment_map->fpm = (FreePageManager *) + (segment_map->mapped_address + + MAXALIGN(sizeof(dsa_segment_header))); + segment_map->pagemap = (dsa_pointer *) + (segment_map->mapped_address + + MAXALIGN(sizeof(dsa_segment_header)) + + MAXALIGN(sizeof(FreePageManager))); + + /* Set up the free page map. */ + FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address); + FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE, + usable_pages); + + /* Set up the segment header and put it in the appropriate bin. */ + segment_map->header->magic = + DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index; + segment_map->header->usable_pages = usable_pages; + segment_map->header->size = total_size; + segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages); + segment_map->header->prev = DSA_SEGMENT_INDEX_NONE; + segment_map->header->next = + area->control->segment_bins[segment_map->header->bin]; + segment_map->header->freed = false; + area->control->segment_bins[segment_map->header->bin] = new_index; + if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE) + { + dsa_segment_map *next = + get_segment_by_index(area, segment_map->header->next); + + Assert(next->header->bin == segment_map->header->bin); + next->header->prev = new_index; + } + + return segment_map; +} + +/* + * Check if any segments have been freed by destroy_superblock, so we can + * detach from them in this backend. This function is called by + * dsa_get_address and dsa_free to make sure that a dsa_pointer they have + * received can be resolved to the correct segment. + * + * The danger we want to defend against is that there could be an old segment + * mapped into a given slot in this backend, and the dsa_pointer they have + * might refer to some new segment in the same slot. So those functions must + * be sure to process all instructions to detach from a freed segment that had + * been generated by the time this process received the dsa_pointer, before + * they call get_segment_by_index. + */ +static void +check_for_freed_segments(dsa_area *area) +{ + size_t freed_segment_counter; + + /* + * Any other process that has freed a segment has incremented + * freed_segment_counter while holding an LWLock, and that must precede + * any backend creating a new segment in the same slot while holding an + * LWLock, and that must precede the creation of any dsa_pointer pointing + * into the new segment which might reach us here, and the caller must + * have sent the dsa_pointer to this process using appropriate memory + * synchronization (some kind of locking or atomic primitive or system + * call). 
So all we need to do on the reading side is ask for the load of + * freed_segment_counter to follow the caller's load of the dsa_pointer it + * has, and we can be sure to detect any segments that had been freed as + * of the time that the dsa_pointer reached this process. + */ + pg_read_barrier(); + freed_segment_counter = area->control->freed_segment_counter; + if (unlikely(area->freed_segment_counter != freed_segment_counter)) + { + /* Check all currently mapped segments to find what's been freed. */ + LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + check_for_freed_segments_locked(area); + LWLockRelease(DSA_AREA_LOCK(area)); + } +} + +/* + * Workhorse for check_for_freed_segments(), and also used directly in path + * where the area lock is already held. This should be called after acquiring + * the lock but before looking up any segment by index number, to make sure we + * unmap any stale segments that might have previously had the same index as a + * current segment. + */ +static void +check_for_freed_segments_locked(dsa_area *area) +{ + size_t freed_segment_counter; + int i; + + Assert(LWLockHeldByMe(DSA_AREA_LOCK(area))); + freed_segment_counter = area->control->freed_segment_counter; + if (unlikely(area->freed_segment_counter != freed_segment_counter)) + { + for (i = 0; i <= area->high_segment_index; ++i) + { + if (area->segment_maps[i].header != NULL && + area->segment_maps[i].header->freed) + { + dsm_detach(area->segment_maps[i].segment); + area->segment_maps[i].segment = NULL; + area->segment_maps[i].header = NULL; + area->segment_maps[i].mapped_address = NULL; + } + } + area->freed_segment_counter = freed_segment_counter; + } +} diff --git a/src/backend/utils/mmgr/freepage.c b/src/backend/utils/mmgr/freepage.c new file mode 100644 index 0000000..4208317 --- /dev/null +++ b/src/backend/utils/mmgr/freepage.c @@ -0,0 +1,1886 @@ +/*------------------------------------------------------------------------- + * + * freepage.c + * Management of free memory pages. + * + * The intention of this code is to provide infrastructure for memory + * allocators written specifically for PostgreSQL. At least in the case + * of dynamic shared memory, we can't simply use malloc() or even + * relatively thin wrappers like palloc() which sit on top of it, because + * no allocator built into the operating system will deal with relative + * pointers. In the future, we may find other cases in which greater + * control over our own memory management seems desirable. + * + * A FreePageManager keeps track of which 4kB pages of memory are currently + * unused from the point of view of some higher-level memory allocator. + * Unlike a user-facing allocator such as palloc(), a FreePageManager can + * only allocate and free in units of whole pages, and freeing an + * allocation can only be done given knowledge of its length in pages. + * + * Since a free page manager has only a fixed amount of dedicated memory, + * and since there is no underlying allocator, it uses the free pages + * it is given to manage to store its bookkeeping data. It keeps multiple + * freelists of runs of pages, sorted by the size of the run; the head of + * each freelist is stored in the FreePageManager itself, and the first + * page of each run contains a relative pointer to the next run. See + * FreePageManagerGetInternal for more details on how the freelists are + * managed. 
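+ *
+ * The public interface is small.  A minimal usage sketch (illustrative
+ * only: 'base' is the start of the region being managed, 'region_pages' is
+ * a hypothetical page count, and we assume the manager's own bookkeeping
+ * fits in page 0):
+ *
+ *		FreePageManager *fpm = (FreePageManager *) base;
+ *		Size		first_page;
+ *
+ *		FreePageManagerInitialize(fpm, base);
+ *		FreePageManagerPut(fpm, 1, region_pages - 1);
+ *		if (FreePageManagerGet(fpm, 16, &first_page))
+ *			... pages first_page .. first_page + 15 are now ours ...
+ *
+ * dsa.c uses it in just this way, passing each segment's mapped address as
+ * 'base' (the FreePageManager itself lives a short, MAXALIGN'd distance
+ * into the segment rather than at 'base' exactly).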
+ * + * To avoid memory fragmentation, it's important to consolidate adjacent + * spans of pages whenever possible; otherwise, large allocation requests + * might not be satisfied even when sufficient contiguous space is + * available. Therefore, in addition to the freelists, we maintain an + * in-memory btree of free page ranges ordered by page number. If a + * range being freed precedes or follows a range that is already free, + * the existing range is extended; if it exactly bridges the gap between + * free ranges, then the two existing ranges are consolidated with the + * newly-freed range to form one great big range of free pages. + * + * When there is only one range of free pages, the btree is trivial and + * is stored within the FreePageManager proper; otherwise, pages are + * allocated from the area under management as needed. Even in cases + * where memory fragmentation is very severe, only a tiny fraction of + * the pages under management are consumed by this btree. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/mmgr/freepage.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "lib/stringinfo.h" +#include "miscadmin.h" + +#include "utils/freepage.h" +#include "utils/relptr.h" + + +/* Magic numbers to identify various page types */ +#define FREE_PAGE_SPAN_LEADER_MAGIC 0xea4020f0 +#define FREE_PAGE_LEAF_MAGIC 0x98eae728 +#define FREE_PAGE_INTERNAL_MAGIC 0x19aa32c9 + +/* Doubly linked list of spans of free pages; stored in first page of span. */ +struct FreePageSpanLeader +{ + int magic; /* always FREE_PAGE_SPAN_LEADER_MAGIC */ + Size npages; /* number of pages in span */ + RelptrFreePageSpanLeader prev; + RelptrFreePageSpanLeader next; +}; + +/* Common header for btree leaf and internal pages. */ +typedef struct FreePageBtreeHeader +{ + int magic; /* FREE_PAGE_LEAF_MAGIC or + * FREE_PAGE_INTERNAL_MAGIC */ + Size nused; /* number of items used */ + RelptrFreePageBtree parent; /* uplink */ +} FreePageBtreeHeader; + +/* Internal key; points to next level of btree. */ +typedef struct FreePageBtreeInternalKey +{ + Size first_page; /* low bound for keys on child page */ + RelptrFreePageBtree child; /* downlink */ +} FreePageBtreeInternalKey; + +/* Leaf key; no payload data. */ +typedef struct FreePageBtreeLeafKey +{ + Size first_page; /* first page in span */ + Size npages; /* number of pages in span */ +} FreePageBtreeLeafKey; + +/* Work out how many keys will fit on a page. 
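+ * (On a typical 64-bit build, with 4kB pages and 16-byte keys, that works
+ * out to a couple of hundred keys per page once the small header is taken
+ * off; the macros below compute the exact counts.)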
*/ +#define FPM_ITEMS_PER_INTERNAL_PAGE \ + ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \ + sizeof(FreePageBtreeInternalKey)) +#define FPM_ITEMS_PER_LEAF_PAGE \ + ((FPM_PAGE_SIZE - sizeof(FreePageBtreeHeader)) / \ + sizeof(FreePageBtreeLeafKey)) + +/* A btree page of either sort */ +struct FreePageBtree +{ + FreePageBtreeHeader hdr; + union + { + FreePageBtreeInternalKey internal_key[FPM_ITEMS_PER_INTERNAL_PAGE]; + FreePageBtreeLeafKey leaf_key[FPM_ITEMS_PER_LEAF_PAGE]; + } u; +}; + +/* Results of a btree search */ +typedef struct FreePageBtreeSearchResult +{ + FreePageBtree *page; + Size index; + bool found; + unsigned split_pages; +} FreePageBtreeSearchResult; + +/* Helper functions */ +static void FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, + FreePageBtree *btp); +static Size FreePageBtreeCleanup(FreePageManager *fpm); +static FreePageBtree *FreePageBtreeFindLeftSibling(char *base, + FreePageBtree *btp); +static FreePageBtree *FreePageBtreeFindRightSibling(char *base, + FreePageBtree *btp); +static Size FreePageBtreeFirstKey(FreePageBtree *btp); +static FreePageBtree *FreePageBtreeGetRecycled(FreePageManager *fpm); +static void FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, + Size index, Size first_page, FreePageBtree *child); +static void FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, + Size first_page, Size npages); +static void FreePageBtreeRecycle(FreePageManager *fpm, Size pageno); +static void FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, + Size index); +static void FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp); +static void FreePageBtreeSearch(FreePageManager *fpm, Size first_page, + FreePageBtreeSearchResult *result); +static Size FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page); +static Size FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page); +static FreePageBtree *FreePageBtreeSplitPage(FreePageManager *fpm, + FreePageBtree *btp); +static void FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp); +static void FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp, + FreePageBtree *parent, int level, StringInfo buf); +static void FreePageManagerDumpSpans(FreePageManager *fpm, + FreePageSpanLeader *span, Size expected_pages, + StringInfo buf); +static bool FreePageManagerGetInternal(FreePageManager *fpm, Size npages, + Size *first_page); +static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, + Size npages, bool soft); +static void FreePagePopSpanLeader(FreePageManager *fpm, Size pageno); +static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, + Size npages); +static Size FreePageManagerLargestContiguous(FreePageManager *fpm); +static void FreePageManagerUpdateLargest(FreePageManager *fpm); + +#ifdef FPM_EXTRA_ASSERTS +static Size sum_free_pages(FreePageManager *fpm); +#endif + +/* + * Initialize a new, empty free page manager. + * + * 'fpm' should reference caller-provided memory large enough to contain a + * FreePageManager. We'll initialize it here. + * + * 'base' is the address to which all pointers are relative. When managing + * a dynamic shared memory segment, it should normally be the base of the + * segment. When managing backend-private memory, it can be either NULL or, + * if managing a single contiguous extent of memory, the start of that extent. 
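+ *
+ * (The bookkeeping structures above store RelptrFreePageSpanLeader and
+ * RelptrFreePageBtree offsets rather than raw pointers, which is what lets
+ * backends that map the same segment at different addresses share them;
+ * 'base' is the address those offsets are taken relative to.)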
+ */ +void +FreePageManagerInitialize(FreePageManager *fpm, char *base) +{ + Size f; + + relptr_store(base, fpm->self, fpm); + relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL); + relptr_store(base, fpm->btree_recycle, (FreePageSpanLeader *) NULL); + fpm->btree_depth = 0; + fpm->btree_recycle_count = 0; + fpm->singleton_first_page = 0; + fpm->singleton_npages = 0; + fpm->contiguous_pages = 0; + fpm->contiguous_pages_dirty = true; +#ifdef FPM_EXTRA_ASSERTS + fpm->free_pages = 0; +#endif + + for (f = 0; f < FPM_NUM_FREELISTS; f++) + relptr_store(base, fpm->freelist[f], (FreePageSpanLeader *) NULL); +} + +/* + * Allocate a run of pages of the given length from the free page manager. + * The return value indicates whether we were able to satisfy the request; + * if true, the first page of the allocation is stored in *first_page. + */ +bool +FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page) +{ + bool result; + Size contiguous_pages; + + result = FreePageManagerGetInternal(fpm, npages, first_page); + + /* + * It's a bit counterintuitive, but allocating pages can actually create + * opportunities for cleanup that create larger ranges. We might pull a + * key out of the btree that enables the item at the head of the btree + * recycle list to be inserted; and then if there are more items behind it + * one of those might cause two currently-separated ranges to merge, + * creating a single range of contiguous pages larger than any that + * existed previously. It might be worth trying to improve the cleanup + * algorithm to avoid such corner cases, but for now we just notice the + * condition and do the appropriate reporting. + */ + contiguous_pages = FreePageBtreeCleanup(fpm); + if (fpm->contiguous_pages < contiguous_pages) + fpm->contiguous_pages = contiguous_pages; + + /* + * FreePageManagerGetInternal may have set contiguous_pages_dirty. + * Recompute contiguous_pages if so. + */ + FreePageManagerUpdateLargest(fpm); + +#ifdef FPM_EXTRA_ASSERTS + if (result) + { + Assert(fpm->free_pages >= npages); + fpm->free_pages -= npages; + } + Assert(fpm->free_pages == sum_free_pages(fpm)); + Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm)); +#endif + return result; +} + +#ifdef FPM_EXTRA_ASSERTS +static void +sum_free_pages_recurse(FreePageManager *fpm, FreePageBtree *btp, Size *sum) +{ + char *base = fpm_segment_base(fpm); + + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC || + btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); + ++*sum; + if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) + { + Size index; + + + for (index = 0; index < btp->hdr.nused; ++index) + { + FreePageBtree *child; + + child = relptr_access(base, btp->u.internal_key[index].child); + sum_free_pages_recurse(fpm, child, sum); + } + } +} +static Size +sum_free_pages(FreePageManager *fpm) +{ + FreePageSpanLeader *recycle; + char *base = fpm_segment_base(fpm); + Size sum = 0; + int list; + + /* Count the spans by scanning the freelists. */ + for (list = 0; list < FPM_NUM_FREELISTS; ++list) + { + + if (!relptr_is_null(fpm->freelist[list])) + { + FreePageSpanLeader *candidate = + relptr_access(base, fpm->freelist[list]); + + do + { + sum += candidate->npages; + candidate = relptr_access(base, candidate->next); + } while (candidate != NULL); + } + } + + /* Count btree internal pages. */ + if (fpm->btree_depth > 0) + { + FreePageBtree *root = relptr_access(base, fpm->btree_root); + + sum_free_pages_recurse(fpm, root, &sum); + } + + /* Count the recycle list. 
*/ + for (recycle = relptr_access(base, fpm->btree_recycle); + recycle != NULL; + recycle = relptr_access(base, recycle->next)) + { + Assert(recycle->npages == 1); + ++sum; + } + + return sum; +} +#endif + +/* + * Compute the size of the largest run of pages that the user could + * successfully get. + */ +static Size +FreePageManagerLargestContiguous(FreePageManager *fpm) +{ + char *base; + Size largest; + + base = fpm_segment_base(fpm); + largest = 0; + if (!relptr_is_null(fpm->freelist[FPM_NUM_FREELISTS - 1])) + { + FreePageSpanLeader *candidate; + + candidate = relptr_access(base, fpm->freelist[FPM_NUM_FREELISTS - 1]); + do + { + if (candidate->npages > largest) + largest = candidate->npages; + candidate = relptr_access(base, candidate->next); + } while (candidate != NULL); + } + else + { + Size f = FPM_NUM_FREELISTS - 1; + + do + { + --f; + if (!relptr_is_null(fpm->freelist[f])) + { + largest = f + 1; + break; + } + } while (f > 0); + } + + return largest; +} + +/* + * Recompute the size of the largest run of pages that the user could + * successfully get, if it has been marked dirty. + */ +static void +FreePageManagerUpdateLargest(FreePageManager *fpm) +{ + if (!fpm->contiguous_pages_dirty) + return; + + fpm->contiguous_pages = FreePageManagerLargestContiguous(fpm); + fpm->contiguous_pages_dirty = false; +} + +/* + * Transfer a run of pages to the free page manager. + */ +void +FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages) +{ + Size contiguous_pages; + + Assert(npages > 0); + + /* Record the new pages. */ + contiguous_pages = + FreePageManagerPutInternal(fpm, first_page, npages, false); + + /* + * If the new range we inserted into the page manager was contiguous with + * an existing range, it may have opened up cleanup opportunities. + */ + if (contiguous_pages > npages) + { + Size cleanup_contiguous_pages; + + cleanup_contiguous_pages = FreePageBtreeCleanup(fpm); + if (cleanup_contiguous_pages > contiguous_pages) + contiguous_pages = cleanup_contiguous_pages; + } + + /* See if we now have a new largest chunk. */ + if (fpm->contiguous_pages < contiguous_pages) + fpm->contiguous_pages = contiguous_pages; + + /* + * The earlier call to FreePageManagerPutInternal may have set + * contiguous_pages_dirty if it needed to allocate internal pages, so + * recompute contiguous_pages if necessary. + */ + FreePageManagerUpdateLargest(fpm); + +#ifdef FPM_EXTRA_ASSERTS + fpm->free_pages += npages; + Assert(fpm->free_pages == sum_free_pages(fpm)); + Assert(fpm->contiguous_pages == FreePageManagerLargestContiguous(fpm)); +#endif +} + +/* + * Produce a debugging dump of the state of a free page manager. + */ +char * +FreePageManagerDump(FreePageManager *fpm) +{ + char *base = fpm_segment_base(fpm); + StringInfoData buf; + FreePageSpanLeader *recycle; + bool dumped_any_freelist = false; + Size f; + + /* Initialize output buffer. */ + initStringInfo(&buf); + + /* Dump general stuff. */ + appendStringInfo(&buf, "metadata: self %zu max contiguous pages = %zu\n", + relptr_offset(fpm->self), fpm->contiguous_pages); + + /* Dump btree. */ + if (fpm->btree_depth > 0) + { + FreePageBtree *root; + + appendStringInfo(&buf, "btree depth %u:\n", fpm->btree_depth); + root = relptr_access(base, fpm->btree_root); + FreePageManagerDumpBtree(fpm, root, NULL, 0, &buf); + } + else if (fpm->singleton_npages > 0) + { + appendStringInfo(&buf, "singleton: %zu(%zu)\n", + fpm->singleton_first_page, fpm->singleton_npages); + } + + /* Dump btree recycle list. 
*/ + recycle = relptr_access(base, fpm->btree_recycle); + if (recycle != NULL) + { + appendStringInfoString(&buf, "btree recycle:"); + FreePageManagerDumpSpans(fpm, recycle, 1, &buf); + } + + /* Dump free lists. */ + for (f = 0; f < FPM_NUM_FREELISTS; ++f) + { + FreePageSpanLeader *span; + + if (relptr_is_null(fpm->freelist[f])) + continue; + if (!dumped_any_freelist) + { + appendStringInfoString(&buf, "freelists:\n"); + dumped_any_freelist = true; + } + appendStringInfo(&buf, " %zu:", f + 1); + span = relptr_access(base, fpm->freelist[f]); + FreePageManagerDumpSpans(fpm, span, f + 1, &buf); + } + + /* And return result to caller. */ + return buf.data; +} + + +/* + * The first_page value stored at index zero in any non-root page must match + * the first_page value stored in its parent at the index which points to that + * page. So when the value stored at index zero in a btree page changes, we've + * got to walk up the tree adjusting ancestor keys until we reach an ancestor + * where that key isn't index zero. This function should be called after + * updating the first key on the target page; it will propagate the change + * upward as far as needed. + * + * We assume here that the first key on the page has not changed enough to + * require changes in the ordering of keys on its ancestor pages. Thus, + * if we search the parent page for the first key greater than or equal to + * the first key on the current page, the downlink to this page will be either + * the exact index returned by the search (if the first key decreased) + * or one less (if the first key increased). + */ +static void +FreePageBtreeAdjustAncestorKeys(FreePageManager *fpm, FreePageBtree *btp) +{ + char *base = fpm_segment_base(fpm); + Size first_page; + FreePageBtree *parent; + FreePageBtree *child; + + /* This might be either a leaf or an internal page. */ + Assert(btp->hdr.nused > 0); + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE); + first_page = btp->u.leaf_key[0].first_page; + } + else + { + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE); + first_page = btp->u.internal_key[0].first_page; + } + child = btp; + + /* Loop until we find an ancestor that does not require adjustment. */ + for (;;) + { + Size s; + + parent = relptr_access(base, child->hdr.parent); + if (parent == NULL) + break; + s = FreePageBtreeSearchInternal(parent, first_page); + + /* Key is either at index s or index s-1; figure out which. */ + if (s >= parent->hdr.nused) + { + Assert(s == parent->hdr.nused); + --s; + } + else + { + FreePageBtree *check; + + check = relptr_access(base, parent->u.internal_key[s].child); + if (check != child) + { + Assert(s > 0); + --s; + } + } + +#ifdef USE_ASSERT_CHECKING + /* Debugging double-check. */ + { + FreePageBtree *check; + + check = relptr_access(base, parent->u.internal_key[s].child); + Assert(s < parent->hdr.nused); + Assert(child == check); + } +#endif + + /* Update the parent key. */ + parent->u.internal_key[s].first_page = first_page; + + /* + * If this is the first key in the parent, go up another level; else + * done. + */ + if (s > 0) + break; + child = parent; + } +} + +/* + * Attempt to reclaim space from the free-page btree. The return value is + * the largest range of contiguous pages created by the cleanup operation. 
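+ *
+ * Two depth-reduction cases are handled below: a root holding a single key
+ * can be recycled outright, shrinking the tree by one level, and a two-key
+ * leaf root whose free spans are separated only by the root page itself can
+ * collapse the whole tree back into the singleton representation.  After
+ * that, we also try to hand pages on the btree recycle list back to the
+ * free page manager.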
+ */ +static Size +FreePageBtreeCleanup(FreePageManager *fpm) +{ + char *base = fpm_segment_base(fpm); + Size max_contiguous_pages = 0; + + /* Attempt to shrink the depth of the btree. */ + while (!relptr_is_null(fpm->btree_root)) + { + FreePageBtree *root = relptr_access(base, fpm->btree_root); + + /* If the root contains only one key, reduce depth by one. */ + if (root->hdr.nused == 1) + { + /* Shrink depth of tree by one. */ + Assert(fpm->btree_depth > 0); + --fpm->btree_depth; + if (root->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + /* If root is a leaf, convert only entry to singleton range. */ + relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL); + fpm->singleton_first_page = root->u.leaf_key[0].first_page; + fpm->singleton_npages = root->u.leaf_key[0].npages; + } + else + { + FreePageBtree *newroot; + + /* If root is an internal page, make only child the root. */ + Assert(root->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + relptr_copy(fpm->btree_root, root->u.internal_key[0].child); + newroot = relptr_access(base, fpm->btree_root); + relptr_store(base, newroot->hdr.parent, (FreePageBtree *) NULL); + } + FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, root)); + } + else if (root->hdr.nused == 2 && + root->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + Size end_of_first; + Size start_of_second; + + end_of_first = root->u.leaf_key[0].first_page + + root->u.leaf_key[0].npages; + start_of_second = root->u.leaf_key[1].first_page; + + if (end_of_first + 1 == start_of_second) + { + Size root_page = fpm_pointer_to_page(base, root); + + if (end_of_first == root_page) + { + FreePagePopSpanLeader(fpm, root->u.leaf_key[0].first_page); + FreePagePopSpanLeader(fpm, root->u.leaf_key[1].first_page); + fpm->singleton_first_page = root->u.leaf_key[0].first_page; + fpm->singleton_npages = root->u.leaf_key[0].npages + + root->u.leaf_key[1].npages + 1; + fpm->btree_depth = 0; + relptr_store(base, fpm->btree_root, + (FreePageBtree *) NULL); + FreePagePushSpanLeader(fpm, fpm->singleton_first_page, + fpm->singleton_npages); + Assert(max_contiguous_pages == 0); + max_contiguous_pages = fpm->singleton_npages; + } + } + + /* Whether it worked or not, it's time to stop. */ + break; + } + else + { + /* Nothing more to do. Stop. */ + break; + } + } + + /* + * Attempt to free recycled btree pages. We skip this if releasing the + * recycled page would require a btree page split, because the page we're + * trying to recycle would be consumed by the split, which would be + * counterproductive. + * + * We also currently only ever attempt to recycle the first page on the + * list; that could be made more aggressive, but it's not clear that the + * complexity would be worthwhile. + */ + while (fpm->btree_recycle_count > 0) + { + FreePageBtree *btp; + Size first_page; + Size contiguous_pages; + + btp = FreePageBtreeGetRecycled(fpm); + first_page = fpm_pointer_to_page(base, btp); + contiguous_pages = FreePageManagerPutInternal(fpm, first_page, 1, true); + if (contiguous_pages == 0) + { + FreePageBtreeRecycle(fpm, first_page); + break; + } + else + { + if (contiguous_pages > max_contiguous_pages) + max_contiguous_pages = contiguous_pages; + } + } + + return max_contiguous_pages; +} + +/* + * Consider consolidating the given page with its left or right sibling, + * if it's fairly empty. + */ +static void +FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp) +{ + char *base = fpm_segment_base(fpm); + FreePageBtree *np; + Size max; + + /* + * We only try to consolidate pages that are less than a third full. 
We + * could be more aggressive about this, but that might risk performing + * consolidation only to end up splitting again shortly thereafter. Since + * the btree should be very small compared to the space under management, + * our goal isn't so much to ensure that it always occupies the absolutely + * smallest possible number of pages as to reclaim pages before things get + * too egregiously out of hand. + */ + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + max = FPM_ITEMS_PER_LEAF_PAGE; + else + { + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + max = FPM_ITEMS_PER_INTERNAL_PAGE; + } + if (btp->hdr.nused >= max / 3) + return; + + /* + * If we can fit our right sibling's keys onto this page, consolidate. + */ + np = FreePageBtreeFindRightSibling(base, btp); + if (np != NULL && btp->hdr.nused + np->hdr.nused <= max) + { + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + memcpy(&btp->u.leaf_key[btp->hdr.nused], &np->u.leaf_key[0], + sizeof(FreePageBtreeLeafKey) * np->hdr.nused); + btp->hdr.nused += np->hdr.nused; + } + else + { + memcpy(&btp->u.internal_key[btp->hdr.nused], &np->u.internal_key[0], + sizeof(FreePageBtreeInternalKey) * np->hdr.nused); + btp->hdr.nused += np->hdr.nused; + FreePageBtreeUpdateParentPointers(base, btp); + } + FreePageBtreeRemovePage(fpm, np); + return; + } + + /* + * If we can fit our keys onto our left sibling's page, consolidate. In + * this case, we move our keys onto the other page rather than vice versa, + * to avoid having to adjust ancestor keys. + */ + np = FreePageBtreeFindLeftSibling(base, btp); + if (np != NULL && btp->hdr.nused + np->hdr.nused <= max) + { + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + memcpy(&np->u.leaf_key[np->hdr.nused], &btp->u.leaf_key[0], + sizeof(FreePageBtreeLeafKey) * btp->hdr.nused); + np->hdr.nused += btp->hdr.nused; + } + else + { + memcpy(&np->u.internal_key[np->hdr.nused], &btp->u.internal_key[0], + sizeof(FreePageBtreeInternalKey) * btp->hdr.nused); + np->hdr.nused += btp->hdr.nused; + FreePageBtreeUpdateParentPointers(base, np); + } + FreePageBtreeRemovePage(fpm, btp); + return; + } +} + +/* + * Find the passed page's left sibling; that is, the page at the same level + * of the tree whose keyspace immediately precedes ours. + */ +static FreePageBtree * +FreePageBtreeFindLeftSibling(char *base, FreePageBtree *btp) +{ + FreePageBtree *p = btp; + int levels = 0; + + /* Move up until we can move left. */ + for (;;) + { + Size first_page; + Size index; + + first_page = FreePageBtreeFirstKey(p); + p = relptr_access(base, p->hdr.parent); + + if (p == NULL) + return NULL; /* we were passed the rightmost page */ + + index = FreePageBtreeSearchInternal(p, first_page); + if (index > 0) + { + Assert(p->u.internal_key[index].first_page == first_page); + p = relptr_access(base, p->u.internal_key[index - 1].child); + break; + } + Assert(index == 0); + ++levels; + } + + /* Descend left. */ + while (levels > 0) + { + Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + p = relptr_access(base, p->u.internal_key[p->hdr.nused - 1].child); + --levels; + } + Assert(p->hdr.magic == btp->hdr.magic); + + return p; +} + +/* + * Find the passed page's right sibling; that is, the page at the same level + * of the tree whose keyspace immediately follows ours. + */ +static FreePageBtree * +FreePageBtreeFindRightSibling(char *base, FreePageBtree *btp) +{ + FreePageBtree *p = btp; + int levels = 0; + + /* Move up until we can move right. 
*/ + for (;;) + { + Size first_page; + Size index; + + first_page = FreePageBtreeFirstKey(p); + p = relptr_access(base, p->hdr.parent); + + if (p == NULL) + return NULL; /* we were passed the rightmost page */ + + index = FreePageBtreeSearchInternal(p, first_page); + if (index < p->hdr.nused - 1) + { + Assert(p->u.internal_key[index].first_page == first_page); + p = relptr_access(base, p->u.internal_key[index + 1].child); + break; + } + Assert(index == p->hdr.nused - 1); + ++levels; + } + + /* Descend left. */ + while (levels > 0) + { + Assert(p->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + p = relptr_access(base, p->u.internal_key[0].child); + --levels; + } + Assert(p->hdr.magic == btp->hdr.magic); + + return p; +} + +/* + * Get the first key on a btree page. + */ +static Size +FreePageBtreeFirstKey(FreePageBtree *btp) +{ + Assert(btp->hdr.nused > 0); + + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + return btp->u.leaf_key[0].first_page; + else + { + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + return btp->u.internal_key[0].first_page; + } +} + +/* + * Get a page from the btree recycle list for use as a btree page. + */ +static FreePageBtree * +FreePageBtreeGetRecycled(FreePageManager *fpm) +{ + char *base = fpm_segment_base(fpm); + FreePageSpanLeader *victim = relptr_access(base, fpm->btree_recycle); + FreePageSpanLeader *newhead; + + Assert(victim != NULL); + newhead = relptr_access(base, victim->next); + if (newhead != NULL) + relptr_copy(newhead->prev, victim->prev); + relptr_store(base, fpm->btree_recycle, newhead); + Assert(fpm_pointer_is_page_aligned(base, victim)); + fpm->btree_recycle_count--; + return (FreePageBtree *) victim; +} + +/* + * Insert an item into an internal page. + */ +static void +FreePageBtreeInsertInternal(char *base, FreePageBtree *btp, Size index, + Size first_page, FreePageBtree *child) +{ + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + Assert(btp->hdr.nused <= FPM_ITEMS_PER_INTERNAL_PAGE); + Assert(index <= btp->hdr.nused); + memmove(&btp->u.internal_key[index + 1], &btp->u.internal_key[index], + sizeof(FreePageBtreeInternalKey) * (btp->hdr.nused - index)); + btp->u.internal_key[index].first_page = first_page; + relptr_store(base, btp->u.internal_key[index].child, child); + ++btp->hdr.nused; +} + +/* + * Insert an item into a leaf page. + */ +static void +FreePageBtreeInsertLeaf(FreePageBtree *btp, Size index, Size first_page, + Size npages) +{ + Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); + Assert(btp->hdr.nused <= FPM_ITEMS_PER_LEAF_PAGE); + Assert(index <= btp->hdr.nused); + memmove(&btp->u.leaf_key[index + 1], &btp->u.leaf_key[index], + sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index)); + btp->u.leaf_key[index].first_page = first_page; + btp->u.leaf_key[index].npages = npages; + ++btp->hdr.nused; +} + +/* + * Put a page on the btree recycle list. + */ +static void +FreePageBtreeRecycle(FreePageManager *fpm, Size pageno) +{ + char *base = fpm_segment_base(fpm); + FreePageSpanLeader *head = relptr_access(base, fpm->btree_recycle); + FreePageSpanLeader *span; + + span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno); + span->magic = FREE_PAGE_SPAN_LEADER_MAGIC; + span->npages = 1; + relptr_store(base, span->next, head); + relptr_store(base, span->prev, (FreePageSpanLeader *) NULL); + if (head != NULL) + relptr_store(base, head->prev, span); + relptr_store(base, fpm->btree_recycle, span); + fpm->btree_recycle_count++; +} + +/* + * Remove an item from the btree at the given position on the given page. 
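FreePageBtreeInsertLeaf() and FreePageBtreeInsertInternal() above, and FreePageBtreeRemove() just below, all manage a small sorted array inside a fixed-size page: insertion and deletion are a memmove() of the tail plus a count update. Here are the same mechanics over a plain array, as a self-contained sketch with illustrative types and capacity.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define PAGE_CAPACITY 8

typedef struct
{
    size_t first_page;
    size_t npages;
} LeafKey;

typedef struct
{
    size_t  nused;
    LeafKey keys[PAGE_CAPACITY];
} LeafPage;

/* Insert a key at 'index', shifting the tail of the array right by one. */
static void
leaf_insert(LeafPage *page, size_t index, size_t first_page, size_t npages)
{
    assert(page->nused < PAGE_CAPACITY && index <= page->nused);
    memmove(&page->keys[index + 1], &page->keys[index],
            sizeof(LeafKey) * (page->nused - index));
    page->keys[index].first_page = first_page;
    page->keys[index].npages = npages;
    page->nused++;
}

/* Remove the key at 'index', shifting the tail left by one. */
static void
leaf_remove(LeafPage *page, size_t index)
{
    assert(index < page->nused);
    page->nused--;
    memmove(&page->keys[index], &page->keys[index + 1],
            sizeof(LeafKey) * (page->nused - index));
}

int
main(void)
{
    LeafPage page = {0};

    leaf_insert(&page, 0, 100, 4);
    leaf_insert(&page, 1, 300, 2);
    leaf_insert(&page, 1, 200, 1);          /* lands between the other two */
    leaf_remove(&page, 0);
    printf("first key now %zu(%zu)\n",
           page.keys[0].first_page, page.keys[0].npages);   /* 200(1) */
    return 0;
}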
+ */ +static void +FreePageBtreeRemove(FreePageManager *fpm, FreePageBtree *btp, Size index) +{ + Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); + Assert(index < btp->hdr.nused); + + /* When last item is removed, extirpate entire page from btree. */ + if (btp->hdr.nused == 1) + { + FreePageBtreeRemovePage(fpm, btp); + return; + } + + /* Physically remove the key from the page. */ + --btp->hdr.nused; + if (index < btp->hdr.nused) + memmove(&btp->u.leaf_key[index], &btp->u.leaf_key[index + 1], + sizeof(FreePageBtreeLeafKey) * (btp->hdr.nused - index)); + + /* If we just removed the first key, adjust ancestor keys. */ + if (index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, btp); + + /* Consider whether to consolidate this page with a sibling. */ + FreePageBtreeConsolidate(fpm, btp); +} + +/* + * Remove a page from the btree. Caller is responsible for having relocated + * any keys from this page that are still wanted. The page is placed on the + * recycled list. + */ +static void +FreePageBtreeRemovePage(FreePageManager *fpm, FreePageBtree *btp) +{ + char *base = fpm_segment_base(fpm); + FreePageBtree *parent; + Size index; + Size first_page; + + for (;;) + { + /* Find parent page. */ + parent = relptr_access(base, btp->hdr.parent); + if (parent == NULL) + { + /* We are removing the root page. */ + relptr_store(base, fpm->btree_root, (FreePageBtree *) NULL); + fpm->btree_depth = 0; + Assert(fpm->singleton_first_page == 0); + Assert(fpm->singleton_npages == 0); + return; + } + + /* + * If the parent contains only one item, we need to remove it as well. + */ + if (parent->hdr.nused > 1) + break; + FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp)); + btp = parent; + } + + /* Find and remove the downlink. */ + first_page = FreePageBtreeFirstKey(btp); + if (parent->hdr.magic == FREE_PAGE_LEAF_MAGIC) + { + index = FreePageBtreeSearchLeaf(parent, first_page); + Assert(index < parent->hdr.nused); + if (index < parent->hdr.nused - 1) + memmove(&parent->u.leaf_key[index], + &parent->u.leaf_key[index + 1], + sizeof(FreePageBtreeLeafKey) + * (parent->hdr.nused - index - 1)); + } + else + { + index = FreePageBtreeSearchInternal(parent, first_page); + Assert(index < parent->hdr.nused); + if (index < parent->hdr.nused - 1) + memmove(&parent->u.internal_key[index], + &parent->u.internal_key[index + 1], + sizeof(FreePageBtreeInternalKey) + * (parent->hdr.nused - index - 1)); + } + parent->hdr.nused--; + Assert(parent->hdr.nused > 0); + + /* Recycle the page. */ + FreePageBtreeRecycle(fpm, fpm_pointer_to_page(base, btp)); + + /* Adjust ancestor keys if needed. */ + if (index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, parent); + + /* Consider whether to consolidate the parent with a sibling. */ + FreePageBtreeConsolidate(fpm, parent); +} + +/* + * Search the btree for an entry for the given first page and initialize + * *result with the results of the search. result->page and result->index + * indicate either the position of an exact match or the position at which + * the new key should be inserted. result->found is true for an exact match, + * otherwise false. result->split_pages will contain the number of additional + * btree pages that will be needed when performing a split to insert a key. + * Except as described above, the contents of fields in the result object are + * undefined on return. 
+ */ +static void +FreePageBtreeSearch(FreePageManager *fpm, Size first_page, + FreePageBtreeSearchResult *result) +{ + char *base = fpm_segment_base(fpm); + FreePageBtree *btp = relptr_access(base, fpm->btree_root); + Size index; + + result->split_pages = 1; + + /* If the btree is empty, there's nothing to find. */ + if (btp == NULL) + { + result->page = NULL; + result->found = false; + return; + } + + /* Descend until we hit a leaf. */ + while (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) + { + FreePageBtree *child; + bool found_exact; + + index = FreePageBtreeSearchInternal(btp, first_page); + found_exact = index < btp->hdr.nused && + btp->u.internal_key[index].first_page == first_page; + + /* + * If we found an exact match we descend directly. Otherwise, we + * descend into the child to the left if possible so that we can find + * the insertion point at that child's high end. + */ + if (!found_exact && index > 0) + --index; + + /* Track required split depth for leaf insert. */ + if (btp->hdr.nused >= FPM_ITEMS_PER_INTERNAL_PAGE) + { + Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE); + result->split_pages++; + } + else + result->split_pages = 0; + + /* Descend to appropriate child page. */ + Assert(index < btp->hdr.nused); + child = relptr_access(base, btp->u.internal_key[index].child); + Assert(relptr_access(base, child->hdr.parent) == btp); + btp = child; + } + + /* Track required split depth for leaf insert. */ + if (btp->hdr.nused >= FPM_ITEMS_PER_LEAF_PAGE) + { + Assert(btp->hdr.nused == FPM_ITEMS_PER_INTERNAL_PAGE); + result->split_pages++; + } + else + result->split_pages = 0; + + /* Search leaf page. */ + index = FreePageBtreeSearchLeaf(btp, first_page); + + /* Assemble results. */ + result->page = btp; + result->index = index; + result->found = index < btp->hdr.nused && + first_page == btp->u.leaf_key[index].first_page; +} + +/* + * Search an internal page for the first key greater than or equal to a given + * page number. Returns the index of that key, or one greater than the number + * of keys on the page if none. + */ +static Size +FreePageBtreeSearchInternal(FreePageBtree *btp, Size first_page) +{ + Size low = 0; + Size high = btp->hdr.nused; + + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + Assert(high > 0 && high <= FPM_ITEMS_PER_INTERNAL_PAGE); + + while (low < high) + { + Size mid = (low + high) / 2; + Size val = btp->u.internal_key[mid].first_page; + + if (first_page == val) + return mid; + else if (first_page < val) + high = mid; + else + low = mid + 1; + } + + return low; +} + +/* + * Search a leaf page for the first key greater than or equal to a given + * page number. Returns the index of that key, or one greater than the number + * of keys on the page if none. + */ +static Size +FreePageBtreeSearchLeaf(FreePageBtree *btp, Size first_page) +{ + Size low = 0; + Size high = btp->hdr.nused; + + Assert(btp->hdr.magic == FREE_PAGE_LEAF_MAGIC); + Assert(high > 0 && high <= FPM_ITEMS_PER_LEAF_PAGE); + + while (low < high) + { + Size mid = (low + high) / 2; + Size val = btp->u.leaf_key[mid].first_page; + + if (first_page == val) + return mid; + else if (first_page < val) + high = mid; + else + low = mid + 1; + } + + return low; +} + +/* + * Allocate a new btree page and move half the keys from the provided page + * to the new page. Caller is responsible for making sure that there's a + * page available from fpm->btree_recycle. Returns a pointer to the new page, + * to which caller must add a downlink. 
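FreePageBtreeSearchInternal() and FreePageBtreeSearchLeaf() above are the classic lower-bound binary search: return the index of the first key greater than or equal to the probe, or nused if there is none. (Their early return on an exact match is only a shortcut; with unique keys the result is the same.) The same loop over a plain array, as a standalone illustration:

#include <stddef.h>
#include <stdio.h>

static size_t
lower_bound(const size_t *keys, size_t nused, size_t probe)
{
    size_t low = 0;
    size_t high = nused;

    while (low < high)
    {
        size_t mid = low + (high - low) / 2;    /* avoids overflow */

        if (keys[mid] < probe)
            low = mid + 1;
        else
            high = mid;
    }
    return low;         /* first index with keys[index] >= probe */
}

int
main(void)
{
    size_t keys[] = {10, 20, 40, 80};

    printf("%zu %zu %zu\n",
           lower_bound(keys, 4, 20),    /* 1: exact match */
           lower_bound(keys, 4, 25),    /* 2: insertion point */
           lower_bound(keys, 4, 99));   /* 4: past the end */
    return 0;
}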
+ */ +static FreePageBtree * +FreePageBtreeSplitPage(FreePageManager *fpm, FreePageBtree *btp) +{ + FreePageBtree *newsibling; + + newsibling = FreePageBtreeGetRecycled(fpm); + newsibling->hdr.magic = btp->hdr.magic; + newsibling->hdr.nused = btp->hdr.nused / 2; + relptr_copy(newsibling->hdr.parent, btp->hdr.parent); + btp->hdr.nused -= newsibling->hdr.nused; + + if (btp->hdr.magic == FREE_PAGE_LEAF_MAGIC) + memcpy(&newsibling->u.leaf_key, + &btp->u.leaf_key[btp->hdr.nused], + sizeof(FreePageBtreeLeafKey) * newsibling->hdr.nused); + else + { + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + memcpy(&newsibling->u.internal_key, + &btp->u.internal_key[btp->hdr.nused], + sizeof(FreePageBtreeInternalKey) * newsibling->hdr.nused); + FreePageBtreeUpdateParentPointers(fpm_segment_base(fpm), newsibling); + } + + return newsibling; +} + +/* + * When internal pages are split or merged, the parent pointers of their + * children must be updated. + */ +static void +FreePageBtreeUpdateParentPointers(char *base, FreePageBtree *btp) +{ + Size i; + + Assert(btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC); + for (i = 0; i < btp->hdr.nused; ++i) + { + FreePageBtree *child; + + child = relptr_access(base, btp->u.internal_key[i].child); + relptr_store(base, child->hdr.parent, btp); + } +} + +/* + * Debugging dump of btree data. + */ +static void +FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp, + FreePageBtree *parent, int level, StringInfo buf) +{ + char *base = fpm_segment_base(fpm); + Size pageno = fpm_pointer_to_page(base, btp); + Size index; + FreePageBtree *check_parent; + + check_stack_depth(); + check_parent = relptr_access(base, btp->hdr.parent); + appendStringInfo(buf, " %zu@%d %c", pageno, level, + btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC ? 'i' : 'l'); + if (parent != check_parent) + appendStringInfo(buf, " [actual parent %zu, expected %zu]", + fpm_pointer_to_page(base, check_parent), + fpm_pointer_to_page(base, parent)); + appendStringInfoChar(buf, ':'); + for (index = 0; index < btp->hdr.nused; ++index) + { + if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) + appendStringInfo(buf, " %zu->%zu", + btp->u.internal_key[index].first_page, + relptr_offset(btp->u.internal_key[index].child) / FPM_PAGE_SIZE); + else + appendStringInfo(buf, " %zu(%zu)", + btp->u.leaf_key[index].first_page, + btp->u.leaf_key[index].npages); + } + appendStringInfoChar(buf, '\n'); + + if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) + { + for (index = 0; index < btp->hdr.nused; ++index) + { + FreePageBtree *child; + + child = relptr_access(base, btp->u.internal_key[index].child); + FreePageManagerDumpBtree(fpm, child, btp, level + 1, buf); + } + } +} + +/* + * Debugging dump of free-span data. + */ +static void +FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span, + Size expected_pages, StringInfo buf) +{ + char *base = fpm_segment_base(fpm); + + while (span != NULL) + { + if (span->npages != expected_pages) + appendStringInfo(buf, " %zu(%zu)", fpm_pointer_to_page(base, span), + span->npages); + else + appendStringInfo(buf, " %zu", fpm_pointer_to_page(base, span)); + span = relptr_access(base, span->next); + } + + appendStringInfoChar(buf, '\n'); +} + +/* + * This function allocates a run of pages of the given length from the free + * page manager. 
+ */ +static bool +FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page) +{ + char *base = fpm_segment_base(fpm); + FreePageSpanLeader *victim = NULL; + FreePageSpanLeader *prev; + FreePageSpanLeader *next; + FreePageBtreeSearchResult result; + Size victim_page = 0; /* placate compiler */ + Size f; + + /* + * Search for a free span. + * + * Right now, we use a simple best-fit policy here, but it's possible for + * this to result in memory fragmentation if we're repeatedly asked to + * allocate chunks just a little smaller than what we have available. + * Hopefully, this is unlikely, because we expect most requests to be + * single pages or superblock-sized chunks -- but no policy can be optimal + * under all circumstances unless it has knowledge of future allocation + * patterns. + */ + for (f = Min(npages, FPM_NUM_FREELISTS) - 1; f < FPM_NUM_FREELISTS; ++f) + { + /* Skip empty freelists. */ + if (relptr_is_null(fpm->freelist[f])) + continue; + + /* + * All of the freelists except the last one contain only items of a + * single size, so we just take the first one. But the final free + * list contains everything too big for any of the other lists, so we + * need to search the list. + */ + if (f < FPM_NUM_FREELISTS - 1) + victim = relptr_access(base, fpm->freelist[f]); + else + { + FreePageSpanLeader *candidate; + + candidate = relptr_access(base, fpm->freelist[f]); + do + { + if (candidate->npages >= npages && (victim == NULL || + victim->npages > candidate->npages)) + { + victim = candidate; + if (victim->npages == npages) + break; + } + candidate = relptr_access(base, candidate->next); + } while (candidate != NULL); + } + break; + } + + /* If we didn't find an allocatable span, return failure. */ + if (victim == NULL) + return false; + + /* Remove span from free list. */ + Assert(victim->magic == FREE_PAGE_SPAN_LEADER_MAGIC); + prev = relptr_access(base, victim->prev); + next = relptr_access(base, victim->next); + if (prev != NULL) + relptr_copy(prev->next, victim->next); + else + relptr_copy(fpm->freelist[f], victim->next); + if (next != NULL) + relptr_copy(next->prev, victim->prev); + victim_page = fpm_pointer_to_page(base, victim); + + /* Decide whether we might be invalidating contiguous_pages. */ + if (f == FPM_NUM_FREELISTS - 1 && + victim->npages == fpm->contiguous_pages) + { + /* + * The victim span came from the oversized freelist, and had the same + * size as the longest span. There may or may not be another one of + * the same size, so contiguous_pages must be recomputed just to be + * safe. + */ + fpm->contiguous_pages_dirty = true; + } + else if (f + 1 == fpm->contiguous_pages && + relptr_is_null(fpm->freelist[f])) + { + /* + * The victim span came from a fixed sized freelist, and it was the + * list for spans of the same size as the current longest span, and + * the list is now empty after removing the victim. So + * contiguous_pages must be recomputed without a doubt. + */ + fpm->contiguous_pages_dirty = true; + } + + /* + * If we haven't initialized the btree yet, the victim must be the single + * span stored within the FreePageManager itself. Otherwise, we need to + * update the btree. 
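The freelist scan in FreePageManagerGetInternal() relies on the bucketing rule f = Min(npages, FPM_NUM_FREELISTS) - 1: every span size up to the next-to-last bucket gets an exact-size list, and the last bucket is a catch-all for everything larger, which is why only that list needs a best-fit search. A small sketch of the indexing follows; the list count is a made-up stand-in for FPM_NUM_FREELISTS, whose real value is not shown here.

#include <stddef.h>
#include <stdio.h>

#define NUM_FREELISTS 129       /* hypothetical stand-in for FPM_NUM_FREELISTS */

/* Which freelist does a span of 'npages' pages live on? */
static size_t
freelist_index(size_t npages)
{
    size_t bucket = npages < NUM_FREELISTS ? npages : NUM_FREELISTS;

    return bucket - 1;          /* same as Min(npages, NUM_FREELISTS) - 1 */
}

int
main(void)
{
    printf("%zu %zu %zu\n",
           freelist_index(1),       /* 0: one-page spans */
           freelist_index(128),     /* 127: largest exact-size bucket */
           freelist_index(5000));   /* 128: the catch-all "oversized" list */
    return 0;
}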
+ */ + if (relptr_is_null(fpm->btree_root)) + { + Assert(victim_page == fpm->singleton_first_page); + Assert(victim->npages == fpm->singleton_npages); + Assert(victim->npages >= npages); + fpm->singleton_first_page += npages; + fpm->singleton_npages -= npages; + if (fpm->singleton_npages > 0) + FreePagePushSpanLeader(fpm, fpm->singleton_first_page, + fpm->singleton_npages); + } + else + { + /* + * If the span we found is exactly the right size, remove it from the + * btree completely. Otherwise, adjust the btree entry to reflect the + * still-unallocated portion of the span, and put that portion on the + * appropriate free list. + */ + FreePageBtreeSearch(fpm, victim_page, &result); + Assert(result.found); + if (victim->npages == npages) + FreePageBtreeRemove(fpm, result.page, result.index); + else + { + FreePageBtreeLeafKey *key; + + /* Adjust btree to reflect remaining pages. */ + Assert(victim->npages > npages); + key = &result.page->u.leaf_key[result.index]; + Assert(key->npages == victim->npages); + key->first_page += npages; + key->npages -= npages; + if (result.index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, result.page); + + /* Put the unallocated pages back on the appropriate free list. */ + FreePagePushSpanLeader(fpm, victim_page + npages, + victim->npages - npages); + } + } + + /* Return results to caller. */ + *first_page = fpm_pointer_to_page(base, victim); + return true; +} + +/* + * Put a range of pages into the btree and freelists, consolidating it with + * existing free spans just before and/or after it. If 'soft' is true, + * only perform the insertion if it can be done without allocating new btree + * pages; if false, do it always. Returns 0 if the soft flag caused the + * insertion to be skipped, or otherwise the size of the contiguous span + * created by the insertion. This may be larger than npages if we're able + * to consolidate with an adjacent range. + */ +static Size +FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages, + bool soft) +{ + char *base = fpm_segment_base(fpm); + FreePageBtreeSearchResult result; + FreePageBtreeLeafKey *prevkey = NULL; + FreePageBtreeLeafKey *nextkey = NULL; + FreePageBtree *np; + Size nindex; + + Assert(npages > 0); + + /* We can store a single free span without initializing the btree. */ + if (fpm->btree_depth == 0) + { + if (fpm->singleton_npages == 0) + { + /* Don't have a span yet; store this one. */ + fpm->singleton_first_page = first_page; + fpm->singleton_npages = npages; + FreePagePushSpanLeader(fpm, first_page, npages); + return fpm->singleton_npages; + } + else if (fpm->singleton_first_page + fpm->singleton_npages == + first_page) + { + /* New span immediately follows sole existing span. */ + fpm->singleton_npages += npages; + FreePagePopSpanLeader(fpm, fpm->singleton_first_page); + FreePagePushSpanLeader(fpm, fpm->singleton_first_page, + fpm->singleton_npages); + return fpm->singleton_npages; + } + else if (first_page + npages == fpm->singleton_first_page) + { + /* New span immediately precedes sole existing span. */ + FreePagePopSpanLeader(fpm, fpm->singleton_first_page); + fpm->singleton_first_page = first_page; + fpm->singleton_npages += npages; + FreePagePushSpanLeader(fpm, fpm->singleton_first_page, + fpm->singleton_npages); + return fpm->singleton_npages; + } + else + { + /* Not contiguous; we need to initialize the btree. 
*/ + Size root_page; + FreePageBtree *root; + + if (!relptr_is_null(fpm->btree_recycle)) + root = FreePageBtreeGetRecycled(fpm); + else if (soft) + return 0; /* Should not allocate if soft. */ + else if (FreePageManagerGetInternal(fpm, 1, &root_page)) + root = (FreePageBtree *) fpm_page_to_pointer(base, root_page); + else + { + /* We'd better be able to get a page from the existing range. */ + elog(FATAL, "free page manager btree is corrupt"); + } + + /* Create the btree and move the preexisting range into it. */ + root->hdr.magic = FREE_PAGE_LEAF_MAGIC; + root->hdr.nused = 1; + relptr_store(base, root->hdr.parent, (FreePageBtree *) NULL); + root->u.leaf_key[0].first_page = fpm->singleton_first_page; + root->u.leaf_key[0].npages = fpm->singleton_npages; + relptr_store(base, fpm->btree_root, root); + fpm->singleton_first_page = 0; + fpm->singleton_npages = 0; + fpm->btree_depth = 1; + + /* + * Corner case: it may be that the btree root took the very last + * free page. In that case, the sole btree entry covers a zero + * page run, which is invalid. Overwrite it with the entry we're + * trying to insert and get out. + */ + if (root->u.leaf_key[0].npages == 0) + { + root->u.leaf_key[0].first_page = first_page; + root->u.leaf_key[0].npages = npages; + FreePagePushSpanLeader(fpm, first_page, npages); + return npages; + } + + /* Fall through to insert the new key. */ + } + } + + /* Search the btree. */ + FreePageBtreeSearch(fpm, first_page, &result); + Assert(!result.found); + if (result.index > 0) + prevkey = &result.page->u.leaf_key[result.index - 1]; + if (result.index < result.page->hdr.nused) + { + np = result.page; + nindex = result.index; + nextkey = &result.page->u.leaf_key[result.index]; + } + else + { + np = FreePageBtreeFindRightSibling(base, result.page); + nindex = 0; + if (np != NULL) + nextkey = &np->u.leaf_key[0]; + } + + /* Consolidate with the previous entry if possible. */ + if (prevkey != NULL && prevkey->first_page + prevkey->npages >= first_page) + { + bool remove_next = false; + Size result; + + Assert(prevkey->first_page + prevkey->npages == first_page); + prevkey->npages = (first_page - prevkey->first_page) + npages; + + /* Check whether we can *also* consolidate with the following entry. */ + if (nextkey != NULL && + prevkey->first_page + prevkey->npages >= nextkey->first_page) + { + Assert(prevkey->first_page + prevkey->npages == + nextkey->first_page); + prevkey->npages = (nextkey->first_page - prevkey->first_page) + + nextkey->npages; + FreePagePopSpanLeader(fpm, nextkey->first_page); + remove_next = true; + } + + /* Put the span on the correct freelist and save size. */ + FreePagePopSpanLeader(fpm, prevkey->first_page); + FreePagePushSpanLeader(fpm, prevkey->first_page, prevkey->npages); + result = prevkey->npages; + + /* + * If we consolidated with both the preceding and following entries, + * we must remove the following entry. We do this last, because + * removing an element from the btree may invalidate pointers we hold + * into the current data structure. + * + * NB: The btree is technically in an invalid state a this point + * because we've already updated prevkey to cover the same key space + * as nextkey. FreePageBtreeRemove() shouldn't notice that, though. + */ + if (remove_next) + FreePageBtreeRemove(fpm, np, nindex); + + return result; + } + + /* Consolidate with the next entry if possible. */ + if (nextkey != NULL && first_page + npages >= nextkey->first_page) + { + Size newpages; + + /* Compute new size for span. 
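The consolidation checks above and immediately below merge the incoming run with a neighbouring free span only when the two are exactly adjacent, i.e. the earlier span ends on the page where the later one begins. A minimal sketch of that adjacency test, using an illustrative span struct rather than the real btree keys:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef struct
{
    size_t first_page;
    size_t npages;
} Span;

/* Merge 'next' into 'prev' if prev ends exactly where next begins. */
static bool
try_merge(Span *prev, const Span *next)
{
    if (prev->first_page + prev->npages != next->first_page)
        return false;
    prev->npages += next->npages;
    return true;
}

int
main(void)
{
    Span a = {10, 4};           /* pages 10..13 */
    Span b = {14, 6};           /* pages 14..19: adjacent to a */
    Span c = {25, 2};           /* not adjacent */

    printf("a+b merged: %d -> %zu pages\n", try_merge(&a, &b), a.npages);
    printf("a+c merged: %d\n", try_merge(&a, &c));
    return 0;
}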
*/ + Assert(first_page + npages == nextkey->first_page); + newpages = (nextkey->first_page - first_page) + nextkey->npages; + + /* Put span on correct free list. */ + FreePagePopSpanLeader(fpm, nextkey->first_page); + FreePagePushSpanLeader(fpm, first_page, newpages); + + /* Update key in place. */ + nextkey->first_page = first_page; + nextkey->npages = newpages; + + /* If reducing first key on page, ancestors might need adjustment. */ + if (nindex == 0) + FreePageBtreeAdjustAncestorKeys(fpm, np); + + return nextkey->npages; + } + + /* Split leaf page and as many of its ancestors as necessary. */ + if (result.split_pages > 0) + { + /* + * NB: We could consider various coping strategies here to avoid a + * split; most obviously, if np != result.page, we could target that + * page instead. More complicated shuffling strategies could be + * possible as well; basically, unless every single leaf page is 100% + * full, we can jam this key in there if we try hard enough. It's + * unlikely that trying that hard is worthwhile, but it's possible we + * might need to make more than no effort. For now, we just do the + * easy thing, which is nothing. + */ + + /* If this is a soft insert, it's time to give up. */ + if (soft) + return 0; + + /* Check whether we need to allocate more btree pages to split. */ + if (result.split_pages > fpm->btree_recycle_count) + { + Size pages_needed; + Size recycle_page; + Size i; + + /* + * Allocate the required number of pages and split each one in + * turn. This should never fail, because if we've got enough + * spans of free pages kicking around that we need additional + * storage space just to remember them all, then we should + * certainly have enough to expand the btree, which should only + * ever use a tiny number of pages compared to the number under + * management. If it does, something's badly screwed up. + */ + pages_needed = result.split_pages - fpm->btree_recycle_count; + for (i = 0; i < pages_needed; ++i) + { + if (!FreePageManagerGetInternal(fpm, 1, &recycle_page)) + elog(FATAL, "free page manager btree is corrupt"); + FreePageBtreeRecycle(fpm, recycle_page); + } + + /* + * The act of allocating pages to recycle may have invalidated the + * results of our previous btree research, so repeat it. (We could + * recheck whether any of our split-avoidance strategies that were + * not viable before now are, but it hardly seems worthwhile, so + * we don't bother. Consolidation can't be possible now if it + * wasn't previously.) + */ + FreePageBtreeSearch(fpm, first_page, &result); + + /* + * The act of allocating pages for use in constructing our btree + * should never cause any page to become more full, so the new + * split depth should be no greater than the old one, and perhaps + * less if we fortuitously allocated a chunk that freed up a slot + * on the page we need to update. + */ + Assert(result.split_pages <= fpm->btree_recycle_count); + } + + /* If we still need to perform a split, do it. */ + if (result.split_pages > 0) + { + FreePageBtree *split_target = result.page; + FreePageBtree *child = NULL; + Size key = first_page; + + for (;;) + { + FreePageBtree *newsibling; + FreePageBtree *parent; + + /* Identify parent page, which must receive downlink. */ + parent = relptr_access(base, split_target->hdr.parent); + + /* Split the page - downlink not added yet. */ + newsibling = FreePageBtreeSplitPage(fpm, split_target); + + /* + * At this point in the loop, we're always carrying a pending + * insertion. 
On the first pass, it's the actual key we're + * trying to insert; on subsequent passes, it's the downlink + * that needs to be added as a result of the split performed + * during the previous loop iteration. Since we've just split + * the page, there's definitely room on one of the two + * resulting pages. + */ + if (child == NULL) + { + Size index; + FreePageBtree *insert_into; + + insert_into = key < newsibling->u.leaf_key[0].first_page ? + split_target : newsibling; + index = FreePageBtreeSearchLeaf(insert_into, key); + FreePageBtreeInsertLeaf(insert_into, index, key, npages); + if (index == 0 && insert_into == split_target) + FreePageBtreeAdjustAncestorKeys(fpm, split_target); + } + else + { + Size index; + FreePageBtree *insert_into; + + insert_into = + key < newsibling->u.internal_key[0].first_page ? + split_target : newsibling; + index = FreePageBtreeSearchInternal(insert_into, key); + FreePageBtreeInsertInternal(base, insert_into, index, + key, child); + relptr_store(base, child->hdr.parent, insert_into); + if (index == 0 && insert_into == split_target) + FreePageBtreeAdjustAncestorKeys(fpm, split_target); + } + + /* If the page we just split has no parent, split the root. */ + if (parent == NULL) + { + FreePageBtree *newroot; + + newroot = FreePageBtreeGetRecycled(fpm); + newroot->hdr.magic = FREE_PAGE_INTERNAL_MAGIC; + newroot->hdr.nused = 2; + relptr_store(base, newroot->hdr.parent, + (FreePageBtree *) NULL); + newroot->u.internal_key[0].first_page = + FreePageBtreeFirstKey(split_target); + relptr_store(base, newroot->u.internal_key[0].child, + split_target); + relptr_store(base, split_target->hdr.parent, newroot); + newroot->u.internal_key[1].first_page = + FreePageBtreeFirstKey(newsibling); + relptr_store(base, newroot->u.internal_key[1].child, + newsibling); + relptr_store(base, newsibling->hdr.parent, newroot); + relptr_store(base, fpm->btree_root, newroot); + fpm->btree_depth++; + + break; + } + + /* If the parent page isn't full, insert the downlink. */ + key = newsibling->u.internal_key[0].first_page; + if (parent->hdr.nused < FPM_ITEMS_PER_INTERNAL_PAGE) + { + Size index; + + index = FreePageBtreeSearchInternal(parent, key); + FreePageBtreeInsertInternal(base, parent, index, + key, newsibling); + relptr_store(base, newsibling->hdr.parent, parent); + if (index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, parent); + break; + } + + /* The parent also needs to be split, so loop around. */ + child = newsibling; + split_target = parent; + } + + /* + * The loop above did the insert, so just need to update the free + * list, and we're done. + */ + FreePagePushSpanLeader(fpm, first_page, npages); + + return npages; + } + } + + /* Physically add the key to the page. */ + Assert(result.page->hdr.nused < FPM_ITEMS_PER_LEAF_PAGE); + FreePageBtreeInsertLeaf(result.page, result.index, first_page, npages); + + /* If new first key on page, ancestors might need adjustment. */ + if (result.index == 0) + FreePageBtreeAdjustAncestorKeys(fpm, result.page); + + /* Put it on the free list. */ + FreePagePushSpanLeader(fpm, first_page, npages); + + return npages; +} + +/* + * Remove a FreePageSpanLeader from the linked-list that contains it, either + * because we're changing the size of the span, or because we're allocating it. 
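The split loop above repeatedly applies FreePageBtreeSplitPage(): half of a full page's keys move to a freshly recycled sibling, and the pending key (or downlink) is then inserted into whichever half owns its range. The core move, over plain arrays standing in for btree pages, as an illustrative sketch:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define CAPACITY 8

typedef struct
{
    size_t nused;
    size_t keys[CAPACITY];
} Page;

/* Move the upper half of 'page' into 'sibling', like FreePageBtreeSplitPage(). */
static void
split_page(Page *page, Page *sibling)
{
    sibling->nused = page->nused / 2;
    page->nused -= sibling->nused;
    memcpy(sibling->keys, &page->keys[page->nused],
           sizeof(size_t) * sibling->nused);
}

int
main(void)
{
    Page left = {.nused = 8, .keys = {1, 2, 3, 4, 5, 6, 7, 8}};
    Page right = {0};

    split_page(&left, &right);

    /* A pending key of 6 would now be inserted into 'right', because it is
     * not smaller than right's first key (5). */
    printf("left keeps %zu keys, right starts at %zu\n",
           left.nused, right.keys[0]);
    return 0;
}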
+ */ +static void +FreePagePopSpanLeader(FreePageManager *fpm, Size pageno) +{ + char *base = fpm_segment_base(fpm); + FreePageSpanLeader *span; + FreePageSpanLeader *next; + FreePageSpanLeader *prev; + + span = (FreePageSpanLeader *) fpm_page_to_pointer(base, pageno); + + next = relptr_access(base, span->next); + prev = relptr_access(base, span->prev); + if (next != NULL) + relptr_copy(next->prev, span->prev); + if (prev != NULL) + relptr_copy(prev->next, span->next); + else + { + Size f = Min(span->npages, FPM_NUM_FREELISTS) - 1; + + Assert(relptr_offset(fpm->freelist[f]) == pageno * FPM_PAGE_SIZE); + relptr_copy(fpm->freelist[f], span->next); + } +} + +/* + * Initialize a new FreePageSpanLeader and put it on the appropriate free list. + */ +static void +FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, Size npages) +{ + char *base = fpm_segment_base(fpm); + Size f = Min(npages, FPM_NUM_FREELISTS) - 1; + FreePageSpanLeader *head = relptr_access(base, fpm->freelist[f]); + FreePageSpanLeader *span; + + span = (FreePageSpanLeader *) fpm_page_to_pointer(base, first_page); + span->magic = FREE_PAGE_SPAN_LEADER_MAGIC; + span->npages = npages; + relptr_store(base, span->next, head); + relptr_store(base, span->prev, (FreePageSpanLeader *) NULL); + if (head != NULL) + relptr_store(base, head->prev, span); + relptr_store(base, fpm->freelist[f], span); +} diff --git a/src/backend/utils/mmgr/generation.c b/src/backend/utils/mmgr/generation.c new file mode 100644 index 0000000..584cd61 --- /dev/null +++ b/src/backend/utils/mmgr/generation.c @@ -0,0 +1,832 @@ +/*------------------------------------------------------------------------- + * + * generation.c + * Generational allocator definitions. + * + * Generation is a custom MemoryContext implementation designed for cases of + * chunks with similar lifespan. + * + * Portions Copyright (c) 2017-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/mmgr/generation.c + * + * + * This memory context is based on the assumption that the chunks are freed + * roughly in the same order as they were allocated (FIFO), or in groups with + * similar lifespan (generations - hence the name of the context). This is + * typical for various queue-like use cases, i.e. when tuples are constructed, + * processed and then thrown away. + * + * The memory context uses a very simple approach to free space management. + * Instead of a complex global freelist, each block tracks a number + * of allocated and freed chunks. Freed chunks are not reused, and once all + * chunks in a block are freed, the whole block is thrown away. When the + * chunks allocated in the same block have similar lifespan, this works + * very well and is very cheap. + * + * The current implementation only uses a fixed block size - maybe it should + * adapt a min/max block size range, and grow the blocks automatically. + * It already uses dedicated blocks for oversized chunks. + * + * XXX It might be possible to improve this by keeping a small freelist for + * only a small number of recent blocks, but it's not clear it's worth the + * additional complexity. 
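The comment above describes the whole trick behind the Generation context: allocation bumps a pointer within the current block, pfree() only increments a per-block counter, and the block goes back to malloc() once every chunk in it has been freed. A deliberately tiny standalone sketch of that accounting (toy types, no overflow or NULL checks, not the real GenerationContext):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct ToyBlock
{
    size_t nchunks;         /* chunks handed out from this block */
    size_t nfree;           /* chunks already "freed" by the caller */
    size_t used;            /* bump-allocation offset into data[] */
    char   data[1024];
} ToyBlock;

typedef struct ToyChunk
{
    ToyBlock *block;        /* back-pointer, like GenerationChunk->block */
} ToyChunk;

static void *
toy_alloc(ToyBlock *block, size_t size)
{
    ToyChunk *chunk = (ToyChunk *) (block->data + block->used);

    block->used += sizeof(ToyChunk) + size;     /* a sketch: no bounds check */
    block->nchunks++;
    chunk->block = block;
    return chunk + 1;                           /* payload follows the header */
}

static void
toy_free(void *ptr)
{
    ToyChunk *chunk = (ToyChunk *) ptr - 1;
    ToyBlock *block = chunk->block;

    block->nfree++;
    if (block->nfree == block->nchunks)
    {
        printf("block %p is now empty; releasing it\n", (void *) block);
        free(block);
    }
}

int
main(void)
{
    ToyBlock *block = calloc(1, sizeof(ToyBlock));
    void     *a = toy_alloc(block, 16);
    void     *b = toy_alloc(block, 32);

    toy_free(a);            /* block survives: one chunk still live */
    toy_free(b);            /* last chunk freed: block is released */
    return 0;
}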
+ * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "lib/ilist.h" +#include "utils/memdebug.h" +#include "utils/memutils.h" + + +#define Generation_BLOCKHDRSZ MAXALIGN(sizeof(GenerationBlock)) +#define Generation_CHUNKHDRSZ sizeof(GenerationChunk) + +typedef struct GenerationBlock GenerationBlock; /* forward reference */ +typedef struct GenerationChunk GenerationChunk; + +typedef void *GenerationPointer; + +/* + * GenerationContext is a simple memory context not reusing allocated chunks, + * and freeing blocks once all chunks are freed. + */ +typedef struct GenerationContext +{ + MemoryContextData header; /* Standard memory-context fields */ + + /* Generational context parameters */ + Size blockSize; /* standard block size */ + + GenerationBlock *block; /* current (most recently allocated) block */ + dlist_head blocks; /* list of blocks */ +} GenerationContext; + +/* + * GenerationBlock + * GenerationBlock is the unit of memory that is obtained by generation.c + * from malloc(). It contains one or more GenerationChunks, which are + * the units requested by palloc() and freed by pfree(). GenerationChunks + * cannot be returned to malloc() individually, instead pfree() + * updates the free counter of the block and when all chunks in a block + * are free the whole block is returned to malloc(). + * + * GenerationBlock is the header data for a block --- the usable space + * within the block begins at the next alignment boundary. + */ +struct GenerationBlock +{ + dlist_node node; /* doubly-linked list of blocks */ + Size blksize; /* allocated size of this block */ + int nchunks; /* number of chunks in the block */ + int nfree; /* number of free chunks */ + char *freeptr; /* start of free space in this block */ + char *endptr; /* end of space in this block */ +}; + +/* + * GenerationChunk + * The prefix of each piece of memory in a GenerationBlock + * + * Note: to meet the memory context APIs, the payload area of the chunk must + * be maxaligned, and the "context" link must be immediately adjacent to the + * payload area (cf. GetMemoryChunkContext). We simplify matters for this + * module by requiring sizeof(GenerationChunk) to be maxaligned, and then + * we can ensure things work by adding any required alignment padding before + * the pointer fields. There is a static assertion below that the alignment + * is done correctly. + */ +struct GenerationChunk +{ + /* size is always the size of the usable space in the chunk */ + Size size; +#ifdef MEMORY_CONTEXT_CHECKING + /* when debugging memory usage, also store actual requested size */ + /* this is zero in a free chunk */ + Size requested_size; + +#define GENERATIONCHUNK_RAWSIZE (SIZEOF_SIZE_T * 2 + SIZEOF_VOID_P * 2) +#else +#define GENERATIONCHUNK_RAWSIZE (SIZEOF_SIZE_T + SIZEOF_VOID_P * 2) +#endif /* MEMORY_CONTEXT_CHECKING */ + + /* ensure proper alignment by adding padding if needed */ +#if (GENERATIONCHUNK_RAWSIZE % MAXIMUM_ALIGNOF) != 0 + char padding[MAXIMUM_ALIGNOF - GENERATIONCHUNK_RAWSIZE % MAXIMUM_ALIGNOF]; +#endif + + GenerationBlock *block; /* block owning this chunk */ + GenerationContext *context; /* owning context, or NULL if freed chunk */ + /* there must not be any padding to reach a MAXALIGN boundary here! */ +}; + +/* + * Only the "context" field should be accessed outside this module. + * We keep the rest of an allocated chunk's header marked NOACCESS when using + * valgrind. But note that freed chunk headers are kept accessible, for + * simplicity. 
+ */ +#define GENERATIONCHUNK_PRIVATE_LEN offsetof(GenerationChunk, context) + +/* + * GenerationIsValid + * True iff set is valid allocation set. + */ +#define GenerationIsValid(set) PointerIsValid(set) + +#define GenerationPointerGetChunk(ptr) \ + ((GenerationChunk *)(((char *)(ptr)) - Generation_CHUNKHDRSZ)) +#define GenerationChunkGetPointer(chk) \ + ((GenerationPointer *)(((char *)(chk)) + Generation_CHUNKHDRSZ)) + +/* + * These functions implement the MemoryContext API for Generation contexts. + */ +static void *GenerationAlloc(MemoryContext context, Size size); +static void GenerationFree(MemoryContext context, void *pointer); +static void *GenerationRealloc(MemoryContext context, void *pointer, Size size); +static void GenerationReset(MemoryContext context); +static void GenerationDelete(MemoryContext context); +static Size GenerationGetChunkSpace(MemoryContext context, void *pointer); +static bool GenerationIsEmpty(MemoryContext context); +static void GenerationStats(MemoryContext context, + MemoryStatsPrintFunc printfunc, void *passthru, + MemoryContextCounters *totals, + bool print_to_stderr); + +#ifdef MEMORY_CONTEXT_CHECKING +static void GenerationCheck(MemoryContext context); +#endif + +/* + * This is the virtual function table for Generation contexts. + */ +static const MemoryContextMethods GenerationMethods = { + GenerationAlloc, + GenerationFree, + GenerationRealloc, + GenerationReset, + GenerationDelete, + GenerationGetChunkSpace, + GenerationIsEmpty, + GenerationStats +#ifdef MEMORY_CONTEXT_CHECKING + ,GenerationCheck +#endif +}; + + +/* + * Public routines + */ + + +/* + * GenerationContextCreate + * Create a new Generation context. + * + * parent: parent context, or NULL if top-level context + * name: name of context (must be statically allocated) + * blockSize: generation block size + */ +MemoryContext +GenerationContextCreate(MemoryContext parent, + const char *name, + Size blockSize) +{ + GenerationContext *set; + + /* Assert we padded GenerationChunk properly */ + StaticAssertStmt(Generation_CHUNKHDRSZ == MAXALIGN(Generation_CHUNKHDRSZ), + "sizeof(GenerationChunk) is not maxaligned"); + StaticAssertStmt(offsetof(GenerationChunk, context) + sizeof(MemoryContext) == + Generation_CHUNKHDRSZ, + "padding calculation in GenerationChunk is wrong"); + + /* + * First, validate allocation parameters. (If we're going to throw an + * error, we should do so before the context is created, not after.) We + * somewhat arbitrarily enforce a minimum 1K block size, mostly because + * that's what AllocSet does. + */ + if (blockSize != MAXALIGN(blockSize) || + blockSize < 1024 || + !AllocHugeSizeIsValid(blockSize)) + elog(ERROR, "invalid blockSize for memory context: %zu", + blockSize); + + /* + * Allocate the context header. Unlike aset.c, we never try to combine + * this with the first regular block, since that would prevent us from + * freeing the first generation of allocations. + */ + + set = (GenerationContext *) malloc(MAXALIGN(sizeof(GenerationContext))); + if (set == NULL) + { + MemoryContextStats(TopMemoryContext); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while creating memory context \"%s\".", + name))); + } + + /* + * Avoid writing code that can fail between here and MemoryContextCreate; + * we'd leak the header if we ereport in this stretch. 
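GenerationContextCreate() leans on compile-time checks to guarantee that the chunk header is a multiple of the platform's maximum alignment, so the payload that follows it lands maxaligned. Below is a standalone illustration of the same padding-plus-static-assert technique; the 16-byte alignment figure and the field layout are assumptions for the example (the real GenerationChunk places its padding before the pointer fields so that the context link stays adjacent to the payload), and _Static_assert plays the role of the backend's StaticAssertStmt().

#include <stddef.h>
#include <stdio.h>

#define TOY_MAXALIGN 16         /* assumed; the real code uses MAXIMUM_ALIGNOF */

typedef struct ToyChunkHeader
{
    size_t  size;               /* usable size of the chunk */
    void   *block;              /* back-pointer to the owning block */
    void   *context;            /* owning context */
    /* pad the header out to the next TOY_MAXALIGN boundary */
    char    padding[TOY_MAXALIGN -
                    (sizeof(size_t) + 2 * sizeof(void *)) % TOY_MAXALIGN];
} ToyChunkHeader;

/* Fail the build if the header is not a multiple of the assumed alignment. */
_Static_assert(sizeof(ToyChunkHeader) % TOY_MAXALIGN == 0,
               "chunk header is not a multiple of TOY_MAXALIGN");

int
main(void)
{
    printf("header occupies %zu bytes; payload starts aligned\n",
           sizeof(ToyChunkHeader));
    return 0;
}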
+ */ + + /* Fill in GenerationContext-specific header fields */ + set->blockSize = blockSize; + set->block = NULL; + dlist_init(&set->blocks); + + /* Finally, do the type-independent part of context creation */ + MemoryContextCreate((MemoryContext) set, + T_GenerationContext, + &GenerationMethods, + parent, + name); + + return (MemoryContext) set; +} + +/* + * GenerationReset + * Frees all memory which is allocated in the given set. + * + * The code simply frees all the blocks in the context - we don't keep any + * keeper blocks or anything like that. + */ +static void +GenerationReset(MemoryContext context) +{ + GenerationContext *set = (GenerationContext *) context; + dlist_mutable_iter miter; + + AssertArg(GenerationIsValid(set)); + +#ifdef MEMORY_CONTEXT_CHECKING + /* Check for corruption and leaks before freeing */ + GenerationCheck(context); +#endif + + dlist_foreach_modify(miter, &set->blocks) + { + GenerationBlock *block = dlist_container(GenerationBlock, node, miter.cur); + + dlist_delete(miter.cur); + + context->mem_allocated -= block->blksize; + +#ifdef CLOBBER_FREED_MEMORY + wipe_mem(block, block->blksize); +#endif + + free(block); + } + + set->block = NULL; + + Assert(dlist_is_empty(&set->blocks)); +} + +/* + * GenerationDelete + * Free all memory which is allocated in the given context. + */ +static void +GenerationDelete(MemoryContext context) +{ + /* Reset to release all the GenerationBlocks */ + GenerationReset(context); + /* And free the context header */ + free(context); +} + +/* + * GenerationAlloc + * Returns pointer to allocated memory of given size or NULL if + * request could not be completed; memory is added to the set. + * + * No request may exceed: + * MAXALIGN_DOWN(SIZE_MAX) - Generation_BLOCKHDRSZ - Generation_CHUNKHDRSZ + * All callers use a much-lower limit. + * + * Note: when using valgrind, it doesn't matter how the returned allocation + * is marked, as mcxt.c will set it to UNDEFINED. In some paths we will + * return space that is marked NOACCESS - GenerationRealloc has to beware! + */ +static void * +GenerationAlloc(MemoryContext context, Size size) +{ + GenerationContext *set = (GenerationContext *) context; + GenerationBlock *block; + GenerationChunk *chunk; + Size chunk_size = MAXALIGN(size); + + /* is it an over-sized chunk? if yes, allocate special block */ + if (chunk_size > set->blockSize / 8) + { + Size blksize = chunk_size + Generation_BLOCKHDRSZ + Generation_CHUNKHDRSZ; + + block = (GenerationBlock *) malloc(blksize); + if (block == NULL) + return NULL; + + context->mem_allocated += blksize; + + /* block with a single (used) chunk */ + block->blksize = blksize; + block->nchunks = 1; + block->nfree = 0; + + /* the block is completely full */ + block->freeptr = block->endptr = ((char *) block) + blksize; + + chunk = (GenerationChunk *) (((char *) block) + Generation_BLOCKHDRSZ); + chunk->block = block; + chunk->context = set; + chunk->size = chunk_size; + +#ifdef MEMORY_CONTEXT_CHECKING + chunk->requested_size = size; + /* set mark to catch clobber of "unused" space */ + if (size < chunk_size) + set_sentinel(GenerationChunkGetPointer(chunk), size); +#endif +#ifdef RANDOMIZE_ALLOCATED_MEMORY + /* fill the allocated space with junk */ + randomize_mem((char *) GenerationChunkGetPointer(chunk), size); +#endif + + /* add the block to the list of allocated blocks */ + dlist_push_head(&set->blocks, &block->node); + + /* Ensure any padding bytes are marked NOACCESS. 
*/ + VALGRIND_MAKE_MEM_NOACCESS((char *) GenerationChunkGetPointer(chunk) + size, + chunk_size - size); + + /* Disallow external access to private part of chunk header. */ + VALGRIND_MAKE_MEM_NOACCESS(chunk, GENERATIONCHUNK_PRIVATE_LEN); + + return GenerationChunkGetPointer(chunk); + } + + /* + * Not an over-sized chunk. Is there enough space in the current block? If + * not, allocate a new "regular" block. + */ + block = set->block; + + if ((block == NULL) || + (block->endptr - block->freeptr) < Generation_CHUNKHDRSZ + chunk_size) + { + Size blksize = set->blockSize; + + block = (GenerationBlock *) malloc(blksize); + + if (block == NULL) + return NULL; + + context->mem_allocated += blksize; + + block->blksize = blksize; + block->nchunks = 0; + block->nfree = 0; + + block->freeptr = ((char *) block) + Generation_BLOCKHDRSZ; + block->endptr = ((char *) block) + blksize; + + /* Mark unallocated space NOACCESS. */ + VALGRIND_MAKE_MEM_NOACCESS(block->freeptr, + blksize - Generation_BLOCKHDRSZ); + + /* add it to the doubly-linked list of blocks */ + dlist_push_head(&set->blocks, &block->node); + + /* and also use it as the current allocation block */ + set->block = block; + } + + /* we're supposed to have a block with enough free space now */ + Assert(block != NULL); + Assert((block->endptr - block->freeptr) >= Generation_CHUNKHDRSZ + chunk_size); + + chunk = (GenerationChunk *) block->freeptr; + + /* Prepare to initialize the chunk header. */ + VALGRIND_MAKE_MEM_UNDEFINED(chunk, Generation_CHUNKHDRSZ); + + block->nchunks += 1; + block->freeptr += (Generation_CHUNKHDRSZ + chunk_size); + + Assert(block->freeptr <= block->endptr); + + chunk->block = block; + chunk->context = set; + chunk->size = chunk_size; + +#ifdef MEMORY_CONTEXT_CHECKING + chunk->requested_size = size; + /* set mark to catch clobber of "unused" space */ + if (size < chunk->size) + set_sentinel(GenerationChunkGetPointer(chunk), size); +#endif +#ifdef RANDOMIZE_ALLOCATED_MEMORY + /* fill the allocated space with junk */ + randomize_mem((char *) GenerationChunkGetPointer(chunk), size); +#endif + + /* Ensure any padding bytes are marked NOACCESS. */ + VALGRIND_MAKE_MEM_NOACCESS((char *) GenerationChunkGetPointer(chunk) + size, + chunk_size - size); + + /* Disallow external access to private part of chunk header. */ + VALGRIND_MAKE_MEM_NOACCESS(chunk, GENERATIONCHUNK_PRIVATE_LEN); + + return GenerationChunkGetPointer(chunk); +} + +/* + * GenerationFree + * Update number of chunks in the block, and if all chunks in the block + * are now free then discard the block. + */ +static void +GenerationFree(MemoryContext context, void *pointer) +{ + GenerationContext *set = (GenerationContext *) context; + GenerationChunk *chunk = GenerationPointerGetChunk(pointer); + GenerationBlock *block; + + /* Allow access to private part of chunk header. 
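Under MEMORY_CONTEXT_CHECKING, GenerationAlloc() above plants a sentinel just past the requested size whenever the request was rounded up, and the free/realloc paths re-check it to catch small buffer overruns. A self-contained sketch of that guard-byte idea follows; the marker value and the rounding are assumptions for the example, not the backend's set_sentinel() internals.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SENTINEL_BYTE 0x7E      /* illustrative marker value */

static void
set_guard(void *base, size_t offset)
{
    ((unsigned char *) base)[offset] = SENTINEL_BYTE;
}

static bool
guard_ok(const void *base, size_t offset)
{
    return ((const unsigned char *) base)[offset] == SENTINEL_BYTE;
}

int
main(void)
{
    size_t requested = 13;
    size_t rounded = 16;                    /* pretend MAXALIGN(13) == 16 */
    char  *chunk = malloc(rounded);

    set_guard(chunk, requested);            /* guard byte in the slack space */
    memset(chunk, 'x', requested);          /* a well-behaved caller */
    printf("ok after good write: %d\n", guard_ok(chunk, requested));

    memset(chunk, 'x', requested + 1);      /* overruns by one byte */
    printf("ok after overrun:    %d\n", guard_ok(chunk, requested));
    free(chunk);
    return 0;
}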
*/ + VALGRIND_MAKE_MEM_DEFINED(chunk, GENERATIONCHUNK_PRIVATE_LEN); + + block = chunk->block; + +#ifdef MEMORY_CONTEXT_CHECKING + /* Test for someone scribbling on unused space in chunk */ + if (chunk->requested_size < chunk->size) + if (!sentinel_ok(pointer, chunk->requested_size)) + elog(WARNING, "detected write past chunk end in %s %p", + ((MemoryContext) set)->name, chunk); +#endif + +#ifdef CLOBBER_FREED_MEMORY + wipe_mem(pointer, chunk->size); +#endif + + /* Reset context to NULL in freed chunks */ + chunk->context = NULL; + +#ifdef MEMORY_CONTEXT_CHECKING + /* Reset requested_size to 0 in freed chunks */ + chunk->requested_size = 0; +#endif + + block->nfree += 1; + + Assert(block->nchunks > 0); + Assert(block->nfree <= block->nchunks); + + /* If there are still allocated chunks in the block, we're done. */ + if (block->nfree < block->nchunks) + return; + + /* + * The block is empty, so let's get rid of it. First remove it from the + * list of blocks, then return it to malloc(). + */ + dlist_delete(&block->node); + + /* Also make sure the block is not marked as the current block. */ + if (set->block == block) + set->block = NULL; + + context->mem_allocated -= block->blksize; + free(block); +} + +/* + * GenerationRealloc + * When handling repalloc, we simply allocate a new chunk, copy the data + * and discard the old one. The only exception is when the new size fits + * into the old chunk - in that case we just update chunk header. + */ +static void * +GenerationRealloc(MemoryContext context, void *pointer, Size size) +{ + GenerationContext *set = (GenerationContext *) context; + GenerationChunk *chunk = GenerationPointerGetChunk(pointer); + GenerationPointer newPointer; + Size oldsize; + + /* Allow access to private part of chunk header. */ + VALGRIND_MAKE_MEM_DEFINED(chunk, GENERATIONCHUNK_PRIVATE_LEN); + + oldsize = chunk->size; + +#ifdef MEMORY_CONTEXT_CHECKING + /* Test for someone scribbling on unused space in chunk */ + if (chunk->requested_size < oldsize) + if (!sentinel_ok(pointer, chunk->requested_size)) + elog(WARNING, "detected write past chunk end in %s %p", + ((MemoryContext) set)->name, chunk); +#endif + + /* + * Maybe the allocated area already is >= the new size. (In particular, + * we always fall out here if the requested size is a decrease.) + * + * This memory context does not use power-of-2 chunk sizing and instead + * carves the chunks to be as small as possible, so most repalloc() calls + * will end up in the palloc/memcpy/pfree branch. + * + * XXX Perhaps we should annotate this condition with unlikely()? + */ + if (oldsize >= size) + { +#ifdef MEMORY_CONTEXT_CHECKING + Size oldrequest = chunk->requested_size; + +#ifdef RANDOMIZE_ALLOCATED_MEMORY + /* We can only fill the extra space if we know the prior request */ + if (size > oldrequest) + randomize_mem((char *) pointer + oldrequest, + size - oldrequest); +#endif + + chunk->requested_size = size; + + /* + * If this is an increase, mark any newly-available part UNDEFINED. + * Otherwise, mark the obsolete part NOACCESS. 
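Stripped of the valgrind and debugging machinery, GenerationRealloc() has a simple shape: reuse the chunk in place if it is already big enough, otherwise allocate a new chunk, copy the old contents, and free the old chunk. The same shape over plain malloc(), with an illustrative chunk header standing in for the context internals:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct
{
    size_t size;            /* usable size, like GenerationChunk->size */
    char   data[];
} ToyChunk;

static void *
toy_alloc(size_t size)
{
    ToyChunk *chunk = malloc(sizeof(ToyChunk) + size);

    if (chunk == NULL)
        return NULL;
    chunk->size = size;
    return chunk->data;
}

static void *
toy_realloc(void *ptr, size_t size)
{
    ToyChunk *chunk = (ToyChunk *) ((char *) ptr - offsetof(ToyChunk, data));
    void     *newptr;

    if (chunk->size >= size)
        return ptr;                     /* fits in place: nothing to move */

    newptr = toy_alloc(size);           /* grow: allocate, copy, free */
    if (newptr == NULL)
        return NULL;
    memcpy(newptr, ptr, chunk->size);
    free(chunk);
    return newptr;
}

int
main(void)
{
    char *p = toy_alloc(8);

    strcpy(p, "hello");
    p = toy_realloc(p, 64);             /* contents survive the move */
    printf("%s\n", p);
    free(p - offsetof(ToyChunk, data));
    return 0;
}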
+ */ + if (size > oldrequest) + VALGRIND_MAKE_MEM_UNDEFINED((char *) pointer + oldrequest, + size - oldrequest); + else + VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size, + oldsize - size); + + /* set mark to catch clobber of "unused" space */ + if (size < oldsize) + set_sentinel(pointer, size); +#else /* !MEMORY_CONTEXT_CHECKING */ + + /* + * We don't have the information to determine whether we're growing + * the old request or shrinking it, so we conservatively mark the + * entire new allocation DEFINED. + */ + VALGRIND_MAKE_MEM_NOACCESS(pointer, oldsize); + VALGRIND_MAKE_MEM_DEFINED(pointer, size); +#endif + + /* Disallow external access to private part of chunk header. */ + VALGRIND_MAKE_MEM_NOACCESS(chunk, GENERATIONCHUNK_PRIVATE_LEN); + + return pointer; + } + + /* allocate new chunk */ + newPointer = GenerationAlloc((MemoryContext) set, size); + + /* leave immediately if request was not completed */ + if (newPointer == NULL) + { + /* Disallow external access to private part of chunk header. */ + VALGRIND_MAKE_MEM_NOACCESS(chunk, GENERATIONCHUNK_PRIVATE_LEN); + return NULL; + } + + /* + * GenerationAlloc() may have returned a region that is still NOACCESS. + * Change it to UNDEFINED for the moment; memcpy() will then transfer + * definedness from the old allocation to the new. If we know the old + * allocation, copy just that much. Otherwise, make the entire old chunk + * defined to avoid errors as we copy the currently-NOACCESS trailing + * bytes. + */ + VALGRIND_MAKE_MEM_UNDEFINED(newPointer, size); +#ifdef MEMORY_CONTEXT_CHECKING + oldsize = chunk->requested_size; +#else + VALGRIND_MAKE_MEM_DEFINED(pointer, oldsize); +#endif + + /* transfer existing data (certain to fit) */ + memcpy(newPointer, pointer, oldsize); + + /* free old chunk */ + GenerationFree((MemoryContext) set, pointer); + + return newPointer; +} + +/* + * GenerationGetChunkSpace + * Given a currently-allocated chunk, determine the total space + * it occupies (including all memory-allocation overhead). + */ +static Size +GenerationGetChunkSpace(MemoryContext context, void *pointer) +{ + GenerationChunk *chunk = GenerationPointerGetChunk(pointer); + Size result; + + VALGRIND_MAKE_MEM_DEFINED(chunk, GENERATIONCHUNK_PRIVATE_LEN); + result = chunk->size + Generation_CHUNKHDRSZ; + VALGRIND_MAKE_MEM_NOACCESS(chunk, GENERATIONCHUNK_PRIVATE_LEN); + return result; +} + +/* + * GenerationIsEmpty + * Is a GenerationContext empty of any allocated space? + */ +static bool +GenerationIsEmpty(MemoryContext context) +{ + GenerationContext *set = (GenerationContext *) context; + + return dlist_is_empty(&set->blocks); +} + +/* + * GenerationStats + * Compute stats about memory consumption of a Generation context. + * + * printfunc: if not NULL, pass a human-readable stats string to this. + * passthru: pass this pointer through to printfunc. + * totals: if not NULL, add stats about this context into *totals. + * print_to_stderr: print stats to stderr if true, elog otherwise. + * + * XXX freespace only accounts for empty space at the end of the block, not + * space of freed chunks (which is unknown). 
+ */ +static void +GenerationStats(MemoryContext context, + MemoryStatsPrintFunc printfunc, void *passthru, + MemoryContextCounters *totals, bool print_to_stderr) +{ + GenerationContext *set = (GenerationContext *) context; + Size nblocks = 0; + Size nchunks = 0; + Size nfreechunks = 0; + Size totalspace; + Size freespace = 0; + dlist_iter iter; + + /* Include context header in totalspace */ + totalspace = MAXALIGN(sizeof(GenerationContext)); + + dlist_foreach(iter, &set->blocks) + { + GenerationBlock *block = dlist_container(GenerationBlock, node, iter.cur); + + nblocks++; + nchunks += block->nchunks; + nfreechunks += block->nfree; + totalspace += block->blksize; + freespace += (block->endptr - block->freeptr); + } + + if (printfunc) + { + char stats_string[200]; + + snprintf(stats_string, sizeof(stats_string), + "%zu total in %zd blocks (%zd chunks); %zu free (%zd chunks); %zu used", + totalspace, nblocks, nchunks, freespace, + nfreechunks, totalspace - freespace); + printfunc(context, passthru, stats_string, print_to_stderr); + } + + if (totals) + { + totals->nblocks += nblocks; + totals->freechunks += nfreechunks; + totals->totalspace += totalspace; + totals->freespace += freespace; + } +} + + +#ifdef MEMORY_CONTEXT_CHECKING + +/* + * GenerationCheck + * Walk through chunks and check consistency of memory. + * + * NOTE: report errors as WARNING, *not* ERROR or FATAL. Otherwise you'll + * find yourself in an infinite loop when trouble occurs, because this + * routine will be entered again when elog cleanup tries to release memory! + */ +static void +GenerationCheck(MemoryContext context) +{ + GenerationContext *gen = (GenerationContext *) context; + const char *name = context->name; + dlist_iter iter; + Size total_allocated = 0; + + /* walk all blocks in this context */ + dlist_foreach(iter, &gen->blocks) + { + GenerationBlock *block = dlist_container(GenerationBlock, node, iter.cur); + int nfree, + nchunks; + char *ptr; + + total_allocated += block->blksize; + + /* + * nfree > nchunks is surely wrong, and we don't expect to see + * equality either, because such a block should have gotten freed. + */ + if (block->nfree >= block->nchunks) + elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p exceeds %d allocated", + name, block->nfree, block, block->nchunks); + + /* Now walk through the chunks and count them. */ + nfree = 0; + nchunks = 0; + ptr = ((char *) block) + Generation_BLOCKHDRSZ; + + while (ptr < block->freeptr) + { + GenerationChunk *chunk = (GenerationChunk *) ptr; + + /* Allow access to private part of chunk header. */ + VALGRIND_MAKE_MEM_DEFINED(chunk, GENERATIONCHUNK_PRIVATE_LEN); + + /* move to the next chunk */ + ptr += (chunk->size + Generation_CHUNKHDRSZ); + + nchunks += 1; + + /* chunks have both block and context pointers, so check both */ + if (chunk->block != block) + elog(WARNING, "problem in Generation %s: bogus block link in block %p, chunk %p", + name, block, chunk); + + /* + * Check for valid context pointer. Note this is an incomplete + * test, since palloc(0) produces an allocated chunk with + * requested_size == 0. 
+ */ + if ((chunk->requested_size > 0 && chunk->context != gen) || + (chunk->context != gen && chunk->context != NULL)) + elog(WARNING, "problem in Generation %s: bogus context link in block %p, chunk %p", + name, block, chunk); + + /* now make sure the chunk size is correct */ + if (chunk->size < chunk->requested_size || + chunk->size != MAXALIGN(chunk->size)) + elog(WARNING, "problem in Generation %s: bogus chunk size in block %p, chunk %p", + name, block, chunk); + + /* is chunk allocated? */ + if (chunk->context != NULL) + { + /* check sentinel, but only in allocated blocks */ + if (chunk->requested_size < chunk->size && + !sentinel_ok(chunk, Generation_CHUNKHDRSZ + chunk->requested_size)) + elog(WARNING, "problem in Generation %s: detected write past chunk end in block %p, chunk %p", + name, block, chunk); + } + else + nfree += 1; + + /* + * If chunk is allocated, disallow external access to private part + * of chunk header. + */ + if (chunk->context != NULL) + VALGRIND_MAKE_MEM_NOACCESS(chunk, GENERATIONCHUNK_PRIVATE_LEN); + } + + /* + * Make sure we got the expected number of allocated and free chunks + * (as tracked in the block header). + */ + if (nchunks != block->nchunks) + elog(WARNING, "problem in Generation %s: number of allocated chunks %d in block %p does not match header %d", + name, nchunks, block, block->nchunks); + + if (nfree != block->nfree) + elog(WARNING, "problem in Generation %s: number of free chunks %d in block %p does not match header %d", + name, nfree, block, block->nfree); + } + + Assert(total_allocated == context->mem_allocated); +} + +#endif /* MEMORY_CONTEXT_CHECKING */ diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c new file mode 100644 index 0000000..a5f31e2 --- /dev/null +++ b/src/backend/utils/mmgr/mcxt.c @@ -0,0 +1,1341 @@ +/*------------------------------------------------------------------------- + * + * mcxt.c + * POSTGRES memory context management code. + * + * This module handles context management operations that are independent + * of the particular kind of context being operated on. It calls + * context-type-specific operations via the function pointers in a + * context's MemoryContextMethods struct. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/utils/mmgr/mcxt.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/procsignal.h" +#include "utils/fmgrprotos.h" +#include "utils/memdebug.h" +#include "utils/memutils.h" + + +/***************************************************************************** + * GLOBAL MEMORY * + *****************************************************************************/ + +/* + * CurrentMemoryContext + * Default memory context for allocations. + */ +MemoryContext CurrentMemoryContext = NULL; + +/* + * Standard top-level contexts. 
For a description of the purpose of each + * of these contexts, refer to src/backend/utils/mmgr/README + */ +MemoryContext TopMemoryContext = NULL; +MemoryContext ErrorContext = NULL; +MemoryContext PostmasterContext = NULL; +MemoryContext CacheMemoryContext = NULL; +MemoryContext MessageContext = NULL; +MemoryContext TopTransactionContext = NULL; +MemoryContext CurTransactionContext = NULL; + +/* This is a transient link to the active portal's memory context: */ +MemoryContext PortalContext = NULL; + +static void MemoryContextCallResetCallbacks(MemoryContext context); +static void MemoryContextStatsInternal(MemoryContext context, int level, + bool print, int max_children, + MemoryContextCounters *totals, + bool print_to_stderr); +static void MemoryContextStatsPrint(MemoryContext context, void *passthru, + const char *stats_string, + bool print_to_stderr); + +/* + * You should not do memory allocations within a critical section, because + * an out-of-memory error will be escalated to a PANIC. To enforce that + * rule, the allocation functions Assert that. + */ +#define AssertNotInCriticalSection(context) \ + Assert(CritSectionCount == 0 || (context)->allowInCritSection) + + +/***************************************************************************** + * EXPORTED ROUTINES * + *****************************************************************************/ + + +/* + * MemoryContextInit + * Start up the memory-context subsystem. + * + * This must be called before creating contexts or allocating memory in + * contexts. TopMemoryContext and ErrorContext are initialized here; + * other contexts must be created afterwards. + * + * In normal multi-backend operation, this is called once during + * postmaster startup, and not at all by individual backend startup + * (since the backends inherit an already-initialized context subsystem + * by virtue of being forked off the postmaster). But in an EXEC_BACKEND + * build, each process must do this for itself. + * + * In a standalone backend this must be called during backend startup. + */ +void +MemoryContextInit(void) +{ + AssertState(TopMemoryContext == NULL); + + /* + * First, initialize TopMemoryContext, which is the parent of all others. + */ + TopMemoryContext = AllocSetContextCreate((MemoryContext) NULL, + "TopMemoryContext", + ALLOCSET_DEFAULT_SIZES); + + /* + * Not having any other place to point CurrentMemoryContext, make it point + * to TopMemoryContext. Caller should change this soon! + */ + CurrentMemoryContext = TopMemoryContext; + + /* + * Initialize ErrorContext as an AllocSetContext with slow growth rate --- + * we don't really expect much to be allocated in it. More to the point, + * require it to contain at least 8K at all times. This is the only case + * where retained memory in a context is *essential* --- we want to be + * sure ErrorContext still has some memory even if we've run out + * elsewhere! Also, allow allocations in ErrorContext within a critical + * section. Otherwise a PANIC will cause an assertion failure in the error + * reporting code, before printing out the real cause of the failure. + * + * This should be the last step in this function, as elog.c assumes memory + * management works once ErrorContext is non-null. 
+ */ + ErrorContext = AllocSetContextCreate(TopMemoryContext, + "ErrorContext", + 8 * 1024, + 8 * 1024, + 8 * 1024); + MemoryContextAllowInCriticalSection(ErrorContext, true); +} + +/* + * MemoryContextReset + * Release all space allocated within a context and delete all its + * descendant contexts (but not the named context itself). + */ +void +MemoryContextReset(MemoryContext context) +{ + AssertArg(MemoryContextIsValid(context)); + + /* save a function call in common case where there are no children */ + if (context->firstchild != NULL) + MemoryContextDeleteChildren(context); + + /* save a function call if no pallocs since startup or last reset */ + if (!context->isReset) + MemoryContextResetOnly(context); +} + +/* + * MemoryContextResetOnly + * Release all space allocated within a context. + * Nothing is done to the context's descendant contexts. + */ +void +MemoryContextResetOnly(MemoryContext context) +{ + AssertArg(MemoryContextIsValid(context)); + + /* Nothing to do if no pallocs since startup or last reset */ + if (!context->isReset) + { + MemoryContextCallResetCallbacks(context); + + /* + * If context->ident points into the context's memory, it will become + * a dangling pointer. We could prevent that by setting it to NULL + * here, but that would break valid coding patterns that keep the + * ident elsewhere, e.g. in a parent context. Another idea is to use + * MemoryContextContains(), but we don't require ident strings to be + * in separately-palloc'd chunks, so that risks false positives. So + * for now we assume the programmer got it right. + */ + + context->methods->reset(context); + context->isReset = true; + VALGRIND_DESTROY_MEMPOOL(context); + VALGRIND_CREATE_MEMPOOL(context, 0, false); + } +} + +/* + * MemoryContextResetChildren + * Release all space allocated within a context's descendants, + * but don't delete the contexts themselves. The named context + * itself is not touched. + */ +void +MemoryContextResetChildren(MemoryContext context) +{ + MemoryContext child; + + AssertArg(MemoryContextIsValid(context)); + + for (child = context->firstchild; child != NULL; child = child->nextchild) + { + MemoryContextResetChildren(child); + MemoryContextResetOnly(child); + } +} + +/* + * MemoryContextDelete + * Delete a context and its descendants, and release all space + * allocated therein. + * + * The type-specific delete routine removes all storage for the context, + * but we have to recurse to handle the children. + * We must also delink the context from its parent, if it has one. + */ +void +MemoryContextDelete(MemoryContext context) +{ + AssertArg(MemoryContextIsValid(context)); + /* We had better not be deleting TopMemoryContext ... */ + Assert(context != TopMemoryContext); + /* And not CurrentMemoryContext, either */ + Assert(context != CurrentMemoryContext); + + /* save a function call in common case where there are no children */ + if (context->firstchild != NULL) + MemoryContextDeleteChildren(context); + + /* + * It's not entirely clear whether 'tis better to do this before or after + * delinking the context; but an error in a callback will likely result in + * leaking the whole context (if it's not a root context) if we do it + * after, so let's do it before. + */ + MemoryContextCallResetCallbacks(context); + + /* + * We delink the context from its parent before deleting it, so that if + * there's an error we won't have deleted/busted contexts still attached + * to the context tree. Better a leak than a crash. 
+ */ + MemoryContextSetParent(context, NULL); + + /* + * Also reset the context's ident pointer, in case it points into the + * context. This would only matter if someone tries to get stats on the + * (already unlinked) context, which is unlikely, but let's be safe. + */ + context->ident = NULL; + + context->methods->delete_context(context); + + VALGRIND_DESTROY_MEMPOOL(context); +} + +/* + * MemoryContextDeleteChildren + * Delete all the descendants of the named context and release all + * space allocated therein. The named context itself is not touched. + */ +void +MemoryContextDeleteChildren(MemoryContext context) +{ + AssertArg(MemoryContextIsValid(context)); + + /* + * MemoryContextDelete will delink the child from me, so just iterate as + * long as there is a child. + */ + while (context->firstchild != NULL) + MemoryContextDelete(context->firstchild); +} + +/* + * MemoryContextRegisterResetCallback + * Register a function to be called before next context reset/delete. + * Such callbacks will be called in reverse order of registration. + * + * The caller is responsible for allocating a MemoryContextCallback struct + * to hold the info about this callback request, and for filling in the + * "func" and "arg" fields in the struct to show what function to call with + * what argument. Typically the callback struct should be allocated within + * the specified context, since that means it will automatically be freed + * when no longer needed. + * + * There is no API for deregistering a callback once registered. If you + * want it to not do anything anymore, adjust the state pointed to by its + * "arg" to indicate that. + */ +void +MemoryContextRegisterResetCallback(MemoryContext context, + MemoryContextCallback *cb) +{ + AssertArg(MemoryContextIsValid(context)); + + /* Push onto head so this will be called before older registrants. */ + cb->next = context->reset_cbs; + context->reset_cbs = cb; + /* Mark the context as non-reset (it probably is already). */ + context->isReset = false; +} + +/* + * MemoryContextCallResetCallbacks + * Internal function to call all registered callbacks for context. + */ +static void +MemoryContextCallResetCallbacks(MemoryContext context) +{ + MemoryContextCallback *cb; + + /* + * We pop each callback from the list before calling. That way, if an + * error occurs inside the callback, we won't try to call it a second time + * in the likely event that we reset or delete the context later. + */ + while ((cb = context->reset_cbs) != NULL) + { + context->reset_cbs = cb->next; + cb->func(cb->arg); + } +} + +/* + * MemoryContextSetIdentifier + * Set the identifier string for a memory context. + * + * An identifier can be provided to help distinguish among different contexts + * of the same kind in memory context stats dumps. The identifier string + * must live at least as long as the context it is for; typically it is + * allocated inside that context, so that it automatically goes away on + * context deletion. Pass id = NULL to forget any old identifier. + */ +void +MemoryContextSetIdentifier(MemoryContext context, const char *id) +{ + AssertArg(MemoryContextIsValid(context)); + context->ident = id; +} + +/* + * MemoryContextSetParent + * Change a context to belong to a new parent (or no parent). + * + * We provide this as an API function because it is sometimes useful to + * change a context's lifespan after creation. 
For example, a context + * might be created underneath a transient context, filled with data, + * and then reparented underneath CacheMemoryContext to make it long-lived. + * In this way no special effort is needed to get rid of the context in case + * a failure occurs before its contents are completely set up. + * + * Callers often assume that this function cannot fail, so don't put any + * elog(ERROR) calls in it. + * + * A possible caller error is to reparent a context under itself, creating + * a loop in the context graph. We assert here that context != new_parent, + * but checking for multi-level loops seems more trouble than it's worth. + */ +void +MemoryContextSetParent(MemoryContext context, MemoryContext new_parent) +{ + AssertArg(MemoryContextIsValid(context)); + AssertArg(context != new_parent); + + /* Fast path if it's got correct parent already */ + if (new_parent == context->parent) + return; + + /* Delink from existing parent, if any */ + if (context->parent) + { + MemoryContext parent = context->parent; + + if (context->prevchild != NULL) + context->prevchild->nextchild = context->nextchild; + else + { + Assert(parent->firstchild == context); + parent->firstchild = context->nextchild; + } + + if (context->nextchild != NULL) + context->nextchild->prevchild = context->prevchild; + } + + /* And relink */ + if (new_parent) + { + AssertArg(MemoryContextIsValid(new_parent)); + context->parent = new_parent; + context->prevchild = NULL; + context->nextchild = new_parent->firstchild; + if (new_parent->firstchild != NULL) + new_parent->firstchild->prevchild = context; + new_parent->firstchild = context; + } + else + { + context->parent = NULL; + context->prevchild = NULL; + context->nextchild = NULL; + } +} + +/* + * MemoryContextAllowInCriticalSection + * Allow/disallow allocations in this memory context within a critical + * section. + * + * Normally, memory allocations are not allowed within a critical section, + * because a failure would lead to PANIC. There are a few exceptions to + * that, like allocations related to debugging code that is not supposed to + * be enabled in production. This function can be used to exempt specific + * memory contexts from the assertion in palloc(). + */ +void +MemoryContextAllowInCriticalSection(MemoryContext context, bool allow) +{ + AssertArg(MemoryContextIsValid(context)); + + context->allowInCritSection = allow; +} + +/* + * GetMemoryChunkSpace + * Given a currently-allocated chunk, determine the total space + * it occupies (including all memory-allocation overhead). + * + * This is useful for measuring the total space occupied by a set of + * allocated chunks. + */ +Size +GetMemoryChunkSpace(void *pointer) +{ + MemoryContext context = GetMemoryChunkContext(pointer); + + return context->methods->get_chunk_space(context, pointer); +} + +/* + * MemoryContextGetParent + * Get the parent context (if any) of the specified context + */ +MemoryContext +MemoryContextGetParent(MemoryContext context) +{ + AssertArg(MemoryContextIsValid(context)); + + return context->parent; +} + +/* + * MemoryContextIsEmpty + * Is a memory context empty of any allocated space? + */ +bool +MemoryContextIsEmpty(MemoryContext context) +{ + AssertArg(MemoryContextIsValid(context)); + + /* + * For now, we consider a memory context nonempty if it has any children; + * perhaps this should be changed later. 
+ */ + if (context->firstchild != NULL) + return false; + /* Otherwise use the type-specific inquiry */ + return context->methods->is_empty(context); +} + +/* + * Find the memory allocated to blocks for this memory context. If recurse is + * true, also include children. + */ +Size +MemoryContextMemAllocated(MemoryContext context, bool recurse) +{ + Size total = context->mem_allocated; + + AssertArg(MemoryContextIsValid(context)); + + if (recurse) + { + MemoryContext child; + + for (child = context->firstchild; + child != NULL; + child = child->nextchild) + total += MemoryContextMemAllocated(child, true); + } + + return total; +} + +/* + * MemoryContextStats + * Print statistics about the named context and all its descendants. + * + * This is just a debugging utility, so it's not very fancy. However, we do + * make some effort to summarize when the output would otherwise be very long. + * The statistics are sent to stderr. + */ +void +MemoryContextStats(MemoryContext context) +{ + /* A hard-wired limit on the number of children is usually good enough */ + MemoryContextStatsDetail(context, 100, true); +} + +/* + * MemoryContextStatsDetail + * + * Entry point for use if you want to vary the number of child contexts shown. + * + * If print_to_stderr is true, print statistics about the memory contexts + * with fprintf(stderr), otherwise use ereport(). + */ +void +MemoryContextStatsDetail(MemoryContext context, int max_children, + bool print_to_stderr) +{ + MemoryContextCounters grand_totals; + + memset(&grand_totals, 0, sizeof(grand_totals)); + + MemoryContextStatsInternal(context, 0, true, max_children, &grand_totals, print_to_stderr); + + if (print_to_stderr) + fprintf(stderr, + "Grand total: %zu bytes in %zd blocks; %zu free (%zd chunks); %zu used\n", + grand_totals.totalspace, grand_totals.nblocks, + grand_totals.freespace, grand_totals.freechunks, + grand_totals.totalspace - grand_totals.freespace); + else + + /* + * Use LOG_SERVER_ONLY to prevent the memory contexts from being sent + * to the connected client. + * + * We don't buffer the information about all memory contexts in a + * backend into StringInfo and log it as one message. Otherwise which + * may require the buffer to be enlarged very much and lead to OOM + * error since there can be a large number of memory contexts in a + * backend. Instead, we log one message per memory context. + */ + ereport(LOG_SERVER_ONLY, + (errhidestmt(true), + errhidecontext(true), + errmsg_internal("Grand total: %zu bytes in %zd blocks; %zu free (%zd chunks); %zu used", + grand_totals.totalspace, grand_totals.nblocks, + grand_totals.freespace, grand_totals.freechunks, + grand_totals.totalspace - grand_totals.freespace))); +} + +/* + * MemoryContextStatsInternal + * One recursion level for MemoryContextStats + * + * Print this context if print is true, but in any case accumulate counts into + * *totals (if given). + */ +static void +MemoryContextStatsInternal(MemoryContext context, int level, + bool print, int max_children, + MemoryContextCounters *totals, + bool print_to_stderr) +{ + MemoryContextCounters local_totals; + MemoryContext child; + int ichild; + + AssertArg(MemoryContextIsValid(context)); + + /* Examine the context itself */ + context->methods->stats(context, + print ? MemoryContextStatsPrint : NULL, + (void *) &level, + totals, print_to_stderr); + + /* + * Examine children. If there are more than max_children of them, we do + * not print the rest explicitly, but just summarize them. 
+ */ + memset(&local_totals, 0, sizeof(local_totals)); + + for (child = context->firstchild, ichild = 0; + child != NULL; + child = child->nextchild, ichild++) + { + if (ichild < max_children) + MemoryContextStatsInternal(child, level + 1, + print, max_children, + totals, + print_to_stderr); + else + MemoryContextStatsInternal(child, level + 1, + false, max_children, + &local_totals, + print_to_stderr); + } + + /* Deal with excess children */ + if (ichild > max_children) + { + if (print) + { + if (print_to_stderr) + { + int i; + + for (i = 0; i <= level; i++) + fprintf(stderr, " "); + fprintf(stderr, + "%d more child contexts containing %zu total in %zd blocks; %zu free (%zd chunks); %zu used\n", + ichild - max_children, + local_totals.totalspace, + local_totals.nblocks, + local_totals.freespace, + local_totals.freechunks, + local_totals.totalspace - local_totals.freespace); + } + else + ereport(LOG_SERVER_ONLY, + (errhidestmt(true), + errhidecontext(true), + errmsg_internal("level: %d; %d more child contexts containing %zu total in %zd blocks; %zu free (%zd chunks); %zu used", + level, + ichild - max_children, + local_totals.totalspace, + local_totals.nblocks, + local_totals.freespace, + local_totals.freechunks, + local_totals.totalspace - local_totals.freespace))); + } + + if (totals) + { + totals->nblocks += local_totals.nblocks; + totals->freechunks += local_totals.freechunks; + totals->totalspace += local_totals.totalspace; + totals->freespace += local_totals.freespace; + } + } +} + +/* + * MemoryContextStatsPrint + * Print callback used by MemoryContextStatsInternal + * + * For now, the passthru pointer just points to "int level"; later we might + * make that more complicated. + */ +static void +MemoryContextStatsPrint(MemoryContext context, void *passthru, + const char *stats_string, + bool print_to_stderr) +{ + int level = *(int *) passthru; + const char *name = context->name; + const char *ident = context->ident; + char truncated_ident[110]; + int i; + + /* + * It seems preferable to label dynahash contexts with just the hash table + * name. Those are already unique enough, so the "dynahash" part isn't + * very helpful, and this way is more consistent with pre-v11 practice. + */ + if (ident && strcmp(name, "dynahash") == 0) + { + name = ident; + ident = NULL; + } + + truncated_ident[0] = '\0'; + + if (ident) + { + /* + * Some contexts may have very long identifiers (e.g., SQL queries). + * Arbitrarily truncate at 100 bytes, but be careful not to break + * multibyte characters. Also, replace ASCII control characters, such + * as newlines, with spaces. + */ + int idlen = strlen(ident); + bool truncated = false; + + strcpy(truncated_ident, ": "); + i = strlen(truncated_ident); + + if (idlen > 100) + { + idlen = pg_mbcliplen(ident, idlen, 100); + truncated = true; + } + + while (idlen-- > 0) + { + unsigned char c = *ident++; + + if (c < ' ') + c = ' '; + truncated_ident[i++] = c; + } + truncated_ident[i] = '\0'; + + if (truncated) + strcat(truncated_ident, "..."); + } + + if (print_to_stderr) + { + for (i = 0; i < level; i++) + fprintf(stderr, " "); + fprintf(stderr, "%s: %s%s\n", name, stats_string, truncated_ident); + } + else + ereport(LOG_SERVER_ONLY, + (errhidestmt(true), + errhidecontext(true), + errmsg_internal("level: %d; %s: %s%s", + level, name, stats_string, truncated_ident))); +} + +/* + * MemoryContextCheck + * Check all chunks in the named context. + * + * This is just a debugging utility, so it's not fancy. 
+ */ +#ifdef MEMORY_CONTEXT_CHECKING +void +MemoryContextCheck(MemoryContext context) +{ + MemoryContext child; + + AssertArg(MemoryContextIsValid(context)); + + context->methods->check(context); + for (child = context->firstchild; child != NULL; child = child->nextchild) + MemoryContextCheck(child); +} +#endif + +/* + * MemoryContextContains + * Detect whether an allocated chunk of memory belongs to a given + * context or not. + * + * Caution: this test is reliable as long as 'pointer' does point to + * a chunk of memory allocated from *some* context. If 'pointer' points + * at memory obtained in some other way, there is a small chance of a + * false-positive result, since the bits right before it might look like + * a valid chunk header by chance. + */ +bool +MemoryContextContains(MemoryContext context, void *pointer) +{ + MemoryContext ptr_context; + + /* + * NB: Can't use GetMemoryChunkContext() here - that performs assertions + * that aren't acceptable here since we might be passed memory not + * allocated by any memory context. + * + * Try to detect bogus pointers handed to us, poorly though we can. + * Presumably, a pointer that isn't MAXALIGNED isn't pointing at an + * allocated chunk. + */ + if (pointer == NULL || pointer != (void *) MAXALIGN(pointer)) + return false; + + /* + * OK, it's probably safe to look at the context. + */ + ptr_context = *(MemoryContext *) (((char *) pointer) - sizeof(void *)); + + return ptr_context == context; +} + +/* + * MemoryContextCreate + * Context-type-independent part of context creation. + * + * This is only intended to be called by context-type-specific + * context creation routines, not by the unwashed masses. + * + * The memory context creation procedure goes like this: + * 1. Context-type-specific routine makes some initial space allocation, + * including enough space for the context header. If it fails, + * it can ereport() with no damage done. + * 2. Context-type-specific routine sets up all type-specific fields of + * the header (those beyond MemoryContextData proper), as well as any + * other management fields it needs to have a fully valid context. + * Usually, failure in this step is impossible, but if it's possible + * the initial space allocation should be freed before ereport'ing. + * 3. Context-type-specific routine calls MemoryContextCreate() to fill in + * the generic header fields and link the context into the context tree. + * 4. We return to the context-type-specific routine, which finishes + * up type-specific initialization. This routine can now do things + * that might fail (like allocate more memory), so long as it's + * sure the node is left in a state that delete will handle. + * + * node: the as-yet-uninitialized common part of the context header node. + * tag: NodeTag code identifying the memory context type. + * methods: context-type-specific methods (usually statically allocated). + * parent: parent context, or NULL if this will be a top-level context. + * name: name of context (must be statically allocated). + * + * Context routines generally assume that MemoryContextCreate can't fail, + * so this can contain Assert but not elog/ereport. 
+ */ +void +MemoryContextCreate(MemoryContext node, + NodeTag tag, + const MemoryContextMethods *methods, + MemoryContext parent, + const char *name) +{ + /* Creating new memory contexts is not allowed in a critical section */ + Assert(CritSectionCount == 0); + + /* Initialize all standard fields of memory context header */ + node->type = tag; + node->isReset = true; + node->methods = methods; + node->parent = parent; + node->firstchild = NULL; + node->mem_allocated = 0; + node->prevchild = NULL; + node->name = name; + node->ident = NULL; + node->reset_cbs = NULL; + + /* OK to link node into context tree */ + if (parent) + { + node->nextchild = parent->firstchild; + if (parent->firstchild != NULL) + parent->firstchild->prevchild = node; + parent->firstchild = node; + /* inherit allowInCritSection flag from parent */ + node->allowInCritSection = parent->allowInCritSection; + } + else + { + node->nextchild = NULL; + node->allowInCritSection = false; + } + + VALGRIND_CREATE_MEMPOOL(node, 0, false); +} + +/* + * MemoryContextAlloc + * Allocate space within the specified context. + * + * This could be turned into a macro, but we'd have to import + * nodes/memnodes.h into postgres.h which seems a bad idea. + */ +void * +MemoryContextAlloc(MemoryContext context, Size size) +{ + void *ret; + + AssertArg(MemoryContextIsValid(context)); + AssertNotInCriticalSection(context); + + if (!AllocSizeIsValid(size)) + elog(ERROR, "invalid memory alloc request size %zu", size); + + context->isReset = false; + + ret = context->methods->alloc(context, size); + if (unlikely(ret == NULL)) + { + MemoryContextStats(TopMemoryContext); + + /* + * Here, and elsewhere in this module, we show the target context's + * "name" but not its "ident" (if any) in user-visible error messages. + * The "ident" string might contain security-sensitive data, such as + * values in SQL commands. + */ + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed on request of size %zu in memory context \"%s\".", + size, context->name))); + } + + VALGRIND_MEMPOOL_ALLOC(context, ret, size); + + return ret; +} + +/* + * MemoryContextAllocZero + * Like MemoryContextAlloc, but clears allocated memory + * + * We could just call MemoryContextAlloc then clear the memory, but this + * is a very common combination, so we provide the combined operation. + */ +void * +MemoryContextAllocZero(MemoryContext context, Size size) +{ + void *ret; + + AssertArg(MemoryContextIsValid(context)); + AssertNotInCriticalSection(context); + + if (!AllocSizeIsValid(size)) + elog(ERROR, "invalid memory alloc request size %zu", size); + + context->isReset = false; + + ret = context->methods->alloc(context, size); + if (unlikely(ret == NULL)) + { + MemoryContextStats(TopMemoryContext); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed on request of size %zu in memory context \"%s\".", + size, context->name))); + } + + VALGRIND_MEMPOOL_ALLOC(context, ret, size); + + MemSetAligned(ret, 0, size); + + return ret; +} + +/* + * MemoryContextAllocZeroAligned + * MemoryContextAllocZero where length is suitable for MemSetLoop + * + * This might seem overly specialized, but it's not because newNode() + * is so often called with compile-time-constant sizes. 
+ */ +void * +MemoryContextAllocZeroAligned(MemoryContext context, Size size) +{ + void *ret; + + AssertArg(MemoryContextIsValid(context)); + AssertNotInCriticalSection(context); + + if (!AllocSizeIsValid(size)) + elog(ERROR, "invalid memory alloc request size %zu", size); + + context->isReset = false; + + ret = context->methods->alloc(context, size); + if (unlikely(ret == NULL)) + { + MemoryContextStats(TopMemoryContext); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed on request of size %zu in memory context \"%s\".", + size, context->name))); + } + + VALGRIND_MEMPOOL_ALLOC(context, ret, size); + + MemSetLoop(ret, 0, size); + + return ret; +} + +/* + * MemoryContextAllocExtended + * Allocate space within the specified context using the given flags. + */ +void * +MemoryContextAllocExtended(MemoryContext context, Size size, int flags) +{ + void *ret; + + AssertArg(MemoryContextIsValid(context)); + AssertNotInCriticalSection(context); + + if (((flags & MCXT_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) || + ((flags & MCXT_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size))) + elog(ERROR, "invalid memory alloc request size %zu", size); + + context->isReset = false; + + ret = context->methods->alloc(context, size); + if (unlikely(ret == NULL)) + { + if ((flags & MCXT_ALLOC_NO_OOM) == 0) + { + MemoryContextStats(TopMemoryContext); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed on request of size %zu in memory context \"%s\".", + size, context->name))); + } + return NULL; + } + + VALGRIND_MEMPOOL_ALLOC(context, ret, size); + + if ((flags & MCXT_ALLOC_ZERO) != 0) + MemSetAligned(ret, 0, size); + + return ret; +} + +/* + * HandleLogMemoryContextInterrupt + * Handle receipt of an interrupt indicating logging of memory + * contexts. + * + * All the actual work is deferred to ProcessLogMemoryContextInterrupt(), + * because we cannot safely emit a log message inside the signal handler. + */ +void +HandleLogMemoryContextInterrupt(void) +{ + InterruptPending = true; + LogMemoryContextPending = true; + /* latch will be set by procsignal_sigusr1_handler */ +} + +/* + * ProcessLogMemoryContextInterrupt + * Perform logging of memory contexts of this backend process. + * + * Any backend that participates in ProcSignal signaling must arrange + * to call this function if we see LogMemoryContextPending set. + * It is called from CHECK_FOR_INTERRUPTS(), which is enough because + * the target process for logging of memory contexts is a backend. + */ +void +ProcessLogMemoryContextInterrupt(void) +{ + LogMemoryContextPending = false; + + /* + * Use LOG_SERVER_ONLY to prevent this message from being sent to the + * connected client. + */ + ereport(LOG_SERVER_ONLY, + (errhidestmt(true), + errhidecontext(true), + errmsg("logging memory contexts of PID %d", MyProcPid))); + + /* + * When a backend process is consuming huge memory, logging all its memory + * contexts might overrun available disk space. To prevent this, we limit + * the number of child contexts to log per parent to 100. + * + * As with MemoryContextStats(), we suppose that practical cases where the + * dump gets long will typically be huge numbers of siblings under the + * same parent context; while the additional debugging value from seeing + * details about individual siblings beyond 100 will not be large. 
+ */ + MemoryContextStatsDetail(TopMemoryContext, 100, false); +} + +void * +palloc(Size size) +{ + /* duplicates MemoryContextAlloc to avoid increased overhead */ + void *ret; + MemoryContext context = CurrentMemoryContext; + + AssertArg(MemoryContextIsValid(context)); + AssertNotInCriticalSection(context); + + if (!AllocSizeIsValid(size)) + elog(ERROR, "invalid memory alloc request size %zu", size); + + context->isReset = false; + + ret = context->methods->alloc(context, size); + if (unlikely(ret == NULL)) + { + MemoryContextStats(TopMemoryContext); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed on request of size %zu in memory context \"%s\".", + size, context->name))); + } + + VALGRIND_MEMPOOL_ALLOC(context, ret, size); + + return ret; +} + +void * +palloc0(Size size) +{ + /* duplicates MemoryContextAllocZero to avoid increased overhead */ + void *ret; + MemoryContext context = CurrentMemoryContext; + + AssertArg(MemoryContextIsValid(context)); + AssertNotInCriticalSection(context); + + if (!AllocSizeIsValid(size)) + elog(ERROR, "invalid memory alloc request size %zu", size); + + context->isReset = false; + + ret = context->methods->alloc(context, size); + if (unlikely(ret == NULL)) + { + MemoryContextStats(TopMemoryContext); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed on request of size %zu in memory context \"%s\".", + size, context->name))); + } + + VALGRIND_MEMPOOL_ALLOC(context, ret, size); + + MemSetAligned(ret, 0, size); + + return ret; +} + +void * +palloc_extended(Size size, int flags) +{ + /* duplicates MemoryContextAllocExtended to avoid increased overhead */ + void *ret; + MemoryContext context = CurrentMemoryContext; + + AssertArg(MemoryContextIsValid(context)); + AssertNotInCriticalSection(context); + + if (((flags & MCXT_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) || + ((flags & MCXT_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size))) + elog(ERROR, "invalid memory alloc request size %zu", size); + + context->isReset = false; + + ret = context->methods->alloc(context, size); + if (unlikely(ret == NULL)) + { + if ((flags & MCXT_ALLOC_NO_OOM) == 0) + { + MemoryContextStats(TopMemoryContext); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed on request of size %zu in memory context \"%s\".", + size, context->name))); + } + return NULL; + } + + VALGRIND_MEMPOOL_ALLOC(context, ret, size); + + if ((flags & MCXT_ALLOC_ZERO) != 0) + MemSetAligned(ret, 0, size); + + return ret; +} + +/* + * pfree + * Release an allocated chunk. + */ +void +pfree(void *pointer) +{ + MemoryContext context = GetMemoryChunkContext(pointer); + + context->methods->free_p(context, pointer); + VALGRIND_MEMPOOL_FREE(context, pointer); +} + +/* + * repalloc + * Adjust the size of a previously allocated chunk. 
+ */ +void * +repalloc(void *pointer, Size size) +{ + MemoryContext context = GetMemoryChunkContext(pointer); + void *ret; + + if (!AllocSizeIsValid(size)) + elog(ERROR, "invalid memory alloc request size %zu", size); + + AssertNotInCriticalSection(context); + + /* isReset must be false already */ + Assert(!context->isReset); + + ret = context->methods->realloc(context, pointer, size); + if (unlikely(ret == NULL)) + { + MemoryContextStats(TopMemoryContext); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed on request of size %zu in memory context \"%s\".", + size, context->name))); + } + + VALGRIND_MEMPOOL_CHANGE(context, pointer, ret, size); + + return ret; +} + +/* + * MemoryContextAllocHuge + * Allocate (possibly-expansive) space within the specified context. + * + * See considerations in comment at MaxAllocHugeSize. + */ +void * +MemoryContextAllocHuge(MemoryContext context, Size size) +{ + void *ret; + + AssertArg(MemoryContextIsValid(context)); + AssertNotInCriticalSection(context); + + if (!AllocHugeSizeIsValid(size)) + elog(ERROR, "invalid memory alloc request size %zu", size); + + context->isReset = false; + + ret = context->methods->alloc(context, size); + if (unlikely(ret == NULL)) + { + MemoryContextStats(TopMemoryContext); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed on request of size %zu in memory context \"%s\".", + size, context->name))); + } + + VALGRIND_MEMPOOL_ALLOC(context, ret, size); + + return ret; +} + +/* + * repalloc_huge + * Adjust the size of a previously allocated chunk, permitting a large + * value. The previous allocation need not have been "huge". + */ +void * +repalloc_huge(void *pointer, Size size) +{ + MemoryContext context = GetMemoryChunkContext(pointer); + void *ret; + + if (!AllocHugeSizeIsValid(size)) + elog(ERROR, "invalid memory alloc request size %zu", size); + + AssertNotInCriticalSection(context); + + /* isReset must be false already */ + Assert(!context->isReset); + + ret = context->methods->realloc(context, pointer, size); + if (unlikely(ret == NULL)) + { + MemoryContextStats(TopMemoryContext); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed on request of size %zu in memory context \"%s\".", + size, context->name))); + } + + VALGRIND_MEMPOOL_CHANGE(context, pointer, ret, size); + + return ret; +} + +/* + * MemoryContextStrdup + * Like strdup(), but allocate from the specified context + */ +char * +MemoryContextStrdup(MemoryContext context, const char *string) +{ + char *nstr; + Size len = strlen(string) + 1; + + nstr = (char *) MemoryContextAlloc(context, len); + + memcpy(nstr, string, len); + + return nstr; +} + +char * +pstrdup(const char *in) +{ + return MemoryContextStrdup(CurrentMemoryContext, in); +} + +/* + * pnstrdup + * Like pstrdup(), but append null byte to a + * not-necessarily-null-terminated input string. + */ +char * +pnstrdup(const char *in, Size len) +{ + char *out; + + len = strnlen(in, len); + + out = palloc(len + 1); + memcpy(out, in, len); + out[len] = '\0'; + + return out; +} + +/* + * Make copy of string with all trailing newline characters removed. 
+ */ +char * +pchomp(const char *in) +{ + size_t n; + + n = strlen(in); + while (n > 0 && in[n - 1] == '\n') + n--; + return pnstrdup(in, n); +} diff --git a/src/backend/utils/mmgr/memdebug.c b/src/backend/utils/mmgr/memdebug.c new file mode 100644 index 0000000..3644c7f --- /dev/null +++ b/src/backend/utils/mmgr/memdebug.c @@ -0,0 +1,93 @@ +/*------------------------------------------------------------------------- + * + * memdebug.c + * Declarations used in memory context implementations, not part of the + * public API of the memory management subsystem. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/utils/mmgr/memdebug.c + * + * + * About CLOBBER_FREED_MEMORY: + * + * If this symbol is defined, all freed memory is overwritten with 0x7F's. + * This is useful for catching places that reference already-freed memory. + * + * About MEMORY_CONTEXT_CHECKING: + * + * Since we usually round request sizes up to the next power of 2, there + * is often some unused space immediately after a requested data area. + * Thus, if someone makes the common error of writing past what they've + * requested, the problem is likely to go unnoticed ... until the day when + * there *isn't* any wasted space, perhaps because of different memory + * alignment on a new platform, or some other effect. To catch this sort + * of problem, the MEMORY_CONTEXT_CHECKING option stores 0x7E just beyond + * the requested space whenever the request is less than the actual chunk + * size, and verifies that the byte is undamaged when the chunk is freed. + * + * + * About USE_VALGRIND and Valgrind client requests: + * + * Valgrind provides "client request" macros that exchange information with + * the host Valgrind (if any). Under !USE_VALGRIND, memdebug.h stubs out + * currently-used macros. + * + * When running under Valgrind, we want a NOACCESS memory region both before + * and after the allocation. The chunk header is tempting as the preceding + * region, but mcxt.c expects to able to examine the standard chunk header + * fields. Therefore, we use, when available, the requested_size field and + * any subsequent padding. requested_size is made NOACCESS before returning + * a chunk pointer to a caller. However, to reduce client request traffic, + * it is kept DEFINED in chunks on the free list. + * + * The rounded-up capacity of the chunk usually acts as a post-allocation + * NOACCESS region. If the request consumes precisely the entire chunk, + * there is no such region; another chunk header may immediately follow. In + * that case, Valgrind will not detect access beyond the end of the chunk. + * + * See also the cooperating Valgrind client requests in mcxt.c. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "utils/memdebug.h" + +#ifdef RANDOMIZE_ALLOCATED_MEMORY + +/* + * Fill a just-allocated piece of memory with "random" data. It's not really + * very random, just a repeating sequence with a length that's prime. What + * we mainly want out of it is to have a good probability that two palloc's + * of the same number of bytes start out containing different data. + * + * The region may be NOACCESS, so make it UNDEFINED first to avoid errors as + * we fill it. Filling the region makes it DEFINED, so make it UNDEFINED + * again afterward. Whether to finally make it UNDEFINED or NOACCESS is + * fairly arbitrary. 
UNDEFINED is more convenient for SlabRealloc(), and + * other callers have no preference. + */ +void +randomize_mem(char *ptr, size_t size) +{ + static int save_ctr = 1; + size_t remaining = size; + int ctr; + + ctr = save_ctr; + VALGRIND_MAKE_MEM_UNDEFINED(ptr, size); + while (remaining-- > 0) + { + *ptr++ = ctr; + if (++ctr > 251) + ctr = 1; + } + VALGRIND_MAKE_MEM_UNDEFINED(ptr - size, size); + save_ctr = ctr; +} + +#endif /* RANDOMIZE_ALLOCATED_MEMORY */ diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c new file mode 100644 index 0000000..d93c591 --- /dev/null +++ b/src/backend/utils/mmgr/portalmem.c @@ -0,0 +1,1340 @@ +/*------------------------------------------------------------------------- + * + * portalmem.c + * backend portal memory management + * + * Portals are objects representing the execution state of a query. + * This module provides memory management services for portals, but it + * doesn't actually run the executor for them. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/mmgr/portalmem.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xact.h" +#include "catalog/pg_type.h" +#include "commands/portalcmds.h" +#include "miscadmin.h" +#include "storage/ipc.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/snapmgr.h" +#include "utils/timestamp.h" + +/* + * Estimate of the maximum number of open portals a user would have, + * used in initially sizing the PortalHashTable in EnablePortalManager(). + * Since the hash table can expand, there's no need to make this overly + * generous, and keeping it small avoids unnecessary overhead in the + * hash_seq_search() calls executed during transaction end. + */ +#define PORTALS_PER_USER 16 + + +/* ---------------- + * Global state + * ---------------- + */ + +#define MAX_PORTALNAME_LEN NAMEDATALEN + +typedef struct portalhashent +{ + char portalname[MAX_PORTALNAME_LEN]; + Portal portal; +} PortalHashEnt; + +static HTAB *PortalHashTable = NULL; + +#define PortalHashTableLookup(NAME, PORTAL) \ +do { \ + PortalHashEnt *hentry; \ + \ + hentry = (PortalHashEnt *) hash_search(PortalHashTable, \ + (NAME), HASH_FIND, NULL); \ + if (hentry) \ + PORTAL = hentry->portal; \ + else \ + PORTAL = NULL; \ +} while(0) + +#define PortalHashTableInsert(PORTAL, NAME) \ +do { \ + PortalHashEnt *hentry; bool found; \ + \ + hentry = (PortalHashEnt *) hash_search(PortalHashTable, \ + (NAME), HASH_ENTER, &found); \ + if (found) \ + elog(ERROR, "duplicate portal name"); \ + hentry->portal = PORTAL; \ + /* To avoid duplicate storage, make PORTAL->name point to htab entry */ \ + PORTAL->name = hentry->portalname; \ +} while(0) + +#define PortalHashTableDelete(PORTAL) \ +do { \ + PortalHashEnt *hentry; \ + \ + hentry = (PortalHashEnt *) hash_search(PortalHashTable, \ + PORTAL->name, HASH_REMOVE, NULL); \ + if (hentry == NULL) \ + elog(WARNING, "trying to delete portal name that does not exist"); \ +} while(0) + +static MemoryContext TopPortalContext = NULL; + + +/* ---------------------------------------------------------------- + * public portal interface functions + * ---------------------------------------------------------------- + */ + +/* + * EnablePortalManager + * Enables the portal management module at backend startup. 
+ */ +void +EnablePortalManager(void) +{ + HASHCTL ctl; + + Assert(TopPortalContext == NULL); + + TopPortalContext = AllocSetContextCreate(TopMemoryContext, + "TopPortalContext", + ALLOCSET_DEFAULT_SIZES); + + ctl.keysize = MAX_PORTALNAME_LEN; + ctl.entrysize = sizeof(PortalHashEnt); + + /* + * use PORTALS_PER_USER as a guess of how many hash table entries to + * create, initially + */ + PortalHashTable = hash_create("Portal hash", PORTALS_PER_USER, + &ctl, HASH_ELEM | HASH_STRINGS); +} + +/* + * GetPortalByName + * Returns a portal given a portal name, or NULL if name not found. + */ +Portal +GetPortalByName(const char *name) +{ + Portal portal; + + if (PointerIsValid(name)) + PortalHashTableLookup(name, portal); + else + portal = NULL; + + return portal; +} + +/* + * PortalGetPrimaryStmt + * Get the "primary" stmt within a portal, ie, the one marked canSetTag. + * + * Returns NULL if no such stmt. If multiple PlannedStmt structs within the + * portal are marked canSetTag, returns the first one. Neither of these + * cases should occur in present usages of this function. + */ +PlannedStmt * +PortalGetPrimaryStmt(Portal portal) +{ + ListCell *lc; + + foreach(lc, portal->stmts) + { + PlannedStmt *stmt = lfirst_node(PlannedStmt, lc); + + if (stmt->canSetTag) + return stmt; + } + return NULL; +} + +/* + * CreatePortal + * Returns a new portal given a name. + * + * allowDup: if true, automatically drop any pre-existing portal of the + * same name (if false, an error is raised). + * + * dupSilent: if true, don't even emit a WARNING. + */ +Portal +CreatePortal(const char *name, bool allowDup, bool dupSilent) +{ + Portal portal; + + AssertArg(PointerIsValid(name)); + + portal = GetPortalByName(name); + if (PortalIsValid(portal)) + { + if (!allowDup) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_CURSOR), + errmsg("cursor \"%s\" already exists", name))); + if (!dupSilent) + ereport(WARNING, + (errcode(ERRCODE_DUPLICATE_CURSOR), + errmsg("closing existing cursor \"%s\"", + name))); + PortalDrop(portal, false); + } + + /* make new portal structure */ + portal = (Portal) MemoryContextAllocZero(TopPortalContext, sizeof *portal); + + /* initialize portal context; typically it won't store much */ + portal->portalContext = AllocSetContextCreate(TopPortalContext, + "PortalContext", + ALLOCSET_SMALL_SIZES); + + /* create a resource owner for the portal */ + portal->resowner = ResourceOwnerCreate(CurTransactionResourceOwner, + "Portal"); + + /* initialize portal fields that don't start off zero */ + portal->status = PORTAL_NEW; + portal->cleanup = PortalCleanup; + portal->createSubid = GetCurrentSubTransactionId(); + portal->activeSubid = portal->createSubid; + portal->createLevel = GetCurrentTransactionNestLevel(); + portal->strategy = PORTAL_MULTI_QUERY; + portal->cursorOptions = CURSOR_OPT_NO_SCROLL; + portal->atStart = true; + portal->atEnd = true; /* disallow fetches until query is set */ + portal->visible = true; + portal->creation_time = GetCurrentStatementStartTimestamp(); + + /* put portal in table (sets portal->name) */ + PortalHashTableInsert(portal, name); + + /* for named portals reuse portal->name copy */ + MemoryContextSetIdentifier(portal->portalContext, portal->name[0] ? portal->name : "<unnamed>"); + + return portal; +} + +/* + * CreateNewPortal + * Create a new portal, assigning it a random nonconflicting name. 
+ */ +Portal +CreateNewPortal(void) +{ + static unsigned int unnamed_portal_count = 0; + + char portalname[MAX_PORTALNAME_LEN]; + + /* Select a nonconflicting name */ + for (;;) + { + unnamed_portal_count++; + sprintf(portalname, "<unnamed portal %u>", unnamed_portal_count); + if (GetPortalByName(portalname) == NULL) + break; + } + + return CreatePortal(portalname, false, false); +} + +/* + * PortalDefineQuery + * A simple subroutine to establish a portal's query. + * + * Notes: as of PG 8.4, caller MUST supply a sourceText string; it is not + * allowed anymore to pass NULL. (If you really don't have source text, + * you can pass a constant string, perhaps "(query not available)".) + * + * commandTag shall be NULL if and only if the original query string + * (before rewriting) was an empty string. Also, the passed commandTag must + * be a pointer to a constant string, since it is not copied. + * + * If cplan is provided, then it is a cached plan containing the stmts, and + * the caller must have done GetCachedPlan(), causing a refcount increment. + * The refcount will be released when the portal is destroyed. + * + * If cplan is NULL, then it is the caller's responsibility to ensure that + * the passed plan trees have adequate lifetime. Typically this is done by + * copying them into the portal's context. + * + * The caller is also responsible for ensuring that the passed prepStmtName + * (if not NULL) and sourceText have adequate lifetime. + * + * NB: this function mustn't do much beyond storing the passed values; in + * particular don't do anything that risks elog(ERROR). If that were to + * happen here before storing the cplan reference, we'd leak the plancache + * refcount that the caller is trying to hand off to us. + */ +void +PortalDefineQuery(Portal portal, + const char *prepStmtName, + const char *sourceText, + CommandTag commandTag, + List *stmts, + CachedPlan *cplan) +{ + AssertArg(PortalIsValid(portal)); + AssertState(portal->status == PORTAL_NEW); + + AssertArg(sourceText != NULL); + AssertArg(commandTag != CMDTAG_UNKNOWN || stmts == NIL); + + portal->prepStmtName = prepStmtName; + portal->sourceText = sourceText; + portal->qc.commandTag = commandTag; + portal->qc.nprocessed = 0; + portal->commandTag = commandTag; + portal->stmts = stmts; + portal->cplan = cplan; + portal->status = PORTAL_DEFINED; +} + +/* + * PortalReleaseCachedPlan + * Release a portal's reference to its cached plan, if any. + */ +static void +PortalReleaseCachedPlan(Portal portal) +{ + if (portal->cplan) + { + ReleaseCachedPlan(portal->cplan, NULL); + portal->cplan = NULL; + + /* + * We must also clear portal->stmts which is now a dangling reference + * to the cached plan's plan list. This protects any code that might + * try to examine the Portal later. + */ + portal->stmts = NIL; + } +} + +/* + * PortalCreateHoldStore + * Create the tuplestore for a portal. + */ +void +PortalCreateHoldStore(Portal portal) +{ + MemoryContext oldcxt; + + Assert(portal->holdContext == NULL); + Assert(portal->holdStore == NULL); + Assert(portal->holdSnapshot == NULL); + + /* + * Create the memory context that is used for storage of the tuple set. + * Note this is NOT a child of the portal's portalContext. + */ + portal->holdContext = + AllocSetContextCreate(TopPortalContext, + "PortalHoldContext", + ALLOCSET_DEFAULT_SIZES); + + /* + * Create the tuple store, selecting cross-transaction temp files, and + * enabling random access only if cursor requires scrolling. 
+ * + * XXX: Should maintenance_work_mem be used for the portal size? + */ + oldcxt = MemoryContextSwitchTo(portal->holdContext); + + portal->holdStore = + tuplestore_begin_heap(portal->cursorOptions & CURSOR_OPT_SCROLL, + true, work_mem); + + MemoryContextSwitchTo(oldcxt); +} + +/* + * PinPortal + * Protect a portal from dropping. + * + * A pinned portal is still unpinned and dropped at transaction or + * subtransaction abort. + */ +void +PinPortal(Portal portal) +{ + if (portal->portalPinned) + elog(ERROR, "portal already pinned"); + + portal->portalPinned = true; +} + +void +UnpinPortal(Portal portal) +{ + if (!portal->portalPinned) + elog(ERROR, "portal not pinned"); + + portal->portalPinned = false; +} + +/* + * MarkPortalActive + * Transition a portal from READY to ACTIVE state. + * + * NOTE: never set portal->status = PORTAL_ACTIVE directly; call this instead. + */ +void +MarkPortalActive(Portal portal) +{ + /* For safety, this is a runtime test not just an Assert */ + if (portal->status != PORTAL_READY) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("portal \"%s\" cannot be run", portal->name))); + /* Perform the state transition */ + portal->status = PORTAL_ACTIVE; + portal->activeSubid = GetCurrentSubTransactionId(); +} + +/* + * MarkPortalDone + * Transition a portal from ACTIVE to DONE state. + * + * NOTE: never set portal->status = PORTAL_DONE directly; call this instead. + */ +void +MarkPortalDone(Portal portal) +{ + /* Perform the state transition */ + Assert(portal->status == PORTAL_ACTIVE); + portal->status = PORTAL_DONE; + + /* + * Allow portalcmds.c to clean up the state it knows about. We might as + * well do that now, since the portal can't be executed any more. + * + * In some cases involving execution of a ROLLBACK command in an already + * aborted transaction, this is necessary, or we'd reach AtCleanup_Portals + * with the cleanup hook still unexecuted. + */ + if (PointerIsValid(portal->cleanup)) + { + portal->cleanup(portal); + portal->cleanup = NULL; + } +} + +/* + * MarkPortalFailed + * Transition a portal into FAILED state. + * + * NOTE: never set portal->status = PORTAL_FAILED directly; call this instead. + */ +void +MarkPortalFailed(Portal portal) +{ + /* Perform the state transition */ + Assert(portal->status != PORTAL_DONE); + portal->status = PORTAL_FAILED; + + /* + * Allow portalcmds.c to clean up the state it knows about. We might as + * well do that now, since the portal can't be executed any more. + * + * In some cases involving cleanup of an already aborted transaction, this + * is necessary, or we'd reach AtCleanup_Portals with the cleanup hook + * still unexecuted. + */ + if (PointerIsValid(portal->cleanup)) + { + portal->cleanup(portal); + portal->cleanup = NULL; + } +} + +/* + * PortalDrop + * Destroy the portal. + */ +void +PortalDrop(Portal portal, bool isTopCommit) +{ + AssertArg(PortalIsValid(portal)); + + /* + * Don't allow dropping a pinned portal, it's still needed by whoever + * pinned it. + */ + if (portal->portalPinned) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cannot drop pinned portal \"%s\"", portal->name))); + + /* + * Not sure if the PORTAL_ACTIVE case can validly happen or not... + */ + if (portal->status == PORTAL_ACTIVE) + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_STATE), + errmsg("cannot drop active portal \"%s\"", portal->name))); + + /* + * Allow portalcmds.c to clean up the state it knows about, in particular + * shutting down the executor if still active. 
This step potentially runs + * user-defined code so failure has to be expected. It's the cleanup + * hook's responsibility to not try to do that more than once, in the case + * that failure occurs and then we come back to drop the portal again + * during transaction abort. + * + * Note: in most paths of control, this will have been done already in + * MarkPortalDone or MarkPortalFailed. We're just making sure. + */ + if (PointerIsValid(portal->cleanup)) + { + portal->cleanup(portal); + portal->cleanup = NULL; + } + + /* There shouldn't be an active snapshot anymore, except after error */ + Assert(portal->portalSnapshot == NULL || !isTopCommit); + + /* + * Remove portal from hash table. Because we do this here, we will not + * come back to try to remove the portal again if there's any error in the + * subsequent steps. Better to leak a little memory than to get into an + * infinite error-recovery loop. + */ + PortalHashTableDelete(portal); + + /* drop cached plan reference, if any */ + PortalReleaseCachedPlan(portal); + + /* + * If portal has a snapshot protecting its data, release that. This needs + * a little care since the registration will be attached to the portal's + * resowner; if the portal failed, we will already have released the + * resowner (and the snapshot) during transaction abort. + */ + if (portal->holdSnapshot) + { + if (portal->resowner) + UnregisterSnapshotFromOwner(portal->holdSnapshot, + portal->resowner); + portal->holdSnapshot = NULL; + } + + /* + * Release any resources still attached to the portal. There are several + * cases being covered here: + * + * Top transaction commit (indicated by isTopCommit): normally we should + * do nothing here and let the regular end-of-transaction resource + * releasing mechanism handle these resources too. However, if we have a + * FAILED portal (eg, a cursor that got an error), we'd better clean up + * its resources to avoid resource-leakage warning messages. + * + * Sub transaction commit: never comes here at all, since we don't kill + * any portals in AtSubCommit_Portals(). + * + * Main or sub transaction abort: we will do nothing here because + * portal->resowner was already set NULL; the resources were already + * cleaned up in transaction abort. + * + * Ordinary portal drop: must release resources. However, if the portal + * is not FAILED then we do not release its locks. The locks become the + * responsibility of the transaction's ResourceOwner (since it is the + * parent of the portal's owner) and will be released when the transaction + * eventually ends. + */ + if (portal->resowner && + (!isTopCommit || portal->status == PORTAL_FAILED)) + { + bool isCommit = (portal->status != PORTAL_FAILED); + + ResourceOwnerRelease(portal->resowner, + RESOURCE_RELEASE_BEFORE_LOCKS, + isCommit, false); + ResourceOwnerRelease(portal->resowner, + RESOURCE_RELEASE_LOCKS, + isCommit, false); + ResourceOwnerRelease(portal->resowner, + RESOURCE_RELEASE_AFTER_LOCKS, + isCommit, false); + ResourceOwnerDelete(portal->resowner); + } + portal->resowner = NULL; + + /* + * Delete tuplestore if present. We should do this even under error + * conditions; since the tuplestore would have been using cross- + * transaction storage, its temp files need to be explicitly deleted. 
+ */ + if (portal->holdStore) + { + MemoryContext oldcontext; + + oldcontext = MemoryContextSwitchTo(portal->holdContext); + tuplestore_end(portal->holdStore); + MemoryContextSwitchTo(oldcontext); + portal->holdStore = NULL; + } + + /* delete tuplestore storage, if any */ + if (portal->holdContext) + MemoryContextDelete(portal->holdContext); + + /* release subsidiary storage */ + MemoryContextDelete(portal->portalContext); + + /* release portal struct (it's in TopPortalContext) */ + pfree(portal); +} + +/* + * Delete all declared cursors. + * + * Used by commands: CLOSE ALL, DISCARD ALL + */ +void +PortalHashTableDeleteAll(void) +{ + HASH_SEQ_STATUS status; + PortalHashEnt *hentry; + + if (PortalHashTable == NULL) + return; + + hash_seq_init(&status, PortalHashTable); + while ((hentry = hash_seq_search(&status)) != NULL) + { + Portal portal = hentry->portal; + + /* Can't close the active portal (the one running the command) */ + if (portal->status == PORTAL_ACTIVE) + continue; + + PortalDrop(portal, false); + + /* Restart the iteration in case that led to other drops */ + hash_seq_term(&status); + hash_seq_init(&status, PortalHashTable); + } +} + +/* + * "Hold" a portal. Prepare it for access by later transactions. + */ +static void +HoldPortal(Portal portal) +{ + /* + * Note that PersistHoldablePortal() must release all resources used by + * the portal that are local to the creating transaction. + */ + PortalCreateHoldStore(portal); + PersistHoldablePortal(portal); + + /* drop cached plan reference, if any */ + PortalReleaseCachedPlan(portal); + + /* + * Any resources belonging to the portal will be released in the upcoming + * transaction-wide cleanup; the portal will no longer have its own + * resources. + */ + portal->resowner = NULL; + + /* + * Having successfully exported the holdable cursor, mark it as not + * belonging to this transaction. + */ + portal->createSubid = InvalidSubTransactionId; + portal->activeSubid = InvalidSubTransactionId; + portal->createLevel = 0; +} + +/* + * Pre-commit processing for portals. + * + * Holdable cursors created in this transaction need to be converted to + * materialized form, since we are going to close down the executor and + * release locks. Non-holdable portals created in this transaction are + * simply removed. Portals remaining from prior transactions should be + * left untouched. + * + * Returns true if any portals changed state (possibly causing user-defined + * code to be run), false if not. + */ +bool +PreCommit_Portals(bool isPrepare) +{ + bool result = false; + HASH_SEQ_STATUS status; + PortalHashEnt *hentry; + + hash_seq_init(&status, PortalHashTable); + + while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL) + { + Portal portal = hentry->portal; + + /* + * There should be no pinned portals anymore. Complain if someone + * leaked one. Auto-held portals are allowed; we assume that whoever + * pinned them is managing them. + */ + if (portal->portalPinned && !portal->autoHeld) + elog(ERROR, "cannot commit while a portal is pinned"); + + /* + * Do not touch active portals --- this can only happen in the case of + * a multi-transaction utility command, such as VACUUM, or a commit in + * a procedure. + * + * Note however that any resource owner attached to such a portal is + * still going to go away, so don't leave a dangling pointer. Also + * unregister any snapshots held by the portal, mainly to avoid + * snapshot leak warnings from ResourceOwnerRelease(). 
+ */ + if (portal->status == PORTAL_ACTIVE) + { + if (portal->holdSnapshot) + { + if (portal->resowner) + UnregisterSnapshotFromOwner(portal->holdSnapshot, + portal->resowner); + portal->holdSnapshot = NULL; + } + portal->resowner = NULL; + /* Clear portalSnapshot too, for cleanliness */ + portal->portalSnapshot = NULL; + continue; + } + + /* Is it a holdable portal created in the current xact? */ + if ((portal->cursorOptions & CURSOR_OPT_HOLD) && + portal->createSubid != InvalidSubTransactionId && + portal->status == PORTAL_READY) + { + /* + * We are exiting the transaction that created a holdable cursor. + * Instead of dropping the portal, prepare it for access by later + * transactions. + * + * However, if this is PREPARE TRANSACTION rather than COMMIT, + * refuse PREPARE, because the semantics seem pretty unclear. + */ + if (isPrepare) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot PREPARE a transaction that has created a cursor WITH HOLD"))); + + HoldPortal(portal); + + /* Report we changed state */ + result = true; + } + else if (portal->createSubid == InvalidSubTransactionId) + { + /* + * Do nothing to cursors held over from a previous transaction + * (including ones we just froze in a previous cycle of this loop) + */ + continue; + } + else + { + /* Zap all non-holdable portals */ + PortalDrop(portal, true); + + /* Report we changed state */ + result = true; + } + + /* + * After either freezing or dropping a portal, we have to restart the + * iteration, because we could have invoked user-defined code that + * caused a drop of the next portal in the hash chain. + */ + hash_seq_term(&status); + hash_seq_init(&status, PortalHashTable); + } + + return result; +} + +/* + * Abort processing for portals. + * + * At this point we run the cleanup hook if present, but we can't release the + * portal's memory until the cleanup call. + */ +void +AtAbort_Portals(void) +{ + HASH_SEQ_STATUS status; + PortalHashEnt *hentry; + + hash_seq_init(&status, PortalHashTable); + + while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL) + { + Portal portal = hentry->portal; + + /* + * When elog(FATAL) is progress, we need to set the active portal to + * failed, so that PortalCleanup() doesn't run the executor shutdown. + */ + if (portal->status == PORTAL_ACTIVE && shmem_exit_inprogress) + MarkPortalFailed(portal); + + /* + * Do nothing else to cursors held over from a previous transaction. + */ + if (portal->createSubid == InvalidSubTransactionId) + continue; + + /* + * Do nothing to auto-held cursors. This is similar to the case of a + * cursor from a previous transaction, but it could also be that the + * cursor was auto-held in this transaction, so it wants to live on. + */ + if (portal->autoHeld) + continue; + + /* + * If it was created in the current transaction, we can't do normal + * shutdown on a READY portal either; it might refer to objects + * created in the failed transaction. See comments in + * AtSubAbort_Portals. + */ + if (portal->status == PORTAL_READY) + MarkPortalFailed(portal); + + /* + * Allow portalcmds.c to clean up the state it knows about, if we + * haven't already. + */ + if (PointerIsValid(portal->cleanup)) + { + portal->cleanup(portal); + portal->cleanup = NULL; + } + + /* drop cached plan reference, if any */ + PortalReleaseCachedPlan(portal); + + /* + * Any resources belonging to the portal will be released in the + * upcoming transaction-wide cleanup; they will be gone before we run + * PortalDrop. 
+ */ + portal->resowner = NULL; + + /* + * Although we can't delete the portal data structure proper, we can + * release any memory in subsidiary contexts, such as executor state. + * The cleanup hook was the last thing that might have needed data + * there. But leave active portals alone. + */ + if (portal->status != PORTAL_ACTIVE) + MemoryContextDeleteChildren(portal->portalContext); + } +} + +/* + * Post-abort cleanup for portals. + * + * Delete all portals not held over from prior transactions. */ +void +AtCleanup_Portals(void) +{ + HASH_SEQ_STATUS status; + PortalHashEnt *hentry; + + hash_seq_init(&status, PortalHashTable); + + while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL) + { + Portal portal = hentry->portal; + + /* + * Do not touch active portals --- this can only happen in the case of + * a multi-transaction command. + */ + if (portal->status == PORTAL_ACTIVE) + continue; + + /* + * Do nothing to cursors held over from a previous transaction or + * auto-held ones. + */ + if (portal->createSubid == InvalidSubTransactionId || portal->autoHeld) + { + Assert(portal->status != PORTAL_ACTIVE); + Assert(portal->resowner == NULL); + continue; + } + + /* + * If a portal is still pinned, forcibly unpin it. PortalDrop will not + * let us drop the portal otherwise. Whoever pinned the portal was + * interrupted by the abort too and won't try to use it anymore. + */ + if (portal->portalPinned) + portal->portalPinned = false; + + /* + * We had better not call any user-defined code during cleanup, so if + * the cleanup hook hasn't been run yet, too bad; we'll just skip it. + */ + if (PointerIsValid(portal->cleanup)) + { + elog(WARNING, "skipping cleanup for portal \"%s\"", portal->name); + portal->cleanup = NULL; + } + + /* Zap it. */ + PortalDrop(portal, false); + } +} + +/* + * Portal-related cleanup when we return to the main loop on error. + * + * This is different from the cleanup at transaction abort. Auto-held portals + * are cleaned up on error but not on transaction abort. + */ +void +PortalErrorCleanup(void) +{ + HASH_SEQ_STATUS status; + PortalHashEnt *hentry; + + hash_seq_init(&status, PortalHashTable); + + while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL) + { + Portal portal = hentry->portal; + + if (portal->autoHeld) + { + portal->portalPinned = false; + PortalDrop(portal, false); + } + } +} + +/* + * Pre-subcommit processing for portals. + * + * Reassign portals created or used in the current subtransaction to the + * parent subtransaction. + */ +void +AtSubCommit_Portals(SubTransactionId mySubid, + SubTransactionId parentSubid, + int parentLevel, + ResourceOwner parentXactOwner) +{ + HASH_SEQ_STATUS status; + PortalHashEnt *hentry; + + hash_seq_init(&status, PortalHashTable); + + while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL) + { + Portal portal = hentry->portal; + + if (portal->createSubid == mySubid) + { + portal->createSubid = parentSubid; + portal->createLevel = parentLevel; + if (portal->resowner) + ResourceOwnerNewParent(portal->resowner, parentXactOwner); + } + if (portal->activeSubid == mySubid) + portal->activeSubid = parentSubid; + } +} + +/* + * Subtransaction abort handling for portals. + * + * Deactivate portals created or used during the failed subtransaction. + * Note that per AtSubCommit_Portals, this will catch portals created/used + * in descendants of the subtransaction too. + * + * We don't destroy any portals here; that's done in AtSubCleanup_Portals. 
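+ *
+ * For illustration (not upstream text): a cursor opened inside a savepoint
+ * that is rolled back is marked FAILED here and then dropped by
+ * AtSubCleanup_Portals.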
+ */ +void +AtSubAbort_Portals(SubTransactionId mySubid, + SubTransactionId parentSubid, + ResourceOwner myXactOwner, + ResourceOwner parentXactOwner) +{ + HASH_SEQ_STATUS status; + PortalHashEnt *hentry; + + hash_seq_init(&status, PortalHashTable); + + while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL) + { + Portal portal = hentry->portal; + + /* Was it created in this subtransaction? */ + if (portal->createSubid != mySubid) + { + /* No, but maybe it was used in this subtransaction? */ + if (portal->activeSubid == mySubid) + { + /* Maintain activeSubid until the portal is removed */ + portal->activeSubid = parentSubid; + + /* + * A MarkPortalActive() caller ran an upper-level portal in + * this subtransaction and left the portal ACTIVE. This can't + * happen, but force the portal into FAILED state for the same + * reasons discussed below. + * + * We assume we can get away without forcing upper-level READY + * portals to fail, even if they were run and then suspended. + * In theory a suspended upper-level portal could have + * acquired some references to objects that are about to be + * destroyed, but there should be sufficient defenses against + * such cases: the portal's original query cannot contain such + * references, and any references within, say, cached plans of + * PL/pgSQL functions are not from active queries and should + * be protected by revalidation logic. + */ + if (portal->status == PORTAL_ACTIVE) + MarkPortalFailed(portal); + + /* + * Also, if we failed it during the current subtransaction + * (either just above, or earlier), reattach its resource + * owner to the current subtransaction's resource owner, so + * that any resources it still holds will be released while + * cleaning up this subtransaction. This prevents some corner + * cases wherein we might get Asserts or worse while cleaning + * up objects created during the current subtransaction + * (because they're still referenced within this portal). + */ + if (portal->status == PORTAL_FAILED && portal->resowner) + { + ResourceOwnerNewParent(portal->resowner, myXactOwner); + portal->resowner = NULL; + } + } + /* Done if it wasn't created in this subtransaction */ + continue; + } + + /* + * Force any live portals of my own subtransaction into FAILED state. + * We have to do this because they might refer to objects created or + * changed in the failed subtransaction, leading to crashes within + * ExecutorEnd when portalcmds.c tries to close down the portal. + * Currently, every MarkPortalActive() caller ensures it updates the + * portal status again before relinquishing control, so ACTIVE can't + * happen here. If it does happen, dispose the portal like existing + * MarkPortalActive() callers would. + */ + if (portal->status == PORTAL_READY || + portal->status == PORTAL_ACTIVE) + MarkPortalFailed(portal); + + /* + * Allow portalcmds.c to clean up the state it knows about, if we + * haven't already. + */ + if (PointerIsValid(portal->cleanup)) + { + portal->cleanup(portal); + portal->cleanup = NULL; + } + + /* drop cached plan reference, if any */ + PortalReleaseCachedPlan(portal); + + /* + * Any resources belonging to the portal will be released in the + * upcoming transaction-wide cleanup; they will be gone before we run + * PortalDrop. + */ + portal->resowner = NULL; + + /* + * Although we can't delete the portal data structure proper, we can + * release any memory in subsidiary contexts, such as executor state. + * The cleanup hook was the last thing that might have needed data + * there. 
+ */ + MemoryContextDeleteChildren(portal->portalContext); + } +} + +/* + * Post-subabort cleanup for portals. + * + * Drop all portals created in the failed subtransaction (but note that + * we will not drop any that were reassigned to the parent above). + */ +void +AtSubCleanup_Portals(SubTransactionId mySubid) +{ + HASH_SEQ_STATUS status; + PortalHashEnt *hentry; + + hash_seq_init(&status, PortalHashTable); + + while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL) + { + Portal portal = hentry->portal; + + if (portal->createSubid != mySubid) + continue; + + /* + * If a portal is still pinned, forcibly unpin it. PortalDrop will not + * let us drop the portal otherwise. Whoever pinned the portal was + * interrupted by the abort too and won't try to use it anymore. + */ + if (portal->portalPinned) + portal->portalPinned = false; + + /* + * We had better not call any user-defined code during cleanup, so if + * the cleanup hook hasn't been run yet, too bad; we'll just skip it. + */ + if (PointerIsValid(portal->cleanup)) + { + elog(WARNING, "skipping cleanup for portal \"%s\"", portal->name); + portal->cleanup = NULL; + } + + /* Zap it. */ + PortalDrop(portal, false); + } +} + +/* Find all available cursors */ +Datum +pg_cursor(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + HASH_SEQ_STATUS hash_seq; + PortalHashEnt *hentry; + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not allowed in this context"))); + + /* need to build tuplestore in query context */ + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + /* + * build tupdesc for result tuples. This must match the definition of the + * pg_cursors view in system_views.sql + */ + tupdesc = CreateTemplateTupleDesc(6); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "name", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "statement", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "is_holdable", + BOOLOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "is_binary", + BOOLOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 5, "is_scrollable", + BOOLOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 6, "creation_time", + TIMESTAMPTZOID, -1, 0); + + /* + * We put all the tuples into a tuplestore in one scan of the hashtable. + * This avoids any issue of the hashtable possibly changing between calls. 
+ */ + tupstore = + tuplestore_begin_heap(rsinfo->allowedModes & SFRM_Materialize_Random, + false, work_mem); + + /* generate junk in short-term context */ + MemoryContextSwitchTo(oldcontext); + + hash_seq_init(&hash_seq, PortalHashTable); + while ((hentry = hash_seq_search(&hash_seq)) != NULL) + { + Portal portal = hentry->portal; + Datum values[6]; + bool nulls[6]; + + /* report only "visible" entries */ + if (!portal->visible) + continue; + + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = CStringGetTextDatum(portal->name); + values[1] = CStringGetTextDatum(portal->sourceText); + values[2] = BoolGetDatum(portal->cursorOptions & CURSOR_OPT_HOLD); + values[3] = BoolGetDatum(portal->cursorOptions & CURSOR_OPT_BINARY); + values[4] = BoolGetDatum(portal->cursorOptions & CURSOR_OPT_SCROLL); + values[5] = TimestampTzGetDatum(portal->creation_time); + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + + /* clean up and return the tuplestore */ + tuplestore_donestoring(tupstore); + + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + return (Datum) 0; +} + +bool +ThereAreNoReadyPortals(void) +{ + HASH_SEQ_STATUS status; + PortalHashEnt *hentry; + + hash_seq_init(&status, PortalHashTable); + + while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL) + { + Portal portal = hentry->portal; + + if (portal->status == PORTAL_READY) + return false; + } + + return true; +} + +/* + * Hold all pinned portals. + * + * When initiating a COMMIT or ROLLBACK inside a procedure, this must be + * called to protect internally-generated cursors from being dropped during + * the transaction shutdown. Currently, SPI calls this automatically; PLs + * that initiate COMMIT or ROLLBACK some other way are on the hook to do it + * themselves. (Note that we couldn't do this in, say, AtAbort_Portals + * because we need to run user-defined code while persisting a portal. + * It's too late to do that once transaction abort has started.) + * + * We protect such portals by converting them to held cursors. We mark them + * as "auto-held" so that exception exit knows to clean them up. (In normal, + * non-exception code paths, the PL needs to clean such portals itself, since + * transaction end won't do it anymore; but that should be normal practice + * anyway.) + */ +void +HoldPinnedPortals(void) +{ + HASH_SEQ_STATUS status; + PortalHashEnt *hentry; + + hash_seq_init(&status, PortalHashTable); + + while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL) + { + Portal portal = hentry->portal; + + if (portal->portalPinned && !portal->autoHeld) + { + /* + * Doing transaction control, especially abort, inside a cursor + * loop that is not read-only, for example using UPDATE ... + * RETURNING, has weird semantics issues. Also, this + * implementation wouldn't work, because such portals cannot be + * held. (The core grammar enforces that only SELECT statements + * can drive a cursor, but for example PL/pgSQL does not restrict + * it.) 
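+ *
+ * For illustration (not upstream text): a PL/pgSQL FOR loop over an
+ * UPDATE ... RETURNING query pins its implicit cursor, whose strategy is
+ * PORTAL_ONE_RETURNING rather than PORTAL_ONE_SELECT, so a COMMIT inside
+ * that loop is rejected by the check below.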
+ */ + if (portal->strategy != PORTAL_ONE_SELECT) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot perform transaction commands inside a cursor loop that is not read-only"))); + + /* Verify it's in a suitable state to be held */ + if (portal->status != PORTAL_READY) + elog(ERROR, "pinned portal is not ready to be auto-held"); + + HoldPortal(portal); + portal->autoHeld = true; + } + } +} + +/* + * Drop the outer active snapshots for all portals, so that no snapshots + * remain active. + * + * Like HoldPinnedPortals, this must be called when initiating a COMMIT or + * ROLLBACK inside a procedure. This has to be separate from that since it + * should not be run until we're done with steps that are likely to fail. + * + * It's tempting to fold this into PreCommit_Portals, but to do so, we'd + * need to clean up snapshot management in VACUUM and perhaps other places. + */ +void +ForgetPortalSnapshots(void) +{ + HASH_SEQ_STATUS status; + PortalHashEnt *hentry; + int numPortalSnaps = 0; + int numActiveSnaps = 0; + + /* First, scan PortalHashTable and clear portalSnapshot fields */ + hash_seq_init(&status, PortalHashTable); + + while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL) + { + Portal portal = hentry->portal; + + if (portal->portalSnapshot != NULL) + { + portal->portalSnapshot = NULL; + numPortalSnaps++; + } + /* portal->holdSnapshot will be cleaned up in PreCommit_Portals */ + } + + /* + * Now, pop all the active snapshots, which should be just those that were + * portal snapshots. Ideally we'd drive this directly off the portal + * scan, but there's no good way to visit the portals in the correct + * order. So just cross-check after the fact. + */ + while (ActiveSnapshotSet()) + { + PopActiveSnapshot(); + numActiveSnaps++; + } + + if (numPortalSnaps != numActiveSnaps) + elog(ERROR, "portal snapshots (%d) did not account for all active snapshots (%d)", + numPortalSnaps, numActiveSnaps); +} diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c new file mode 100644 index 0000000..553dd7f --- /dev/null +++ b/src/backend/utils/mmgr/slab.c @@ -0,0 +1,794 @@ +/*------------------------------------------------------------------------- + * + * slab.c + * SLAB allocator definitions. + * + * SLAB is a MemoryContext implementation designed for cases where large + * numbers of equally-sized objects are allocated (and freed). + * + * + * Portions Copyright (c) 2017-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/mmgr/slab.c + * + * + * NOTE: + * The constant allocation size allows significant simplification and various + * optimizations over more general purpose allocators. The blocks are carved + * into chunks of exactly the right size (plus alignment), not wasting any + * memory. + * + * The information about free chunks is maintained both at the block level and + * global (context) level. This is possible as the chunk size (and thus also + * the number of chunks per block) is fixed. + * + * On each block, free chunks are tracked in a simple linked list. Contents + * of free chunks is replaced with an index of the next free chunk, forming + * a very simple linked list. Each block also contains a counter of free + * chunks. Combined with the local block-level freelist, it makes it trivial + * to eventually free the whole block. 
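+ *
+ * Worked example (added for clarity, not upstream text): if chunks 7, 5 and 2
+ * of a fully-allocated block are freed in that order, the block ends up with
+ * firstFreeChunk == 2, chunk 2's payload holding the index 5, chunk 5's
+ * holding 7, and chunk 7's holding chunksPerBlock, which acts as the
+ * end-of-list marker.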
+ * + * At the context level, we use 'freelist' to track blocks ordered by number + * of free chunks, starting with blocks having a single allocated chunk, and + * with completely full blocks on the tail. + * + * This also allows various optimizations - for example when searching for + * free chunk, the allocator reuses space from the fullest blocks first, in + * the hope that some of the less full blocks will get completely empty (and + * returned back to the OS). + * + * For each block, we maintain pointer to the first free chunk - this is quite + * cheap and allows us to skip all the preceding used chunks, eliminating + * a significant number of lookups in many common usage patterns. In the worst + * case this performs as if the pointer was not maintained. + * + * We cache the freelist index for the blocks with the fewest free chunks + * (minFreeChunks), so that we don't have to search the freelist on every + * SlabAlloc() call, which is quite expensive. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "lib/ilist.h" +#include "utils/memdebug.h" +#include "utils/memutils.h" + +/* + * SlabContext is a specialized implementation of MemoryContext. + */ +typedef struct SlabContext +{ + MemoryContextData header; /* Standard memory-context fields */ + /* Allocation parameters for this context: */ + Size chunkSize; /* chunk size */ + Size fullChunkSize; /* chunk size including header and alignment */ + Size blockSize; /* block size */ + Size headerSize; /* allocated size of context header */ + int chunksPerBlock; /* number of chunks per block */ + int minFreeChunks; /* min number of free chunks in any block */ + int nblocks; /* number of blocks allocated */ +#ifdef MEMORY_CONTEXT_CHECKING + bool *freechunks; /* bitmap of free chunks in a block */ +#endif + /* blocks with free space, grouped by number of free chunks: */ + dlist_head freelist[FLEXIBLE_ARRAY_MEMBER]; +} SlabContext; + +/* + * SlabBlock + * Structure of a single block in SLAB allocator. + * + * node: doubly-linked list of blocks in global freelist + * nfree: number of free chunks in this block + * firstFreeChunk: index of the first free chunk + */ +typedef struct SlabBlock +{ + dlist_node node; /* doubly-linked list */ + int nfree; /* number of free chunks */ + int firstFreeChunk; /* index of the first free chunk in the block */ +} SlabBlock; + +/* + * SlabChunk + * The prefix of each piece of memory in a SlabBlock + * + * Note: to meet the memory context APIs, the payload area of the chunk must + * be maxaligned, and the "slab" link must be immediately adjacent to the + * payload area (cf. GetMemoryChunkContext). Since we support no machines on + * which MAXALIGN is more than twice sizeof(void *), this happens without any + * special hacking in this struct declaration. But there is a static + * assertion below that the alignment is done correctly. + */ +typedef struct SlabChunk +{ + SlabBlock *block; /* block owning this chunk */ + SlabContext *slab; /* owning context */ + /* there must not be any padding to reach a MAXALIGN boundary here! 
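+     * (Added note, not upstream text: on a typical 64-bit build this struct
+     * is two 8-byte pointers, 16 bytes in total, already a multiple of
+     * MAXALIGN, so the payload that follows starts maxaligned.)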
*/ +} SlabChunk; + + +#define SlabPointerGetChunk(ptr) \ + ((SlabChunk *)(((char *)(ptr)) - sizeof(SlabChunk))) +#define SlabChunkGetPointer(chk) \ + ((void *)(((char *)(chk)) + sizeof(SlabChunk))) +#define SlabBlockGetChunk(slab, block, idx) \ + ((SlabChunk *) ((char *) (block) + sizeof(SlabBlock) \ + + (idx * slab->fullChunkSize))) +#define SlabBlockStart(block) \ + ((char *) block + sizeof(SlabBlock)) +#define SlabChunkIndex(slab, block, chunk) \ + (((char *) chunk - SlabBlockStart(block)) / slab->fullChunkSize) + +/* + * These functions implement the MemoryContext API for Slab contexts. + */ +static void *SlabAlloc(MemoryContext context, Size size); +static void SlabFree(MemoryContext context, void *pointer); +static void *SlabRealloc(MemoryContext context, void *pointer, Size size); +static void SlabReset(MemoryContext context); +static void SlabDelete(MemoryContext context); +static Size SlabGetChunkSpace(MemoryContext context, void *pointer); +static bool SlabIsEmpty(MemoryContext context); +static void SlabStats(MemoryContext context, + MemoryStatsPrintFunc printfunc, void *passthru, + MemoryContextCounters *totals, + bool print_to_stderr); +#ifdef MEMORY_CONTEXT_CHECKING +static void SlabCheck(MemoryContext context); +#endif + +/* + * This is the virtual function table for Slab contexts. + */ +static const MemoryContextMethods SlabMethods = { + SlabAlloc, + SlabFree, + SlabRealloc, + SlabReset, + SlabDelete, + SlabGetChunkSpace, + SlabIsEmpty, + SlabStats +#ifdef MEMORY_CONTEXT_CHECKING + ,SlabCheck +#endif +}; + + +/* + * SlabContextCreate + * Create a new Slab context. + * + * parent: parent context, or NULL if top-level context + * name: name of context (must be statically allocated) + * blockSize: allocation block size + * chunkSize: allocation chunk size + * + * The chunkSize may not exceed: + * MAXALIGN_DOWN(SIZE_MAX) - MAXALIGN(sizeof(SlabBlock)) - sizeof(SlabChunk) + */ +MemoryContext +SlabContextCreate(MemoryContext parent, + const char *name, + Size blockSize, + Size chunkSize) +{ + int chunksPerBlock; + Size fullChunkSize; + Size freelistSize; + Size headerSize; + SlabContext *slab; + int i; + + /* Assert we padded SlabChunk properly */ + StaticAssertStmt(sizeof(SlabChunk) == MAXALIGN(sizeof(SlabChunk)), + "sizeof(SlabChunk) is not maxaligned"); + StaticAssertStmt(offsetof(SlabChunk, slab) + sizeof(MemoryContext) == + sizeof(SlabChunk), + "padding calculation in SlabChunk is wrong"); + + /* Make sure the linked list node fits inside a freed chunk */ + if (chunkSize < sizeof(int)) + chunkSize = sizeof(int); + + /* chunk, including SLAB header (both addresses nicely aligned) */ + fullChunkSize = sizeof(SlabChunk) + MAXALIGN(chunkSize); + + /* Make sure the block can store at least one chunk. */ + if (blockSize < fullChunkSize + sizeof(SlabBlock)) + elog(ERROR, "block size %zu for slab is too small for %zu chunks", + blockSize, chunkSize); + + /* Compute maximum number of chunks per block */ + chunksPerBlock = (blockSize - sizeof(SlabBlock)) / fullChunkSize; + + /* The freelist starts with 0, ends with chunksPerBlock. */ + freelistSize = sizeof(dlist_head) * (chunksPerBlock + 1); + + /* + * Allocate the context header. Unlike aset.c, we never try to combine + * this with the first regular block; not worth the extra complication. + */ + + /* Size of the memory context header */ + headerSize = offsetof(SlabContext, freelist) + freelistSize; + +#ifdef MEMORY_CONTEXT_CHECKING + + /* + * With memory checking, we need to allocate extra space for the bitmap of + * free chunks. 
The bitmap is an array of bools, so we don't need to worry + * about alignment. + */ + headerSize += chunksPerBlock * sizeof(bool); +#endif + + slab = (SlabContext *) malloc(headerSize); + if (slab == NULL) + { + MemoryContextStats(TopMemoryContext); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed while creating memory context \"%s\".", + name))); + } + + /* + * Avoid writing code that can fail between here and MemoryContextCreate; + * we'd leak the header if we ereport in this stretch. + */ + + /* Fill in SlabContext-specific header fields */ + slab->chunkSize = chunkSize; + slab->fullChunkSize = fullChunkSize; + slab->blockSize = blockSize; + slab->headerSize = headerSize; + slab->chunksPerBlock = chunksPerBlock; + slab->minFreeChunks = 0; + slab->nblocks = 0; + + /* initialize the freelist slots */ + for (i = 0; i < (slab->chunksPerBlock + 1); i++) + dlist_init(&slab->freelist[i]); + +#ifdef MEMORY_CONTEXT_CHECKING + /* set the freechunks pointer right after the freelists array */ + slab->freechunks + = (bool *) slab + offsetof(SlabContext, freelist) + freelistSize; +#endif + + /* Finally, do the type-independent part of context creation */ + MemoryContextCreate((MemoryContext) slab, + T_SlabContext, + &SlabMethods, + parent, + name); + + return (MemoryContext) slab; +} + +/* + * SlabReset + * Frees all memory which is allocated in the given set. + * + * The code simply frees all the blocks in the context - we don't keep any + * keeper blocks or anything like that. + */ +static void +SlabReset(MemoryContext context) +{ + int i; + SlabContext *slab = castNode(SlabContext, context); + + Assert(slab); + +#ifdef MEMORY_CONTEXT_CHECKING + /* Check for corruption and leaks before freeing */ + SlabCheck(context); +#endif + + /* walk over freelists and free the blocks */ + for (i = 0; i <= slab->chunksPerBlock; i++) + { + dlist_mutable_iter miter; + + dlist_foreach_modify(miter, &slab->freelist[i]) + { + SlabBlock *block = dlist_container(SlabBlock, node, miter.cur); + + dlist_delete(miter.cur); + +#ifdef CLOBBER_FREED_MEMORY + wipe_mem(block, slab->blockSize); +#endif + free(block); + slab->nblocks--; + context->mem_allocated -= slab->blockSize; + } + } + + slab->minFreeChunks = 0; + + Assert(slab->nblocks == 0); + Assert(context->mem_allocated == 0); +} + +/* + * SlabDelete + * Free all memory which is allocated in the given context. + */ +static void +SlabDelete(MemoryContext context) +{ + /* Reset to release all the SlabBlocks */ + SlabReset(context); + /* And free the context header */ + free(context); +} + +/* + * SlabAlloc + * Returns pointer to allocated memory of given size or NULL if + * request could not be completed; memory is added to the slab. + */ +static void * +SlabAlloc(MemoryContext context, Size size) +{ + SlabContext *slab = castNode(SlabContext, context); + SlabBlock *block; + SlabChunk *chunk; + int idx; + + Assert(slab); + + Assert((slab->minFreeChunks >= 0) && + (slab->minFreeChunks < slab->chunksPerBlock)); + + /* make sure we only allow correct request size */ + if (size != slab->chunkSize) + elog(ERROR, "unexpected alloc chunk size %zu (expected %zu)", + size, slab->chunkSize); + + /* + * If there are no free chunks in any existing block, create a new block + * and put it to the last freelist bucket. + * + * slab->minFreeChunks == 0 means there are no blocks with free chunks, + * thanks to how minFreeChunks is updated at the end of SlabAlloc(). 
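+     *
+     * (Added illustration, not upstream text: with chunksPerBlock == 64, a
+     * brand-new block starts out in freelist[64]; once the code below hands
+     * out one chunk, the block moves to freelist[63] and minFreeChunks
+     * becomes 63.)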
+ */ + if (slab->minFreeChunks == 0) + { + block = (SlabBlock *) malloc(slab->blockSize); + + if (block == NULL) + return NULL; + + block->nfree = slab->chunksPerBlock; + block->firstFreeChunk = 0; + + /* + * Put all the chunks on a freelist. Walk the chunks and point each + * one to the next one. + */ + for (idx = 0; idx < slab->chunksPerBlock; idx++) + { + chunk = SlabBlockGetChunk(slab, block, idx); + *(int32 *) SlabChunkGetPointer(chunk) = (idx + 1); + } + + /* + * And add it to the last freelist with all chunks empty. + * + * We know there are no blocks in the freelist, otherwise we wouldn't + * need a new block. + */ + Assert(dlist_is_empty(&slab->freelist[slab->chunksPerBlock])); + + dlist_push_head(&slab->freelist[slab->chunksPerBlock], &block->node); + + slab->minFreeChunks = slab->chunksPerBlock; + slab->nblocks += 1; + context->mem_allocated += slab->blockSize; + } + + /* grab the block from the freelist (even the new block is there) */ + block = dlist_head_element(SlabBlock, node, + &slab->freelist[slab->minFreeChunks]); + + /* make sure we actually got a valid block, with matching nfree */ + Assert(block != NULL); + Assert(slab->minFreeChunks == block->nfree); + Assert(block->nfree > 0); + + /* we know index of the first free chunk in the block */ + idx = block->firstFreeChunk; + + /* make sure the chunk index is valid, and that it's marked as empty */ + Assert((idx >= 0) && (idx < slab->chunksPerBlock)); + + /* compute the chunk location block start (after the block header) */ + chunk = SlabBlockGetChunk(slab, block, idx); + + /* + * Update the block nfree count, and also the minFreeChunks as we've + * decreased nfree for a block with the minimum number of free chunks + * (because that's how we chose the block). + */ + block->nfree--; + slab->minFreeChunks = block->nfree; + + /* + * Remove the chunk from the freelist head. The index of the next free + * chunk is stored in the chunk itself. + */ + VALGRIND_MAKE_MEM_DEFINED(SlabChunkGetPointer(chunk), sizeof(int32)); + block->firstFreeChunk = *(int32 *) SlabChunkGetPointer(chunk); + + Assert(block->firstFreeChunk >= 0); + Assert(block->firstFreeChunk <= slab->chunksPerBlock); + + Assert((block->nfree != 0 && + block->firstFreeChunk < slab->chunksPerBlock) || + (block->nfree == 0 && + block->firstFreeChunk == slab->chunksPerBlock)); + + /* move the whole block to the right place in the freelist */ + dlist_delete(&block->node); + dlist_push_head(&slab->freelist[block->nfree], &block->node); + + /* + * And finally update minFreeChunks, i.e. the index to the block with the + * lowest number of free chunks. We only need to do that when the block + * got full (otherwise we know the current block is the right one). We'll + * simply walk the freelist until we find a non-empty entry. + */ + if (slab->minFreeChunks == 0) + { + for (idx = 1; idx <= slab->chunksPerBlock; idx++) + { + if (dlist_is_empty(&slab->freelist[idx])) + continue; + + /* found a non-empty freelist */ + slab->minFreeChunks = idx; + break; + } + } + + if (slab->minFreeChunks == slab->chunksPerBlock) + slab->minFreeChunks = 0; + + /* Prepare to initialize the chunk header. 
*/ + VALGRIND_MAKE_MEM_UNDEFINED(chunk, sizeof(SlabChunk)); + + chunk->block = block; + chunk->slab = slab; + +#ifdef MEMORY_CONTEXT_CHECKING + /* slab mark to catch clobber of "unused" space */ + if (slab->chunkSize < (slab->fullChunkSize - sizeof(SlabChunk))) + { + set_sentinel(SlabChunkGetPointer(chunk), size); + VALGRIND_MAKE_MEM_NOACCESS(((char *) chunk) + + sizeof(SlabChunk) + slab->chunkSize, + slab->fullChunkSize - + (slab->chunkSize + sizeof(SlabChunk))); + } +#endif +#ifdef RANDOMIZE_ALLOCATED_MEMORY + /* fill the allocated space with junk */ + randomize_mem((char *) SlabChunkGetPointer(chunk), size); +#endif + + Assert(slab->nblocks * slab->blockSize == context->mem_allocated); + + return SlabChunkGetPointer(chunk); +} + +/* + * SlabFree + * Frees allocated memory; memory is removed from the slab. + */ +static void +SlabFree(MemoryContext context, void *pointer) +{ + int idx; + SlabContext *slab = castNode(SlabContext, context); + SlabChunk *chunk = SlabPointerGetChunk(pointer); + SlabBlock *block = chunk->block; + +#ifdef MEMORY_CONTEXT_CHECKING + /* Test for someone scribbling on unused space in chunk */ + if (slab->chunkSize < (slab->fullChunkSize - sizeof(SlabChunk))) + if (!sentinel_ok(pointer, slab->chunkSize)) + elog(WARNING, "detected write past chunk end in %s %p", + slab->header.name, chunk); +#endif + + /* compute index of the chunk with respect to block start */ + idx = SlabChunkIndex(slab, block, chunk); + + /* add chunk to freelist, and update block nfree count */ + *(int32 *) pointer = block->firstFreeChunk; + block->firstFreeChunk = idx; + block->nfree++; + + Assert(block->nfree > 0); + Assert(block->nfree <= slab->chunksPerBlock); + +#ifdef CLOBBER_FREED_MEMORY + /* XXX don't wipe the int32 index, used for block-level freelist */ + wipe_mem((char *) pointer + sizeof(int32), + slab->chunkSize - sizeof(int32)); +#endif + + /* remove the block from a freelist */ + dlist_delete(&block->node); + + /* + * See if we need to update the minFreeChunks field for the slab - we only + * need to do that if there the block had that number of free chunks + * before we freed one. In that case, we check if there still are blocks + * in the original freelist and we either keep the current value (if there + * still are blocks) or increment it by one (the new block is still the + * one with minimum free chunks). + * + * The one exception is when the block will get completely free - in that + * case we will free it, se we can't use it for minFreeChunks. It however + * means there are no more blocks with free chunks. + */ + if (slab->minFreeChunks == (block->nfree - 1)) + { + /* Have we removed the last chunk from the freelist? */ + if (dlist_is_empty(&slab->freelist[slab->minFreeChunks])) + { + /* but if we made the block entirely free, we'll free it */ + if (block->nfree == slab->chunksPerBlock) + slab->minFreeChunks = 0; + else + slab->minFreeChunks++; + } + } + + /* If the block is now completely empty, free it. */ + if (block->nfree == slab->chunksPerBlock) + { + free(block); + slab->nblocks--; + context->mem_allocated -= slab->blockSize; + } + else + dlist_push_head(&slab->freelist[block->nfree], &block->node); + + Assert(slab->nblocks >= 0); + Assert(slab->nblocks * slab->blockSize == context->mem_allocated); +} + +/* + * SlabRealloc + * Change the allocated size of a chunk. + * + * As Slab is designed for allocating equally-sized chunks of memory, it can't + * do an actual chunk size change. 
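+ * (Added note, not upstream text: in practice, repalloc(ptr, size) on a slab
+ * chunk is a no-op returning ptr when size equals the context's chunkSize,
+ * and raises an error for any other size, as spelled out next.)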
We try to be gentle and allow calls with + * exactly the same size, as in that case we can simply return the same + * chunk. When the size differs, we throw an error. + * + * We could also allow requests with size < chunkSize. That however seems + * rather pointless - Slab is meant for chunks of constant size, and moreover + * realloc is usually used to enlarge the chunk. + */ +static void * +SlabRealloc(MemoryContext context, void *pointer, Size size) +{ + SlabContext *slab = castNode(SlabContext, context); + + Assert(slab); + + /* can't do actual realloc with slab, but let's try to be gentle */ + if (size == slab->chunkSize) + return pointer; + + elog(ERROR, "slab allocator does not support realloc()"); + return NULL; /* keep compiler quiet */ +} + +/* + * SlabGetChunkSpace + * Given a currently-allocated chunk, determine the total space + * it occupies (including all memory-allocation overhead). + */ +static Size +SlabGetChunkSpace(MemoryContext context, void *pointer) +{ + SlabContext *slab = castNode(SlabContext, context); + + Assert(slab); + + return slab->fullChunkSize; +} + +/* + * SlabIsEmpty + * Is an Slab empty of any allocated space? + */ +static bool +SlabIsEmpty(MemoryContext context) +{ + SlabContext *slab = castNode(SlabContext, context); + + Assert(slab); + + return (slab->nblocks == 0); +} + +/* + * SlabStats + * Compute stats about memory consumption of a Slab context. + * + * printfunc: if not NULL, pass a human-readable stats string to this. + * passthru: pass this pointer through to printfunc. + * totals: if not NULL, add stats about this context into *totals. + * print_to_stderr: print stats to stderr if true, elog otherwise. + */ +static void +SlabStats(MemoryContext context, + MemoryStatsPrintFunc printfunc, void *passthru, + MemoryContextCounters *totals, + bool print_to_stderr) +{ + SlabContext *slab = castNode(SlabContext, context); + Size nblocks = 0; + Size freechunks = 0; + Size totalspace; + Size freespace = 0; + int i; + + /* Include context header in totalspace */ + totalspace = slab->headerSize; + + for (i = 0; i <= slab->chunksPerBlock; i++) + { + dlist_iter iter; + + dlist_foreach(iter, &slab->freelist[i]) + { + SlabBlock *block = dlist_container(SlabBlock, node, iter.cur); + + nblocks++; + totalspace += slab->blockSize; + freespace += slab->fullChunkSize * block->nfree; + freechunks += block->nfree; + } + } + + if (printfunc) + { + char stats_string[200]; + + snprintf(stats_string, sizeof(stats_string), + "%zu total in %zd blocks; %zu free (%zd chunks); %zu used", + totalspace, nblocks, freespace, freechunks, + totalspace - freespace); + printfunc(context, passthru, stats_string, print_to_stderr); + } + + if (totals) + { + totals->nblocks += nblocks; + totals->freechunks += freechunks; + totals->totalspace += totalspace; + totals->freespace += freespace; + } +} + + +#ifdef MEMORY_CONTEXT_CHECKING + +/* + * SlabCheck + * Walk through chunks and check consistency of memory. + * + * NOTE: report errors as WARNING, *not* ERROR or FATAL. Otherwise you'll + * find yourself in an infinite loop when trouble occurs, because this + * routine will be entered again when elog cleanup tries to release memory! 
+ */
+static void
+SlabCheck(MemoryContext context)
+{
+    int         i;
+    SlabContext *slab = castNode(SlabContext, context);
+    const char *name = slab->header.name;
+
+    Assert(slab);
+    Assert(slab->chunksPerBlock > 0);
+
+    /* walk all the freelists */
+    for (i = 0; i <= slab->chunksPerBlock; i++)
+    {
+        int         j,
+                    nfree;
+        dlist_iter  iter;
+
+        /* walk all blocks on this freelist */
+        dlist_foreach(iter, &slab->freelist[i])
+        {
+            int         idx;
+            SlabBlock  *block = dlist_container(SlabBlock, node, iter.cur);
+
+            /*
+             * Make sure the number of free chunks (in the block header)
+             * matches position in the freelist.
+             */
+            if (block->nfree != i)
+                elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match freelist %d",
+                     name, block->nfree, block, i);
+
+            /* reset the bitmap of free chunks for this block */
+            memset(slab->freechunks, 0, (slab->chunksPerBlock * sizeof(bool)));
+            idx = block->firstFreeChunk;
+
+            /*
+             * Now walk through the chunks, count the free ones and also
+             * perform some additional checks for the used ones. As the chunk
+             * freelist is stored within the chunks themselves, we have to
+             * walk through the chunks and construct our own bitmap.
+             */
+
+            nfree = 0;
+            while (idx < slab->chunksPerBlock)
+            {
+                SlabChunk  *chunk;
+
+                /* count the chunk as free, add it to the bitmap */
+                nfree++;
+                slab->freechunks[idx] = true;
+
+                /* read index of the next free chunk */
+                chunk = SlabBlockGetChunk(slab, block, idx);
+                VALGRIND_MAKE_MEM_DEFINED(SlabChunkGetPointer(chunk), sizeof(int32));
+                idx = *(int32 *) SlabChunkGetPointer(chunk);
+            }
+
+            for (j = 0; j < slab->chunksPerBlock; j++)
+            {
+                /* a zero (false) entry in the bitmap means the chunk is in use */
+                if (!slab->freechunks[j])
+                {
+                    SlabChunk  *chunk = SlabBlockGetChunk(slab, block, j);
+
+                    /* chunks have both block and slab pointers, so check both */
+                    if (chunk->block != block)
+                        elog(WARNING, "problem in slab %s: bogus block link in block %p, chunk %p",
+                             name, block, chunk);
+
+                    if (chunk->slab != slab)
+                        elog(WARNING, "problem in slab %s: bogus slab link in block %p, chunk %p",
+                             name, block, chunk);
+
+                    /* there might be a sentinel (thanks to alignment) */
+                    if (slab->chunkSize < (slab->fullChunkSize - sizeof(SlabChunk)))
+                        if (!sentinel_ok(chunk, slab->chunkSize))
+                            elog(WARNING, "problem in slab %s: detected write past chunk end in block %p, chunk %p",
+                                 name, block, chunk);
+                }
+            }
+
+            /*
+             * Make sure we got the expected number of free chunks (as tracked
+             * in the block header).
+             */
+            if (nfree != block->nfree)
+                elog(WARNING, "problem in slab %s: number of free chunks %d in block %p does not match bitmap %d",
+                     name, block->nfree, block, nfree);
+        }
+    }
+
+    Assert(slab->nblocks * slab->blockSize == context->mem_allocated);
+}
+
+#endif /* MEMORY_CONTEXT_CHECKING */
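Addendum (not part of the upstream patch): a minimal usage sketch of the slab
allocator defined above. The MyNode type, the function name and the choice of
CurrentMemoryContext as parent are assumptions made purely for illustration;
the calls themselves (SlabContextCreate, MemoryContextAlloc, pfree,
MemoryContextDelete) and the SLAB_DEFAULT_BLOCK_SIZE constant come from the
standard memory-context headers (utils/memutils.h, utils/palloc.h).

    #include "postgres.h"
    #include "utils/memutils.h"

    /* hypothetical fixed-size object to be allocated in bulk */
    typedef struct MyNode
    {
        int            id;
        struct MyNode *next;
    } MyNode;

    static void
    slab_usage_example(void)
    {
        /* every chunk handed out by this context is exactly sizeof(MyNode) */
        MemoryContext slab = SlabContextCreate(CurrentMemoryContext,
                                               "MyNodeSlab",
                                               SLAB_DEFAULT_BLOCK_SIZE,
                                               sizeof(MyNode));
        MyNode     *n = (MyNode *) MemoryContextAlloc(slab, sizeof(MyNode));

        n->id = 42;
        n->next = NULL;

        pfree(n);                   /* chunk returns to its block's freelist */
        MemoryContextDelete(slab);  /* frees all blocks and the header itself */
    }

Requesting any size other than sizeof(MyNode) from this context would raise an
error, since SlabAlloc insists that each request match the configured chunk
size exactly.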