diff options
Diffstat (limited to 'src/backend/storage/ipc/dsm_impl.c')
-rw-r--r-- | src/backend/storage/ipc/dsm_impl.c | 1040 |
1 files changed, 1040 insertions, 0 deletions
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c new file mode 100644 index 0000000..1972aec --- /dev/null +++ b/src/backend/storage/ipc/dsm_impl.c @@ -0,0 +1,1040 @@ +/*------------------------------------------------------------------------- + * + * dsm_impl.c + * manage dynamic shared memory segments + * + * This file provides low-level APIs for creating and destroying shared + * memory segments using several different possible techniques. We refer + * to these segments as dynamic because they can be created, altered, and + * destroyed at any point during the server life cycle. This is unlike + * the main shared memory segment, of which there is always exactly one + * and which is always mapped at a fixed address in every PostgreSQL + * background process. + * + * Because not all systems provide the same primitives in this area, nor + * do all primitives behave the same way on all systems, we provide + * several implementations of this facility. Many systems implement + * POSIX shared memory (shm_open etc.), which is well-suited to our needs + * in this area, with the exception that shared memory identifiers live + * in a flat system-wide namespace, raising the uncomfortable prospect of + * name collisions with other processes (including other copies of + * PostgreSQL) running on the same system. Some systems only support + * the older System V shared memory interface (shmget etc.) which is + * also usable; however, the default allocation limits are often quite + * small, and the namespace is even more restricted. + * + * We also provide an mmap-based shared memory implementation. This may + * be useful on systems that provide shared memory via a special-purpose + * filesystem; by opting for this implementation, the user can even + * control precisely where their shared memory segments are placed. It + * can also be used as a fallback for systems where shm_open and shmget + * are not available or can't be used for some reason. Of course, + * mapping a file residing on an actual spinning disk is a fairly poor + * approximation for shared memory because writeback may hurt performance + * substantially, but there should be few systems where we must make do + * with such poor tools. + * + * As ever, Windows requires its own implementation. + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/dsm_impl.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <fcntl.h> +#include <unistd.h> +#ifndef WIN32 +#include <sys/mman.h> +#endif +#include <sys/stat.h> +#ifdef HAVE_SYS_IPC_H +#include <sys/ipc.h> +#endif +#ifdef HAVE_SYS_SHM_H +#include <sys/shm.h> +#endif + +#include "common/file_perm.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "portability/mem.h" +#include "postmaster/postmaster.h" +#include "storage/dsm_impl.h" +#include "storage/fd.h" +#include "utils/guc.h" +#include "utils/memutils.h" + +#ifdef USE_DSM_POSIX +static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel); +static int dsm_impl_posix_resize(int fd, off_t size); +#endif +#ifdef USE_DSM_SYSV +static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel); +#endif +#ifdef USE_DSM_WINDOWS +static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel); +#endif +#ifdef USE_DSM_MMAP +static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel); +#endif +static int errcode_for_dynamic_shared_memory(void); + +const struct config_enum_entry dynamic_shared_memory_options[] = { +#ifdef USE_DSM_POSIX + {"posix", DSM_IMPL_POSIX, false}, +#endif +#ifdef USE_DSM_SYSV + {"sysv", DSM_IMPL_SYSV, false}, +#endif +#ifdef USE_DSM_WINDOWS + {"windows", DSM_IMPL_WINDOWS, false}, +#endif +#ifdef USE_DSM_MMAP + {"mmap", DSM_IMPL_MMAP, false}, +#endif + {NULL, 0, false} +}; + +/* Implementation selector. */ +int dynamic_shared_memory_type; + +/* Size of buffer to be used for zero-filling. */ +#define ZBUFFER_SIZE 8192 + +#define SEGMENT_NAME_PREFIX "Global/PostgreSQL" + +/*------ + * Perform a low-level shared memory operation in a platform-specific way, + * as dictated by the selected implementation. Each implementation is + * required to implement the following primitives. + * + * DSM_OP_CREATE. Create a segment whose size is the request_size and + * map it. + * + * DSM_OP_ATTACH. Map the segment, whose size must be the request_size. + * + * DSM_OP_DETACH. Unmap the segment. + * + * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the + * segment. + * + * Arguments: + * op: The operation to be performed. + * handle: The handle of an existing object, or for DSM_OP_CREATE, the + * a new handle the caller wants created. + * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0. + * impl_private: Private, implementation-specific data. Will be a pointer + * to NULL for the first operation on a shared memory segment within this + * backend; thereafter, it will point to the value to which it was set + * on the previous call. + * mapped_address: Pointer to start of current mapping; pointer to NULL + * if none. Updated with new mapping address. + * mapped_size: Pointer to size of current mapping; pointer to 0 if none. + * Updated with new mapped size. + * elevel: Level at which to log errors. + * + * Return value: true on success, false on failure. When false is returned, + * a message should first be logged at the specified elevel, except in the + * case where DSM_OP_CREATE experiences a name collision, which should + * silently return false. + *----- + */ +bool +dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, Size *mapped_size, + int elevel) +{ + Assert(op == DSM_OP_CREATE || request_size == 0); + Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) || + (*mapped_address == NULL && *mapped_size == 0)); + + switch (dynamic_shared_memory_type) + { +#ifdef USE_DSM_POSIX + case DSM_IMPL_POSIX: + return dsm_impl_posix(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif +#ifdef USE_DSM_SYSV + case DSM_IMPL_SYSV: + return dsm_impl_sysv(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif +#ifdef USE_DSM_WINDOWS + case DSM_IMPL_WINDOWS: + return dsm_impl_windows(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif +#ifdef USE_DSM_MMAP + case DSM_IMPL_MMAP: + return dsm_impl_mmap(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif + default: + elog(ERROR, "unexpected dynamic shared memory type: %d", + dynamic_shared_memory_type); + return false; + } +} + +#ifdef USE_DSM_POSIX +/* + * Operating system primitives to support POSIX shared memory. + * + * POSIX shared memory segments are created and attached using shm_open() + * and shm_unlink(); other operations, such as sizing or mapping the + * segment, are performed as if the shared memory segments were files. + * + * Indeed, on some platforms, they may be implemented that way. While + * POSIX shared memory segments seem intended to exist in a flat namespace, + * some operating systems may implement them as files, even going so far + * to treat a request for /xyz as a request to create a file by that name + * in the root directory. Users of such broken platforms should select + * a different shared memory implementation. + */ +static bool +dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, Size *mapped_size, + int elevel) +{ + char name[64]; + int flags; + int fd; + char *address; + + snprintf(name, 64, "/PostgreSQL.%u", handle); + + /* Handle teardown cases. */ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + if (*mapped_address != NULL + && munmap(*mapped_address, *mapped_size) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = NULL; + *mapped_size = 0; + if (op == DSM_OP_DESTROY && shm_unlink(name) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + return true; + } + + /* + * Create new segment or open an existing one for attach. + * + * Even though we will close the FD before returning, it seems desirable + * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE + * failure. The fact that we won't hold the FD open long justifies using + * ReserveExternalFD rather than AcquireExternalFD, though. + */ + ReserveExternalFD(); + + flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0); + if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1) + { + ReleaseExternalFD(); + if (errno != EEXIST) + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not open shared memory segment \"%s\": %m", + name))); + return false; + } + + /* + * If we're attaching the segment, determine the current size; if we are + * creating the segment, set the size to the requested value. + */ + if (op == DSM_OP_ATTACH) + { + struct stat st; + + if (fstat(fd, &st) != 0) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + close(fd); + ReleaseExternalFD(); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + request_size = st.st_size; + } + else if (dsm_impl_posix_resize(fd, request_size) != 0) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + close(fd); + ReleaseExternalFD(); + shm_unlink(name); + errno = save_errno; + + /* + * If we received a query cancel or termination signal, we will have + * EINTR set here. If the caller said that errors are OK here, check + * for interrupts immediately. + */ + if (errno == EINTR && elevel >= ERROR) + CHECK_FOR_INTERRUPTS(); + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m", + name, request_size))); + return false; + } + + /* Map it. */ + address = mmap(NULL, request_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0); + if (address == MAP_FAILED) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + close(fd); + ReleaseExternalFD(); + if (op == DSM_OP_CREATE) + shm_unlink(name); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = address; + *mapped_size = request_size; + close(fd); + ReleaseExternalFD(); + + return true; +} + +/* + * Set the size of a virtual memory region associated with a file descriptor. + * If necessary, also ensure that virtual memory is actually allocated by the + * operating system, to avoid nasty surprises later. + * + * Returns non-zero if either truncation or allocation fails, and sets errno. + */ +static int +dsm_impl_posix_resize(int fd, off_t size) +{ + int rc; + + /* Truncate (or extend) the file to the requested size. */ + rc = ftruncate(fd, size); + + /* + * On Linux, a shm_open fd is backed by a tmpfs file. After resizing with + * ftruncate, the file may contain a hole. Accessing memory backed by a + * hole causes tmpfs to allocate pages, which fails with SIGBUS if there + * is no more tmpfs space available. So we ask tmpfs to allocate pages + * here, so we can fail gracefully with ENOSPC now rather than risking + * SIGBUS later. + */ +#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__) + if (rc == 0) + { + /* + * We may get interrupted. If so, just retry unless there is an + * interrupt pending. This avoids the possibility of looping forever + * if another backend is repeatedly trying to interrupt us. + */ + pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE); + do + { + rc = posix_fallocate(fd, 0, size); + } while (rc == EINTR && !(ProcDiePending || QueryCancelPending)); + pgstat_report_wait_end(); + + /* + * The caller expects errno to be set, but posix_fallocate() doesn't + * set it. Instead it returns error numbers directly. So set errno, + * even though we'll also return rc to indicate success or failure. + */ + errno = rc; + } +#endif /* HAVE_POSIX_FALLOCATE && __linux__ */ + + return rc; +} + +#endif /* USE_DSM_POSIX */ + +#ifdef USE_DSM_SYSV +/* + * Operating system primitives to support System V shared memory. + * + * System V shared memory segments are manipulated using shmget(), shmat(), + * shmdt(), and shmctl(). As the default allocation limits for System V + * shared memory are usually quite low, the POSIX facilities may be + * preferable; but those are not supported everywhere. + */ +static bool +dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, Size *mapped_size, + int elevel) +{ + key_t key; + int ident; + char *address; + char name[64]; + int *ident_cache; + + /* + * POSIX shared memory and mmap-based shared memory identify segments with + * names. To avoid needless error message variation, we use the handle as + * the name. + */ + snprintf(name, 64, "%u", handle); + + /* + * The System V shared memory namespace is very restricted; names are of + * type key_t, which is expected to be some sort of integer data type, but + * not necessarily the same one as dsm_handle. Since we use dsm_handle to + * identify shared memory segments across processes, this might seem like + * a problem, but it's really not. If dsm_handle is bigger than key_t, + * the cast below might truncate away some bits from the handle the + * user-provided, but it'll truncate exactly the same bits away in exactly + * the same fashion every time we use that handle, which is all that + * really matters. Conversely, if dsm_handle is smaller than key_t, we + * won't use the full range of available key space, but that's no big deal + * either. + * + * We do make sure that the key isn't negative, because that might not be + * portable. + */ + key = (key_t) handle; + if (key < 1) /* avoid compiler warning if type is unsigned */ + key = -key; + + /* + * There's one special key, IPC_PRIVATE, which can't be used. If we end + * up with that value by chance during a create operation, just pretend it + * already exists, so that caller will retry. If we run into it anywhere + * else, the caller has passed a handle that doesn't correspond to + * anything we ever created, which should not happen. + */ + if (key == IPC_PRIVATE) + { + if (op != DSM_OP_CREATE) + elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE"); + errno = EEXIST; + return false; + } + + /* + * Before we can do anything with a shared memory segment, we have to map + * the shared memory key to a shared memory identifier using shmget(). To + * avoid repeated lookups, we store the key using impl_private. + */ + if (*impl_private != NULL) + { + ident_cache = *impl_private; + ident = *ident_cache; + } + else + { + int flags = IPCProtection; + size_t segsize; + + /* + * Allocate the memory BEFORE acquiring the resource, so that we don't + * leak the resource if memory allocation fails. + */ + ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int)); + + /* + * When using shmget to find an existing segment, we must pass the + * size as 0. Passing a non-zero size which is greater than the + * actual size will result in EINVAL. + */ + segsize = 0; + + if (op == DSM_OP_CREATE) + { + flags |= IPC_CREAT | IPC_EXCL; + segsize = request_size; + } + + if ((ident = shmget(key, segsize, flags)) == -1) + { + if (errno != EEXIST) + { + int save_errno = errno; + + pfree(ident_cache); + errno = save_errno; + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not get shared memory segment: %m"))); + } + return false; + } + + *ident_cache = ident; + *impl_private = ident_cache; + } + + /* Handle teardown cases. */ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + pfree(ident_cache); + *impl_private = NULL; + if (*mapped_address != NULL && shmdt(*mapped_address) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = NULL; + *mapped_size = 0; + if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + return true; + } + + /* If we're attaching it, we must use IPC_STAT to determine the size. */ + if (op == DSM_OP_ATTACH) + { + struct shmid_ds shm; + + if (shmctl(ident, IPC_STAT, &shm) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + request_size = shm.shm_segsz; + } + + /* Map it. */ + address = shmat(ident, NULL, PG_SHMAT_FLAGS); + if (address == (void *) -1) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + if (op == DSM_OP_CREATE) + shmctl(ident, IPC_RMID, NULL); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = address; + *mapped_size = request_size; + + return true; +} +#endif + +#ifdef USE_DSM_WINDOWS +/* + * Operating system primitives to support Windows shared memory. + * + * Windows shared memory implementation is done using file mapping + * which can be backed by either physical file or system paging file. + * Current implementation uses system paging file as other effects + * like performance are not clear for physical file and it is used in similar + * way for main shared memory in windows. + * + * A memory mapping object is a kernel object - they always get deleted when + * the last reference to them goes away, either explicitly via a CloseHandle or + * when the process containing the reference exits. + */ +static bool +dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel) +{ + char *address; + HANDLE hmap; + char name[64]; + MEMORY_BASIC_INFORMATION info; + + /* + * Storing the shared memory segment in the Global\ namespace, can allow + * any process running in any session to access that file mapping object + * provided that the caller has the required access rights. But to avoid + * issues faced in main shared memory, we are using the naming convention + * similar to main shared memory. We can change here once issue mentioned + * in GetSharedMemName is resolved. + */ + snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle); + + /* + * Handle teardown cases. Since Windows automatically destroys the object + * when no references remain, we can treat it the same as detach. + */ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + if (*mapped_address != NULL + && UnmapViewOfFile(*mapped_address) == 0) + { + _dosmaperr(GetLastError()); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + if (*impl_private != NULL + && CloseHandle(*impl_private) == 0) + { + _dosmaperr(GetLastError()); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + + *impl_private = NULL; + *mapped_address = NULL; + *mapped_size = 0; + return true; + } + + /* Create new segment or open an existing one for attach. */ + if (op == DSM_OP_CREATE) + { + DWORD size_high; + DWORD size_low; + DWORD errcode; + + /* Shifts >= the width of the type are undefined. */ +#ifdef _WIN64 + size_high = request_size >> 32; +#else + size_high = 0; +#endif + size_low = (DWORD) request_size; + + /* CreateFileMapping might not clear the error code on success */ + SetLastError(0); + + hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */ + NULL, /* Default security attrs */ + PAGE_READWRITE, /* Memory is read/write */ + size_high, /* Upper 32 bits of size */ + size_low, /* Lower 32 bits of size */ + name); + + errcode = GetLastError(); + if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED) + { + /* + * On Windows, when the segment already exists, a handle for the + * existing segment is returned. We must close it before + * returning. However, if the existing segment is created by a + * service, then it returns ERROR_ACCESS_DENIED. We don't do + * _dosmaperr here, so errno won't be modified. + */ + if (hmap) + CloseHandle(hmap); + return false; + } + + if (!hmap) + { + _dosmaperr(errcode); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not create shared memory segment \"%s\": %m", + name))); + return false; + } + } + else + { + hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ, + FALSE, /* do not inherit the name */ + name); /* name of mapping object */ + if (!hmap) + { + _dosmaperr(GetLastError()); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not open shared memory segment \"%s\": %m", + name))); + return false; + } + } + + /* Map it. */ + address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ, + 0, 0, 0); + if (!address) + { + int save_errno; + + _dosmaperr(GetLastError()); + /* Back out what's already been done. */ + save_errno = errno; + CloseHandle(hmap); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + + /* + * VirtualQuery gives size in page_size units, which is 4K for Windows. We + * need size only when we are attaching, but it's better to get the size + * when creating new segment to keep size consistent both for + * DSM_OP_CREATE and DSM_OP_ATTACH. + */ + if (VirtualQuery(address, &info, sizeof(info)) == 0) + { + int save_errno; + + _dosmaperr(GetLastError()); + /* Back out what's already been done. */ + save_errno = errno; + UnmapViewOfFile(address); + CloseHandle(hmap); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + + *mapped_address = address; + *mapped_size = info.RegionSize; + *impl_private = hmap; + + return true; +} +#endif + +#ifdef USE_DSM_MMAP +/* + * Operating system primitives to support mmap-based shared memory. + * + * Calling this "shared memory" is somewhat of a misnomer, because what + * we're really doing is creating a bunch of files and mapping them into + * our address space. The operating system may feel obliged to + * synchronize the contents to disk even if nothing is being paged out, + * which will not serve us well. The user can relocate the pg_dynshmem + * directory to a ramdisk to avoid this problem, if available. + */ +static bool +dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, Size *mapped_size, + int elevel) +{ + char name[64]; + int flags; + int fd; + char *address; + + snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u", + handle); + + /* Handle teardown cases. */ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + if (*mapped_address != NULL + && munmap(*mapped_address, *mapped_size) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = NULL; + *mapped_size = 0; + if (op == DSM_OP_DESTROY && unlink(name) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + return true; + } + + /* Create new segment or open an existing one for attach. */ + flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0); + if ((fd = OpenTransientFile(name, flags)) == -1) + { + if (errno != EEXIST) + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not open shared memory segment \"%s\": %m", + name))); + return false; + } + + /* + * If we're attaching the segment, determine the current size; if we are + * creating the segment, set the size to the requested value. + */ + if (op == DSM_OP_ATTACH) + { + struct stat st; + + if (fstat(fd, &st) != 0) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + CloseTransientFile(fd); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + request_size = st.st_size; + } + else + { + /* + * Allocate a buffer full of zeros. + * + * Note: palloc zbuffer, instead of just using a local char array, to + * ensure it is reasonably well-aligned; this may save a few cycles + * transferring data to the kernel. + */ + char *zbuffer = (char *) palloc0(ZBUFFER_SIZE); + uint32 remaining = request_size; + bool success = true; + + /* + * Zero-fill the file. We have to do this the hard way to ensure that + * all the file space has really been allocated, so that we don't + * later seg fault when accessing the memory mapping. This is pretty + * pessimal. + */ + while (success && remaining > 0) + { + Size goal = remaining; + + if (goal > ZBUFFER_SIZE) + goal = ZBUFFER_SIZE; + pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE); + if (write(fd, zbuffer, goal) == goal) + remaining -= goal; + else + success = false; + pgstat_report_wait_end(); + } + + if (!success) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + CloseTransientFile(fd); + unlink(name); + errno = save_errno ? save_errno : ENOSPC; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m", + name, request_size))); + return false; + } + } + + /* Map it. */ + address = mmap(NULL, request_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0); + if (address == MAP_FAILED) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + CloseTransientFile(fd); + if (op == DSM_OP_CREATE) + unlink(name); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = address; + *mapped_size = request_size; + + if (CloseTransientFile(fd) != 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not close shared memory segment \"%s\": %m", + name))); + return false; + } + + return true; +} +#endif + +/* + * Implementation-specific actions that must be performed when a segment is to + * be preserved even when no backend has it attached. + * + * Except on Windows, we don't need to do anything at all. But since Windows + * cleans up segments automatically when no references remain, we duplicate + * the segment handle into the postmaster process. The postmaster needn't + * do anything to receive the handle; Windows transfers it automatically. + */ +void +dsm_impl_pin_segment(dsm_handle handle, void *impl_private, + void **impl_private_pm_handle) +{ + switch (dynamic_shared_memory_type) + { +#ifdef USE_DSM_WINDOWS + case DSM_IMPL_WINDOWS: + { + HANDLE hmap; + + if (!DuplicateHandle(GetCurrentProcess(), impl_private, + PostmasterHandle, &hmap, 0, FALSE, + DUPLICATE_SAME_ACCESS)) + { + char name[64]; + + snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle); + _dosmaperr(GetLastError()); + ereport(ERROR, + (errcode_for_dynamic_shared_memory(), + errmsg("could not duplicate handle for \"%s\": %m", + name))); + } + + /* + * Here, we remember the handle that we created in the + * postmaster process. This handle isn't actually usable in + * any process other than the postmaster, but that doesn't + * matter. We're just holding onto it so that, if the segment + * is unpinned, dsm_impl_unpin_segment can close it. + */ + *impl_private_pm_handle = hmap; + break; + } +#endif + default: + break; + } +} + +/* + * Implementation-specific actions that must be performed when a segment is no + * longer to be preserved, so that it will be cleaned up when all backends + * have detached from it. + * + * Except on Windows, we don't need to do anything at all. For Windows, we + * close the extra handle that dsm_impl_pin_segment created in the + * postmaster's process space. + */ +void +dsm_impl_unpin_segment(dsm_handle handle, void **impl_private) +{ + switch (dynamic_shared_memory_type) + { +#ifdef USE_DSM_WINDOWS + case DSM_IMPL_WINDOWS: + { + if (*impl_private && + !DuplicateHandle(PostmasterHandle, *impl_private, + NULL, NULL, 0, FALSE, + DUPLICATE_CLOSE_SOURCE)) + { + char name[64]; + + snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle); + _dosmaperr(GetLastError()); + ereport(ERROR, + (errcode_for_dynamic_shared_memory(), + errmsg("could not duplicate handle for \"%s\": %m", + name))); + } + + *impl_private = NULL; + break; + } +#endif + default: + break; + } +} + +static int +errcode_for_dynamic_shared_memory(void) +{ + if (errno == EFBIG || errno == ENOMEM) + return errcode(ERRCODE_OUT_OF_MEMORY); + else + return errcode_for_file_access(); +} |