summaryrefslogtreecommitdiffstats
path: root/src/backend/port/sysv_shmem.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-13 13:44:03 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-13 13:44:03 +0000
commit293913568e6a7a86fd1479e1cff8e2ecb58d6568 (patch)
treefc3b469a3ec5ab71b36ea97cc7aaddb838423a0c /src/backend/port/sysv_shmem.c
parentInitial commit. (diff)
downloadpostgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.tar.xz
postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.zip
Adding upstream version 16.2.upstream/16.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/port/sysv_shmem.c')
-rw-r--r--src/backend/port/sysv_shmem.c976
1 files changed, 976 insertions, 0 deletions
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
new file mode 100644
index 0000000..eaba244
--- /dev/null
+++ b/src/backend/port/sysv_shmem.c
@@ -0,0 +1,976 @@
+/*-------------------------------------------------------------------------
+ *
+ * sysv_shmem.c
+ * Implement shared memory using SysV facilities
+ *
+ * These routines used to be a fairly thin layer on top of SysV shared
+ * memory functionality. With the addition of anonymous-shmem logic,
+ * they're a bit fatter now. We still require a SysV shmem block to
+ * exist, though, because mmap'd shmem provides no way to find out how
+ * many processes are attached, which we need for interlocking purposes.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/port/sysv_shmem.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+#include <sys/file.h>
+#include <sys/ipc.h>
+#include <sys/mman.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+
+#include "miscadmin.h"
+#include "port/pg_bitutils.h"
+#include "portability/mem.h"
+#include "storage/dsm.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/pg_shmem.h"
+#include "utils/guc_hooks.h"
+#include "utils/pidfile.h"
+
+
+/*
+ * As of PostgreSQL 9.3, we normally allocate only a very small amount of
+ * System V shared memory, and only for the purposes of providing an
+ * interlock to protect the data directory. The real shared memory block
+ * is allocated using mmap(). This works around the problem that many
+ * systems have very low limits on the amount of System V shared memory
+ * that can be allocated. Even a limit of a few megabytes will be enough
+ * to run many copies of PostgreSQL without needing to adjust system settings.
+ *
+ * We assume that no one will attempt to run PostgreSQL 9.3 or later on
+ * systems that are ancient enough that anonymous shared memory is not
+ * supported, such as pre-2.4 versions of Linux. If that turns out to be
+ * false, we might need to add compile and/or run-time tests here and do this
+ * only if the running kernel supports it.
+ *
+ * However, we must always disable this logic in the EXEC_BACKEND case, and
+ * fall back to the old method of allocating the entire segment using System V
+ * shared memory, because there's no way to attach an anonymous mmap'd segment
+ * to a process after exec(). Since EXEC_BACKEND is intended only for
+ * developer use, this shouldn't be a big problem. Because of this, we do
+ * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below.
+ *
+ * As of PostgreSQL 12, we regained the ability to use a large System V shared
+ * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set
+ * to sysv (though this is not the default).
+ */
+
+
+typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */
+typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */
+
+/*
+ * How does a given IpcMemoryId relate to this PostgreSQL process?
+ *
+ * One could recycle unattached segments of different data directories if we
+ * distinguished that case from other SHMSTATE_FOREIGN cases. Doing so would
+ * cause us to visit less of the key space, making us less likely to detect a
+ * SHMSTATE_ATTACHED key. It would also complicate the concurrency analysis,
+ * in that postmasters of different data directories could simultaneously
+ * attempt to recycle a given key. We'll waste keys longer in some cases, but
+ * avoiding the problems of the alternative justifies that loss.
+ */
+typedef enum
+{
+ SHMSTATE_ANALYSIS_FAILURE, /* unexpected failure to analyze the ID */
+ SHMSTATE_ATTACHED, /* pertinent to DataDir, has attached PIDs */
+ SHMSTATE_ENOENT, /* no segment of that ID */
+ SHMSTATE_FOREIGN, /* exists, but not pertinent to DataDir */
+ SHMSTATE_UNATTACHED /* pertinent to DataDir, no attached PIDs */
+} IpcMemoryState;
+
+
+unsigned long UsedShmemSegID = 0;
+void *UsedShmemSegAddr = NULL;
+
+static Size AnonymousShmemSize;
+static void *AnonymousShmem = NULL;
+
+static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
+static void IpcMemoryDetach(int status, Datum shmaddr);
+static void IpcMemoryDelete(int status, Datum shmId);
+static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId,
+ void *attachAt,
+ PGShmemHeader **addr);
+
+
+/*
+ * InternalIpcMemoryCreate(memKey, size)
+ *
+ * Attempt to create a new shared memory segment with the specified key.
+ * Will fail (return NULL) if such a segment already exists. If successful,
+ * attach the segment to the current process and return its attached address.
+ * On success, callbacks are registered with on_shmem_exit to detach and
+ * delete the segment when on_shmem_exit is called.
+ *
+ * If we fail with a failure code other than collision-with-existing-segment,
+ * print out an error and abort. Other types of errors are not recoverable.
+ */
+static void *
+InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
+{
+ IpcMemoryId shmid;
+ void *requestedAddress = NULL;
+ void *memAddress;
+
+ /*
+ * Normally we just pass requestedAddress = NULL to shmat(), allowing the
+ * system to choose where the segment gets mapped. But in an EXEC_BACKEND
+ * build, it's possible for whatever is chosen in the postmaster to not
+ * work for backends, due to variations in address space layout. As a
+ * rather klugy workaround, allow the user to specify the address to use
+ * via setting the environment variable PG_SHMEM_ADDR. (If this were of
+ * interest for anything except debugging, we'd probably create a cleaner
+ * and better-documented way to set it, such as a GUC.)
+ */
+#ifdef EXEC_BACKEND
+ {
+ char *pg_shmem_addr = getenv("PG_SHMEM_ADDR");
+
+ if (pg_shmem_addr)
+ requestedAddress = (void *) strtoul(pg_shmem_addr, NULL, 0);
+ else
+ {
+#if defined(__darwin__) && SIZEOF_VOID_P == 8
+ /*
+ * Provide a default value that is believed to avoid problems with
+ * ASLR on the current macOS release.
+ */
+ requestedAddress = (void *) 0x80000000000;
+#endif
+ }
+ }
+#endif
+
+ shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
+
+ if (shmid < 0)
+ {
+ int shmget_errno = errno;
+
+ /*
+ * Fail quietly if error indicates a collision with existing segment.
+ * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
+ * we could get a permission violation instead? Also, EIDRM might
+ * occur if an old seg is slated for destruction but not gone yet.
+ */
+ if (shmget_errno == EEXIST || shmget_errno == EACCES
+#ifdef EIDRM
+ || shmget_errno == EIDRM
+#endif
+ )
+ return NULL;
+
+ /*
+ * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if
+ * there is an existing segment but it's smaller than "size" (this is
+ * a result of poorly-thought-out ordering of error tests). To
+ * distinguish between collision and invalid size in such cases, we
+ * make a second try with size = 0. These kernels do not test size
+ * against SHMMIN in the preexisting-segment case, so we will not get
+ * EINVAL a second time if there is such a segment.
+ */
+ if (shmget_errno == EINVAL)
+ {
+ shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection);
+
+ if (shmid < 0)
+ {
+ /* As above, fail quietly if we verify a collision */
+ if (errno == EEXIST || errno == EACCES
+#ifdef EIDRM
+ || errno == EIDRM
+#endif
+ )
+ return NULL;
+ /* Otherwise, fall through to report the original error */
+ }
+ else
+ {
+ /*
+ * On most platforms we cannot get here because SHMMIN is
+ * greater than zero. However, if we do succeed in creating a
+ * zero-size segment, free it and then fall through to report
+ * the original error.
+ */
+ if (shmctl(shmid, IPC_RMID, NULL) < 0)
+ elog(LOG, "shmctl(%d, %d, 0) failed: %m",
+ (int) shmid, IPC_RMID);
+ }
+ }
+
+ /*
+ * Else complain and abort.
+ *
+ * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX
+ * is violated. SHMALL violation might be reported as either ENOMEM
+ * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which
+ * it should be. SHMMNI violation is ENOSPC, per spec. Just plain
+ * not-enough-RAM is ENOMEM.
+ */
+ errno = shmget_errno;
+ ereport(FATAL,
+ (errmsg("could not create shared memory segment: %m"),
+ errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o).",
+ (unsigned long) memKey, size,
+ IPC_CREAT | IPC_EXCL | IPCProtection),
+ (shmget_errno == EINVAL) ?
+ errhint("This error usually means that PostgreSQL's request for a shared memory "
+ "segment exceeded your kernel's SHMMAX parameter, or possibly that "
+ "it is less than "
+ "your kernel's SHMMIN parameter.\n"
+ "The PostgreSQL documentation contains more information about shared "
+ "memory configuration.") : 0,
+ (shmget_errno == ENOMEM) ?
+ errhint("This error usually means that PostgreSQL's request for a shared "
+ "memory segment exceeded your kernel's SHMALL parameter. You might need "
+ "to reconfigure the kernel with larger SHMALL.\n"
+ "The PostgreSQL documentation contains more information about shared "
+ "memory configuration.") : 0,
+ (shmget_errno == ENOSPC) ?
+ errhint("This error does *not* mean that you have run out of disk space. "
+ "It occurs either if all available shared memory IDs have been taken, "
+ "in which case you need to raise the SHMMNI parameter in your kernel, "
+ "or because the system's overall limit for shared memory has been "
+ "reached.\n"
+ "The PostgreSQL documentation contains more information about shared "
+ "memory configuration.") : 0));
+ }
+
+ /* Register on-exit routine to delete the new segment */
+ on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid));
+
+ /* OK, should be able to attach to the segment */
+ memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS);
+
+ if (memAddress == (void *) -1)
+ elog(FATAL, "shmat(id=%d, addr=%p, flags=0x%x) failed: %m",
+ shmid, requestedAddress, PG_SHMAT_FLAGS);
+
+ /* Register on-exit routine to detach new segment before deleting */
+ on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress));
+
+ /*
+ * Store shmem key and ID in data directory lockfile. Format to try to
+ * keep it the same length always (trailing junk in the lockfile won't
+ * hurt, but might confuse humans).
+ */
+ {
+ char line[64];
+
+ sprintf(line, "%9lu %9lu",
+ (unsigned long) memKey, (unsigned long) shmid);
+ AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line);
+ }
+
+ return memAddress;
+}
+
+/****************************************************************************/
+/* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */
+/* from process' address space */
+/* (called as an on_shmem_exit callback, hence funny argument list) */
+/****************************************************************************/
+static void
+IpcMemoryDetach(int status, Datum shmaddr)
+{
+ /* Detach System V shared memory block. */
+ if (shmdt((void *) DatumGetPointer(shmaddr)) < 0)
+ elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr));
+}
+
+/****************************************************************************/
+/* IpcMemoryDelete(status, shmId) deletes a shared memory segment */
+/* (called as an on_shmem_exit callback, hence funny argument list) */
+/****************************************************************************/
+static void
+IpcMemoryDelete(int status, Datum shmId)
+{
+ if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0)
+ elog(LOG, "shmctl(%d, %d, 0) failed: %m",
+ DatumGetInt32(shmId), IPC_RMID);
+}
+
+/*
+ * PGSharedMemoryIsInUse
+ *
+ * Is a previously-existing shmem segment still existing and in use?
+ *
+ * The point of this exercise is to detect the case where a prior postmaster
+ * crashed, but it left child backends that are still running. Therefore
+ * we only care about shmem segments that are associated with the intended
+ * DataDir. This is an important consideration since accidental matches of
+ * shmem segment IDs are reasonably common.
+ */
+bool
+PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
+{
+ PGShmemHeader *memAddress;
+ IpcMemoryState state;
+
+ state = PGSharedMemoryAttach((IpcMemoryId) id2, NULL, &memAddress);
+ if (memAddress && shmdt((void *) memAddress) < 0)
+ elog(LOG, "shmdt(%p) failed: %m", memAddress);
+ switch (state)
+ {
+ case SHMSTATE_ENOENT:
+ case SHMSTATE_FOREIGN:
+ case SHMSTATE_UNATTACHED:
+ return false;
+ case SHMSTATE_ANALYSIS_FAILURE:
+ case SHMSTATE_ATTACHED:
+ return true;
+ }
+ return true;
+}
+
+/*
+ * Test for a segment with id shmId; see comment at IpcMemoryState.
+ *
+ * If the segment exists, we'll attempt to attach to it, using attachAt
+ * if that's not NULL (but it's best to pass NULL if possible).
+ *
+ * *addr is set to the segment memory address if we attached to it, else NULL.
+ */
+static IpcMemoryState
+PGSharedMemoryAttach(IpcMemoryId shmId,
+ void *attachAt,
+ PGShmemHeader **addr)
+{
+ struct shmid_ds shmStat;
+ struct stat statbuf;
+ PGShmemHeader *hdr;
+
+ *addr = NULL;
+
+ /*
+ * First, try to stat the shm segment ID, to see if it exists at all.
+ */
+ if (shmctl(shmId, IPC_STAT, &shmStat) < 0)
+ {
+ /*
+ * EINVAL actually has multiple possible causes documented in the
+ * shmctl man page, but we assume it must mean the segment no longer
+ * exists.
+ */
+ if (errno == EINVAL)
+ return SHMSTATE_ENOENT;
+
+ /*
+ * EACCES implies we have no read permission, which means it is not a
+ * Postgres shmem segment (or at least, not one that is relevant to
+ * our data directory).
+ */
+ if (errno == EACCES)
+ return SHMSTATE_FOREIGN;
+
+ /*
+ * Some Linux kernel versions (in fact, all of them as of July 2007)
+ * sometimes return EIDRM when EINVAL is correct. The Linux kernel
+ * actually does not have any internal state that would justify
+ * returning EIDRM, so we can get away with assuming that EIDRM is
+ * equivalent to EINVAL on that platform.
+ */
+#ifdef HAVE_LINUX_EIDRM_BUG
+ if (errno == EIDRM)
+ return SHMSTATE_ENOENT;
+#endif
+
+ /*
+ * Otherwise, we had better assume that the segment is in use. The
+ * only likely case is (non-Linux, assumed spec-compliant) EIDRM,
+ * which implies that the segment has been IPC_RMID'd but there are
+ * still processes attached to it.
+ */
+ return SHMSTATE_ANALYSIS_FAILURE;
+ }
+
+ /*
+ * Try to attach to the segment and see if it matches our data directory.
+ * This avoids any risk of duplicate-shmem-key conflicts on machines that
+ * are running several postmasters under the same userid.
+ *
+ * (When we're called from PGSharedMemoryCreate, this stat call is
+ * duplicative; but since this isn't a high-traffic case it's not worth
+ * trying to optimize.)
+ */
+ if (stat(DataDir, &statbuf) < 0)
+ return SHMSTATE_ANALYSIS_FAILURE; /* can't stat; be conservative */
+
+ hdr = (PGShmemHeader *) shmat(shmId, attachAt, PG_SHMAT_FLAGS);
+ if (hdr == (PGShmemHeader *) -1)
+ {
+ /*
+ * Attachment failed. The cases we're interested in are the same as
+ * for the shmctl() call above. In particular, note that the owning
+ * postmaster could have terminated and removed the segment between
+ * shmctl() and shmat().
+ *
+ * If attachAt isn't NULL, it's possible that EINVAL reflects a
+ * problem with that address not a vanished segment, so it's best to
+ * pass NULL when probing for conflicting segments.
+ */
+ if (errno == EINVAL)
+ return SHMSTATE_ENOENT; /* segment disappeared */
+ if (errno == EACCES)
+ return SHMSTATE_FOREIGN; /* must be non-Postgres */
+#ifdef HAVE_LINUX_EIDRM_BUG
+ if (errno == EIDRM)
+ return SHMSTATE_ENOENT; /* segment disappeared */
+#endif
+ /* Otherwise, be conservative. */
+ return SHMSTATE_ANALYSIS_FAILURE;
+ }
+ *addr = hdr;
+
+ if (hdr->magic != PGShmemMagic ||
+ hdr->device != statbuf.st_dev ||
+ hdr->inode != statbuf.st_ino)
+ {
+ /*
+ * It's either not a Postgres segment, or not one for my data
+ * directory.
+ */
+ return SHMSTATE_FOREIGN;
+ }
+
+ /*
+ * It does match our data directory, so now test whether any processes are
+ * still attached to it. (We are, now, but the shm_nattch result is from
+ * before we attached to it.)
+ */
+ return shmStat.shm_nattch == 0 ? SHMSTATE_UNATTACHED : SHMSTATE_ATTACHED;
+}
+
+/*
+ * Identify the huge page size to use, and compute the related mmap flags.
+ *
+ * Some Linux kernel versions have a bug causing mmap() to fail on requests
+ * that are not a multiple of the hugepage size. Versions without that bug
+ * instead silently round the request up to the next hugepage multiple ---
+ * and then munmap() fails when we give it a size different from that.
+ * So we have to round our request up to a multiple of the actual hugepage
+ * size to avoid trouble.
+ *
+ * Doing the round-up ourselves also lets us make use of the extra memory,
+ * rather than just wasting it. Currently, we just increase the available
+ * space recorded in the shmem header, which will make the extra usable for
+ * purposes such as additional locktable entries. Someday, for very large
+ * hugepage sizes, we might want to think about more invasive strategies,
+ * such as increasing shared_buffers to absorb the extra space.
+ *
+ * Returns the (real, assumed or config provided) page size into
+ * *hugepagesize, and the hugepage-related mmap flags to use into
+ * *mmap_flags if requested by the caller. If huge pages are not supported,
+ * *hugepagesize and *mmap_flags are set to 0.
+ */
+void
+GetHugePageSize(Size *hugepagesize, int *mmap_flags)
+{
+#ifdef MAP_HUGETLB
+
+ Size default_hugepagesize = 0;
+ Size hugepagesize_local = 0;
+ int mmap_flags_local = 0;
+
+ /*
+ * System-dependent code to find out the default huge page size.
+ *
+ * On Linux, read /proc/meminfo looking for a line like "Hugepagesize:
+ * nnnn kB". Ignore any failures, falling back to the preset default.
+ */
+#ifdef __linux__
+
+ {
+ FILE *fp = AllocateFile("/proc/meminfo", "r");
+ char buf[128];
+ unsigned int sz;
+ char ch;
+
+ if (fp)
+ {
+ while (fgets(buf, sizeof(buf), fp))
+ {
+ if (sscanf(buf, "Hugepagesize: %u %c", &sz, &ch) == 2)
+ {
+ if (ch == 'k')
+ {
+ default_hugepagesize = sz * (Size) 1024;
+ break;
+ }
+ /* We could accept other units besides kB, if needed */
+ }
+ }
+ FreeFile(fp);
+ }
+ }
+#endif /* __linux__ */
+
+ if (huge_page_size != 0)
+ {
+ /* If huge page size is requested explicitly, use that. */
+ hugepagesize_local = (Size) huge_page_size * 1024;
+ }
+ else if (default_hugepagesize != 0)
+ {
+ /* Otherwise use the system default, if we have it. */
+ hugepagesize_local = default_hugepagesize;
+ }
+ else
+ {
+ /*
+ * If we fail to find out the system's default huge page size, or no
+ * huge page size is requested explicitly, assume it is 2MB. This will
+ * work fine when the actual size is less. If it's more, we might get
+ * mmap() or munmap() failures due to unaligned requests; but at this
+ * writing, there are no reports of any non-Linux systems being picky
+ * about that.
+ */
+ hugepagesize_local = 2 * 1024 * 1024;
+ }
+
+ mmap_flags_local = MAP_HUGETLB;
+
+ /*
+ * On recent enough Linux, also include the explicit page size, if
+ * necessary.
+ */
+#if defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT)
+ if (hugepagesize_local != default_hugepagesize)
+ {
+ int shift = pg_ceil_log2_64(hugepagesize_local);
+
+ mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
+ }
+#endif
+
+ /* assign the results found */
+ if (mmap_flags)
+ *mmap_flags = mmap_flags_local;
+ if (hugepagesize)
+ *hugepagesize = hugepagesize_local;
+
+#else
+
+ if (hugepagesize)
+ *hugepagesize = 0;
+ if (mmap_flags)
+ *mmap_flags = 0;
+
+#endif /* MAP_HUGETLB */
+}
+
+/*
+ * GUC check_hook for huge_page_size
+ */
+bool
+check_huge_page_size(int *newval, void **extra, GucSource source)
+{
+#if !(defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT))
+ /* Recent enough Linux only, for now. See GetHugePageSize(). */
+ if (*newval != 0)
+ {
+ GUC_check_errdetail("huge_page_size must be 0 on this platform.");
+ return false;
+ }
+#endif
+ return true;
+}
+
+/*
+ * Creates an anonymous mmap()ed shared memory segment.
+ *
+ * Pass the requested size in *size. This function will modify *size to the
+ * actual size of the allocation, if it ends up allocating a segment that is
+ * larger than requested.
+ */
+static void *
+CreateAnonymousSegment(Size *size)
+{
+ Size allocsize = *size;
+ void *ptr = MAP_FAILED;
+ int mmap_errno = 0;
+
+#ifndef MAP_HUGETLB
+ /* PGSharedMemoryCreate should have dealt with this case */
+ Assert(huge_pages != HUGE_PAGES_ON);
+#else
+ if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
+ {
+ /*
+ * Round up the request size to a suitable large value.
+ */
+ Size hugepagesize;
+ int mmap_flags;
+
+ GetHugePageSize(&hugepagesize, &mmap_flags);
+
+ if (allocsize % hugepagesize != 0)
+ allocsize += hugepagesize - (allocsize % hugepagesize);
+
+ ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
+ PG_MMAP_FLAGS | mmap_flags, -1, 0);
+ mmap_errno = errno;
+ if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
+ elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
+ allocsize);
+ }
+#endif
+
+ if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON)
+ {
+ /*
+ * Use the original size, not the rounded-up value, when falling back
+ * to non-huge pages.
+ */
+ allocsize = *size;
+ ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
+ PG_MMAP_FLAGS, -1, 0);
+ mmap_errno = errno;
+ }
+
+ if (ptr == MAP_FAILED)
+ {
+ errno = mmap_errno;
+ ereport(FATAL,
+ (errmsg("could not map anonymous shared memory: %m"),
+ (mmap_errno == ENOMEM) ?
+ errhint("This error usually means that PostgreSQL's request "
+ "for a shared memory segment exceeded available memory, "
+ "swap space, or huge pages. To reduce the request size "
+ "(currently %zu bytes), reduce PostgreSQL's shared "
+ "memory usage, perhaps by reducing shared_buffers or "
+ "max_connections.",
+ allocsize) : 0));
+ }
+
+ *size = allocsize;
+ return ptr;
+}
+
+/*
+ * AnonymousShmemDetach --- detach from an anonymous mmap'd block
+ * (called as an on_shmem_exit callback, hence funny argument list)
+ */
+static void
+AnonymousShmemDetach(int status, Datum arg)
+{
+ /* Release anonymous shared memory block, if any. */
+ if (AnonymousShmem != NULL)
+ {
+ if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
+ elog(LOG, "munmap(%p, %zu) failed: %m",
+ AnonymousShmem, AnonymousShmemSize);
+ AnonymousShmem = NULL;
+ }
+}
+
+/*
+ * PGSharedMemoryCreate
+ *
+ * Create a shared memory segment of the given size and initialize its
+ * standard header. Also, register an on_shmem_exit callback to release
+ * the storage.
+ *
+ * Dead Postgres segments pertinent to this DataDir are recycled if found, but
+ * we do not fail upon collision with foreign shmem segments. The idea here
+ * is to detect and re-use keys that may have been assigned by a crashed
+ * postmaster or backend.
+ */
+PGShmemHeader *
+PGSharedMemoryCreate(Size size,
+ PGShmemHeader **shim)
+{
+ IpcMemoryKey NextShmemSegID;
+ void *memAddress;
+ PGShmemHeader *hdr;
+ struct stat statbuf;
+ Size sysvsize;
+
+ /*
+ * We use the data directory's ID info (inode and device numbers) to
+ * positively identify shmem segments associated with this data dir, and
+ * also as seeds for searching for a free shmem key.
+ */
+ if (stat(DataDir, &statbuf) < 0)
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not stat data directory \"%s\": %m",
+ DataDir)));
+
+ /* Complain if hugepages demanded but we can't possibly support them */
+#if !defined(MAP_HUGETLB)
+ if (huge_pages == HUGE_PAGES_ON)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("huge pages not supported on this platform")));
+#endif
+
+ /* For now, we don't support huge pages in SysV memory */
+ if (huge_pages == HUGE_PAGES_ON && shared_memory_type != SHMEM_TYPE_MMAP)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("huge pages not supported with the current shared_memory_type setting")));
+
+ /* Room for a header? */
+ Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
+
+ if (shared_memory_type == SHMEM_TYPE_MMAP)
+ {
+ AnonymousShmem = CreateAnonymousSegment(&size);
+ AnonymousShmemSize = size;
+
+ /* Register on-exit routine to unmap the anonymous segment */
+ on_shmem_exit(AnonymousShmemDetach, (Datum) 0);
+
+ /* Now we need only allocate a minimal-sized SysV shmem block. */
+ sysvsize = sizeof(PGShmemHeader);
+ }
+ else
+ sysvsize = size;
+
+ /*
+ * Loop till we find a free IPC key. Trust CreateDataDirLockFile() to
+ * ensure no more than one postmaster per data directory can enter this
+ * loop simultaneously. (CreateDataDirLockFile() does not entirely ensure
+ * that, but prefer fixing it over coping here.)
+ */
+ NextShmemSegID = statbuf.st_ino;
+
+ for (;;)
+ {
+ IpcMemoryId shmid;
+ PGShmemHeader *oldhdr;
+ IpcMemoryState state;
+
+ /* Try to create new segment */
+ memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
+ if (memAddress)
+ break; /* successful create and attach */
+
+ /* Check shared memory and possibly remove and recreate */
+
+ /*
+ * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN.
+ * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can
+ * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN.
+ */
+ shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0);
+ if (shmid < 0)
+ {
+ oldhdr = NULL;
+ state = SHMSTATE_FOREIGN;
+ }
+ else
+ state = PGSharedMemoryAttach(shmid, NULL, &oldhdr);
+
+ switch (state)
+ {
+ case SHMSTATE_ANALYSIS_FAILURE:
+ case SHMSTATE_ATTACHED:
+ ereport(FATAL,
+ (errcode(ERRCODE_LOCK_FILE_EXISTS),
+ errmsg("pre-existing shared memory block (key %lu, ID %lu) is still in use",
+ (unsigned long) NextShmemSegID,
+ (unsigned long) shmid),
+ errhint("Terminate any old server processes associated with data directory \"%s\".",
+ DataDir)));
+ break;
+ case SHMSTATE_ENOENT:
+
+ /*
+ * To our surprise, some other process deleted since our last
+ * InternalIpcMemoryCreate(). Moments earlier, we would have
+ * seen SHMSTATE_FOREIGN. Try that same ID again.
+ */
+ elog(LOG,
+ "shared memory block (key %lu, ID %lu) deleted during startup",
+ (unsigned long) NextShmemSegID,
+ (unsigned long) shmid);
+ break;
+ case SHMSTATE_FOREIGN:
+ NextShmemSegID++;
+ break;
+ case SHMSTATE_UNATTACHED:
+
+ /*
+ * The segment pertains to DataDir, and every process that had
+ * used it has died or detached. Zap it, if possible, and any
+ * associated dynamic shared memory segments, as well. This
+ * shouldn't fail, but if it does, assume the segment belongs
+ * to someone else after all, and try the next candidate.
+ * Otherwise, try again to create the segment. That may fail
+ * if some other process creates the same shmem key before we
+ * do, in which case we'll try the next key.
+ */
+ if (oldhdr->dsm_control != 0)
+ dsm_cleanup_using_control_segment(oldhdr->dsm_control);
+ if (shmctl(shmid, IPC_RMID, NULL) < 0)
+ NextShmemSegID++;
+ break;
+ }
+
+ if (oldhdr && shmdt((void *) oldhdr) < 0)
+ elog(LOG, "shmdt(%p) failed: %m", oldhdr);
+ }
+
+ /* Initialize new segment. */
+ hdr = (PGShmemHeader *) memAddress;
+ hdr->creatorPID = getpid();
+ hdr->magic = PGShmemMagic;
+ hdr->dsm_control = 0;
+
+ /* Fill in the data directory ID info, too */
+ hdr->device = statbuf.st_dev;
+ hdr->inode = statbuf.st_ino;
+
+ /*
+ * Initialize space allocation status for segment.
+ */
+ hdr->totalsize = size;
+ hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
+ *shim = hdr;
+
+ /* Save info for possible future use */
+ UsedShmemSegAddr = memAddress;
+ UsedShmemSegID = (unsigned long) NextShmemSegID;
+
+ /*
+ * If AnonymousShmem is NULL here, then we're not using anonymous shared
+ * memory, and should return a pointer to the System V shared memory
+ * block. Otherwise, the System V shared memory block is only a shim, and
+ * we must return a pointer to the real block.
+ */
+ if (AnonymousShmem == NULL)
+ return hdr;
+ memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
+ return (PGShmemHeader *) AnonymousShmem;
+}
+
+#ifdef EXEC_BACKEND
+
+/*
+ * PGSharedMemoryReAttach
+ *
+ * This is called during startup of a postmaster child process to re-attach to
+ * an already existing shared memory segment. This is needed only in the
+ * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory
+ * segment attachment via fork().
+ *
+ * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
+ * routine. The caller must have already restored them to the postmaster's
+ * values.
+ */
+void
+PGSharedMemoryReAttach(void)
+{
+ IpcMemoryId shmid;
+ PGShmemHeader *hdr;
+ IpcMemoryState state;
+ void *origUsedShmemSegAddr = UsedShmemSegAddr;
+
+ Assert(UsedShmemSegAddr != NULL);
+ Assert(IsUnderPostmaster);
+
+#ifdef __CYGWIN__
+ /* cygipc (currently) appears to not detach on exec. */
+ PGSharedMemoryDetach();
+ UsedShmemSegAddr = origUsedShmemSegAddr;
+#endif
+
+ elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
+ shmid = shmget(UsedShmemSegID, sizeof(PGShmemHeader), 0);
+ if (shmid < 0)
+ state = SHMSTATE_FOREIGN;
+ else
+ state = PGSharedMemoryAttach(shmid, UsedShmemSegAddr, &hdr);
+ if (state != SHMSTATE_ATTACHED)
+ elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m",
+ (int) UsedShmemSegID, UsedShmemSegAddr);
+ if (hdr != origUsedShmemSegAddr)
+ elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
+ hdr, origUsedShmemSegAddr);
+ dsm_set_control_handle(hdr->dsm_control);
+
+ UsedShmemSegAddr = hdr; /* probably redundant */
+}
+
+/*
+ * PGSharedMemoryNoReAttach
+ *
+ * This is called during startup of a postmaster child process when we choose
+ * *not* to re-attach to the existing shared memory segment. We must clean up
+ * to leave things in the appropriate state. This is not used in the non
+ * EXEC_BACKEND case, either.
+ *
+ * The child process startup logic might or might not call PGSharedMemoryDetach
+ * after this; make sure that it will be a no-op if called.
+ *
+ * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
+ * routine. The caller must have already restored them to the postmaster's
+ * values.
+ */
+void
+PGSharedMemoryNoReAttach(void)
+{
+ Assert(UsedShmemSegAddr != NULL);
+ Assert(IsUnderPostmaster);
+
+#ifdef __CYGWIN__
+ /* cygipc (currently) appears to not detach on exec. */
+ PGSharedMemoryDetach();
+#endif
+
+ /* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */
+ UsedShmemSegAddr = NULL;
+ /* And the same for UsedShmemSegID. */
+ UsedShmemSegID = 0;
+}
+
+#endif /* EXEC_BACKEND */
+
+/*
+ * PGSharedMemoryDetach
+ *
+ * Detach from the shared memory segment, if still attached. This is not
+ * intended to be called explicitly by the process that originally created the
+ * segment (it will have on_shmem_exit callback(s) registered to do that).
+ * Rather, this is for subprocesses that have inherited an attachment and want
+ * to get rid of it.
+ *
+ * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
+ * routine, also AnonymousShmem and AnonymousShmemSize.
+ */
+void
+PGSharedMemoryDetach(void)
+{
+ if (UsedShmemSegAddr != NULL)
+ {
+ if ((shmdt(UsedShmemSegAddr) < 0)
+#if defined(EXEC_BACKEND) && defined(__CYGWIN__)
+ /* Work-around for cygipc exec bug */
+ && shmdt(NULL) < 0
+#endif
+ )
+ elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
+ UsedShmemSegAddr = NULL;
+ }
+
+ if (AnonymousShmem != NULL)
+ {
+ if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
+ elog(LOG, "munmap(%p, %zu) failed: %m",
+ AnonymousShmem, AnonymousShmemSize);
+ AnonymousShmem = NULL;
+ }
+}