summaryrefslogtreecommitdiffstats
path: root/src/backend/port
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:19:15 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:19:15 +0000
commit6eb9c5a5657d1fe77b55cc261450f3538d35a94d (patch)
tree657d8194422a5daccecfd42d654b8a245ef7b4c8 /src/backend/port
parentInitial commit. (diff)
downloadpostgresql-13-upstream.tar.xz
postgresql-13-upstream.zip
Adding upstream version 13.4. upstream/13.4 upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/port')
-rw-r--r--src/backend/port/.gitignore3
-rw-r--r--src/backend/port/Makefile48
-rwxr-xr-xsrc/backend/port/aix/mkldexport.sh61
-rw-r--r--src/backend/port/atomics.c239
-rw-r--r--src/backend/port/hpux/tas.c.template40
-rw-r--r--src/backend/port/posix_sema.c388
-rw-r--r--src/backend/port/sysv_sema.c517
-rw-r--r--src/backend/port/sysv_shmem.c902
-rw-r--r--src/backend/port/tas/dummy.s0
-rw-r--r--src/backend/port/tas/hpux_hppa.s28
-rw-r--r--src/backend/port/tas/sunstudio_sparc.s53
-rw-r--r--src/backend/port/tas/sunstudio_x86.s43
-rw-r--r--src/backend/port/win32/Makefile23
-rw-r--r--src/backend/port/win32/crashdump.c183
-rw-r--r--src/backend/port/win32/signal.c344
-rw-r--r--src/backend/port/win32/socket.c692
-rw-r--r--src/backend/port/win32/timer.c121
-rw-r--r--src/backend/port/win32_sema.c235
-rw-r--r--src/backend/port/win32_shmem.c599
19 files changed, 4519 insertions, 0 deletions
diff --git a/src/backend/port/.gitignore b/src/backend/port/.gitignore
new file mode 100644
index 0000000..4ef36b8
--- /dev/null
+++ b/src/backend/port/.gitignore
@@ -0,0 +1,3 @@
+/pg_sema.c
+/pg_shmem.c
+/tas.s
diff --git a/src/backend/port/Makefile b/src/backend/port/Makefile
new file mode 100644
index 0000000..2d00b4f
--- /dev/null
+++ b/src/backend/port/Makefile
@@ -0,0 +1,48 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for the port-specific subsystem of the backend
+#
+# We have two different modes of operation: 1) put stuff specific to Port X
+# in subdirectory X and have that subdirectory's make file make it all, and
+# 2) use conditional statements in the present make file to include what's
+# necessary for a specific port in our own output. (1) came first, but (2)
+# is superior for many things, like when the same thing needs to be done for
+# multiple ports and you don't want to duplicate files in multiple
+# subdirectories. Much of the stuff done via Method 1 today should probably
+# be converted to Method 2.
+#
+# IDENTIFICATION
+# src/backend/port/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/port
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+# Objects built in this directory. $(TAS) is not set here; presumably it
+# comes from Makefile.global and names tas.o on ports that need it (confirm).
+OBJS = \
+ $(TAS) \
+ atomics.o \
+ pg_sema.o \
+ pg_shmem.o
+
+# Method 1 (see header): only the win32 port has a subdirectory to build.
+ifeq ($(PORTNAME), win32)
+SUBDIRS += win32
+endif
+
+include $(top_srcdir)/src/backend/common.mk
+
+# Assemble tas.s. With Sun Studio the file is first run through the C
+# preprocessor, leaving an intermediate tas_cpp.s that "clean" removes.
+tas.o: tas.s
+ifeq ($(SUN_STUDIO_CC), yes)
+# preprocess assembler file with cpp
+ $(CC) $(CFLAGS) -c -P $<
+ mv $*.i $*_cpp.s
+ $(CC) $(CFLAGS) -c $*_cpp.s -o $@
+else
+ $(CC) $(CFLAGS) -c $<
+endif
+
+# Note: the win32 subdirectory is cleaned regardless of $(PORTNAME).
+distclean clean:
+ rm -f tas_cpp.s
+ $(MAKE) -C win32 clean
diff --git a/src/backend/port/aix/mkldexport.sh b/src/backend/port/aix/mkldexport.sh
new file mode 100755
index 0000000..adf3793
--- /dev/null
+++ b/src/backend/port/aix/mkldexport.sh
@@ -0,0 +1,61 @@
+#!/bin/sh
+#
+# mkldexport
+#	create an AIX exports file from an object file
+#
+# src/backend/port/aix/mkldexport.sh
+#
+# Usage:
+#	mkldexport objectfile [location]
+# where
+#	objectfile is the current location of the object file.
+#	location is the eventual (installed) location of the
+#		object file (if different from the current
+#		working directory).
+#
+# Output (stdout): a "#!" header line, optionally naming the location,
+# followed by the sorted, de-duplicated list of global text/data/bss
+# symbols reported by nm.  Diagnostics are written to stderr so that they
+# cannot end up inside an exports file captured from stdout.
+#
+# [This file comes from the Postgres 4.2 distribution. - ay 7/95]
+#
+# Header: /usr/local/devel/postgres/src/tools/mkldexport/RCS/mkldexport.sh,v 1.2 1994/03/13 04:59:12 aoki Exp
+#
+
+# setting this to nm -B might be better
+# ... due to changes in AIX 4.x ...
+# ... let us search in different directories - Gerhard Reithofer
+if [ -x /usr/ucb/nm ]
+then NM=/usr/ucb/nm
+elif [ -x /usr/bin/nm ]
+then NM=/usr/bin/nm
+elif [ -x /usr/ccs/bin/nm ]
+then NM=/usr/ccs/bin/nm
+elif [ -x /usr/usg/bin/nm ]
+then NM=/usr/usg/bin/nm
+else
+	# The backquote must be escaped: inside double quotes a bare backquote
+	# begins a command substitution instead of printing literally.
+	echo "Fatal error: cannot find \`nm' ... please check your installation." >&2
+	exit 1
+fi
+
+CMDNAME=`basename "$0"`
+if [ -z "$1" ]; then
+	echo "Usage: $CMDNAME object [location]" >&2
+	exit 1
+fi
+
+# An object named foo.o is exported under the name foo.so
+OBJNAME=`basename "$1"`
+if [ "`basename $OBJNAME`" != "`basename $OBJNAME .o`" ]; then
+	OBJNAME=`basename $OBJNAME .o`.so
+fi
+
+# Header line of the exports file
+if [ -z "$2" ]; then
+	echo '#!'
+else
+	if [ "$2" = "." ]; then
+		# for the base executable (AIX 4.2 and up)
+		echo '#! .'
+	else
+		echo '#!' "$2"
+	fi
+fi
+
+# Keep global T/D/B symbols, strip everything up to the symbol name, drop
+# names containing '$', remove a leading '.', then sort and de-duplicate.
+"$NM" -BCg "$1" | \
+	egrep ' [TDB] ' | \
+	sed -e 's/.* //' | \
+	egrep -v '\$' | \
+	sed -e 's/^[.]//' | \
+	sort | \
+	uniq
diff --git a/src/backend/port/atomics.c b/src/backend/port/atomics.c
new file mode 100644
index 0000000..c4f8370
--- /dev/null
+++ b/src/backend/port/atomics.c
@@ -0,0 +1,239 @@
+/*-------------------------------------------------------------------------
+ *
+ * atomics.c
+ * Non-Inline parts of the atomics implementation
+ *
+ * Portions Copyright (c) 2013-2020, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/port/atomics.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "port/atomics.h"
+#include "storage/spin.h"
+
+#ifdef PG_HAVE_MEMORY_BARRIER_EMULATION
+#ifdef WIN32
+#error "barriers are required (and provided) on WIN32 platforms"
+#endif
+#include <signal.h>
+#endif
+
+#ifdef PG_HAVE_MEMORY_BARRIER_EMULATION
+/*
+ * pg_spinlock_barrier
+ *		Emulated memory barrier for platforms without a native one.
+ *
+ * This has to remain reentrant, since barriers can be issued from signal
+ * handlers.  The barrier is obtained as a side effect of probing the
+ * postmaster with kill(pid, 0): we assume that any kernel old enough to
+ * need this fallback executes an adequate barrier while checking whether
+ * that process exists.  The outcome of the probe is deliberately ignored.
+ */
+void
+pg_spinlock_barrier(void)
+{
+	(void) kill(PostmasterPid, 0);
+}
+#endif
+
+#ifdef PG_HAVE_COMPILER_BARRIER_EMULATION
+/*
+ * pg_extern_compiler_barrier
+ *		Out-of-line stand-in used when compiler barriers are emulated.
+ *
+ * The body is intentionally empty.  (Presumably the call to a function the
+ * compiler cannot see into is what provides the barrier effect.)
+ */
+void
+pg_extern_compiler_barrier(void)
+{
+	/* nothing to execute */
+}
+#endif
+
+
+#ifdef PG_HAVE_ATOMIC_FLAG_SIMULATION
+
+/*
+ * pg_atomic_init_flag_impl
+ *		Set up a simulated atomic flag: initialize its lock, clear its value.
+ */
+void
+pg_atomic_init_flag_impl(volatile pg_atomic_flag *ptr)
+{
+	slock_t    *lock = (slock_t *) &ptr->sema;
+
+	StaticAssertStmt(sizeof(ptr->sema) >= sizeof(slock_t),
+					 "size mismatch of atomic_flag vs slock_t");
+
+#ifdef HAVE_SPINLOCKS
+	SpinLockInit(lock);
+#else
+
+	/*
+	 * With semaphore-based TAS emulation the flag must draw from a separate
+	 * set of semaphores; otherwise touching an atomic variable while a
+	 * spinlock is held would get us in trouble.
+	 */
+	s_init_lock_sema(lock, true);
+#endif
+
+	ptr->value = false;
+}
+
+/*
+ * pg_atomic_test_set_flag_impl
+ *		Set the flag under its lock; return true iff it was previously clear.
+ */
+bool
+pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr)
+{
+	slock_t    *lock = (slock_t *) &ptr->sema;
+	uint32		previous;
+
+	SpinLockAcquire(lock);
+	previous = ptr->value;
+	ptr->value = true;
+	SpinLockRelease(lock);
+
+	/* success means we were the one who changed it from clear to set */
+	return !previous;
+}
+
+/*
+ * pg_atomic_clear_flag_impl
+ *		Reset the flag to "clear" while holding its lock.
+ */
+void
+pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr)
+{
+	slock_t    *lock = (slock_t *) &ptr->sema;
+
+	SpinLockAcquire(lock);
+	ptr->value = false;
+	SpinLockRelease(lock);
+}
+
+/*
+ * pg_atomic_unlocked_test_flag_impl
+ *		Report whether the flag currently reads as clear, without locking.
+ */
+bool
+pg_atomic_unlocked_test_flag_impl(volatile pg_atomic_flag *ptr)
+{
+	return !ptr->value;
+}
+
+#endif /* PG_HAVE_ATOMIC_FLAG_SIMULATION */
+
+#ifdef PG_HAVE_ATOMIC_U32_SIMULATION
+/*
+ * pg_atomic_init_u32_impl
+ *		Set up a simulated 32-bit atomic: initialize its lock, store val_.
+ */
+void
+pg_atomic_init_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val_)
+{
+	slock_t    *lock = (slock_t *) &ptr->sema;
+
+	StaticAssertStmt(sizeof(ptr->sema) >= sizeof(slock_t),
+					 "size mismatch of atomic_uint32 vs slock_t");
+
+	/*
+	 * When atomics are backed by semaphores, beware of atomics being used
+	 * while a spinlock is held (nested usage).
+	 */
+#ifdef HAVE_SPINLOCKS
+	SpinLockInit(lock);
+#else
+	s_init_lock_sema(lock, true);
+#endif
+
+	ptr->value = val_;
+}
+
+/*
+ * pg_atomic_write_u32_impl
+ *		Store val into the simulated atomic.
+ *
+ * Even a plain write takes the lock: it must make a concurrent
+ * pg_atomic_compare_exchange_u32() (and friends) fail, which an unlocked
+ * store could not guarantee.
+ */
+void
+pg_atomic_write_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val)
+{
+	slock_t    *lock = (slock_t *) &ptr->sema;
+
+	SpinLockAcquire(lock);
+	ptr->value = val;
+	SpinLockRelease(lock);
+}
+
+/*
+ * pg_atomic_compare_exchange_u32_impl
+ *		Strong compare-and-swap, emulated under the variable's lock.
+ *
+ * If the current value equals *expected, newval is stored and true is
+ * returned.  In every case *expected is overwritten with the value that was
+ * found.
+ *
+ * We always wait for the lock rather than giving up when it is busy: giving
+ * up would only yield a 'weak' CAS with spurious failures, whereas several
+ * algorithms depend on the strong form, which most major architectures
+ * implement efficiently.
+ */
+bool
+pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
+									uint32 *expected, uint32 newval)
+{
+	slock_t    *lock = (slock_t *) &ptr->sema;
+	uint32		current;
+	bool		matched;
+
+	SpinLockAcquire(lock);
+
+	current = ptr->value;
+	matched = (current == *expected);
+	*expected = current;
+	if (matched)
+		ptr->value = newval;
+
+	SpinLockRelease(lock);
+
+	return matched;
+}
+
+/*
+ * pg_atomic_fetch_add_u32_impl
+ *		Add add_ to the simulated atomic under its lock; return the old value.
+ */
+uint32
+pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
+{
+	slock_t    *lock = (slock_t *) &ptr->sema;
+	uint32		before;
+
+	SpinLockAcquire(lock);
+	before = ptr->value;
+	ptr->value = before + add_;
+	SpinLockRelease(lock);
+
+	return before;
+}
+
+#endif /* PG_HAVE_ATOMIC_U32_SIMULATION */
+
+
+#ifdef PG_HAVE_ATOMIC_U64_SIMULATION
+
+/*
+ * pg_atomic_init_u64_impl
+ *		Set up a simulated 64-bit atomic: initialize its lock, store val_.
+ */
+void
+pg_atomic_init_u64_impl(volatile pg_atomic_uint64 *ptr, uint64 val_)
+{
+	slock_t    *lock = (slock_t *) &ptr->sema;
+
+	StaticAssertStmt(sizeof(ptr->sema) >= sizeof(slock_t),
+					 "size mismatch of atomic_uint64 vs slock_t");
+
+	/*
+	 * When atomics are backed by semaphores, beware of atomics being used
+	 * while a spinlock is held (nested usage).
+	 */
+#ifdef HAVE_SPINLOCKS
+	SpinLockInit(lock);
+#else
+	s_init_lock_sema(lock, true);
+#endif
+
+	ptr->value = val_;
+}
+
+/*
+ * pg_atomic_compare_exchange_u64_impl
+ *		Strong compare-and-swap, emulated under the variable's lock.
+ *
+ * If the current value equals *expected, newval is stored and true is
+ * returned.  In every case *expected is overwritten with the value that was
+ * found.
+ *
+ * We always wait for the lock rather than giving up when it is busy: giving
+ * up would only yield a 'weak' CAS with spurious failures, whereas several
+ * algorithms depend on the strong form, which most major architectures
+ * implement efficiently.
+ */
+bool
+pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
+									uint64 *expected, uint64 newval)
+{
+	slock_t    *lock = (slock_t *) &ptr->sema;
+	uint64		current;
+	bool		matched;
+
+	SpinLockAcquire(lock);
+
+	current = ptr->value;
+	matched = (current == *expected);
+	*expected = current;
+	if (matched)
+		ptr->value = newval;
+
+	SpinLockRelease(lock);
+
+	return matched;
+}
+
+/*
+ * pg_atomic_fetch_add_u64_impl
+ *		Add add_ to the simulated atomic under its lock; return the old value.
+ */
+uint64
+pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
+{
+	slock_t    *lock = (slock_t *) &ptr->sema;
+	uint64		before;
+
+	SpinLockAcquire(lock);
+	before = ptr->value;
+	ptr->value = before + add_;
+	SpinLockRelease(lock);
+
+	return before;
+}
+
+#endif /* PG_HAVE_ATOMIC_U64_SIMULATION */
diff --git a/src/backend/port/hpux/tas.c.template b/src/backend/port/hpux/tas.c.template
new file mode 100644
index 0000000..5ccbbcd
--- /dev/null
+++ b/src/backend/port/hpux/tas.c.template
@@ -0,0 +1,40 @@
+/*
+ * tas() for HPPA.
+ *
+ * To generate tas.s using this template:
+ * 1. cc +O2 -S -c tas.c
+ * 2. edit tas.s:
+ * - replace the LDW with LDCWX
+ * 3. install as src/backend/port/tas/hpux_hppa.s.
+ *
+ * For details about the LDCWX instruction, see the "Precision
+ * Architecture and Instruction Reference Manual" (09740-90014 of June
+ * 1987), p. 5-38.
+ */
+
+int
+tas(lock)
+ int *lock; /* LDCWX is a word instruction */
+{
+ /*
+ * LDCWX requires that we align the "semaphore" to a 16-byte
+ * boundary. The actual datum is a single word (4 bytes).
+ */
+ lock = ((uintptr_t) lock + 15) & ~15;
+
+ /*
+ * The LDCWX instruction atomically clears the target word and
+ * returns the previous value. Hence, if the instruction returns
+ * 0, someone else has already acquired the lock before we tested
+ * it (i.e., we have failed).
+ *
+ * Notice that this means that we actually clear the word to set
+ * the lock and set the word to clear the lock. This is the
+ * opposite behavior from the SPARC LDSTUB instruction. For some
+ * reason everything that H-P does is rather baroque...
+ */
+ if (*lock) { /* this generates the LDW */
+ return(0); /* success */
+ }
+ return(1); /* failure */
+}
diff --git a/src/backend/port/posix_sema.c b/src/backend/port/posix_sema.c
new file mode 100644
index 0000000..277b82c
--- /dev/null
+++ b/src/backend/port/posix_sema.c
@@ -0,0 +1,388 @@
+/*-------------------------------------------------------------------------
+ *
+ * posix_sema.c
+ * Implement PGSemaphores using POSIX semaphore facilities
+ *
+ * We prefer the unnamed style of POSIX semaphore (the kind made with
+ * sem_init). We can cope with the kind made with sem_open, however.
+ *
+ * In either implementation, typedef PGSemaphore is equivalent to "sem_t *".
+ * With unnamed semaphores, the sem_t structs live in an array in shared
+ * memory. With named semaphores, that's not true because we cannot persuade
+ * sem_open to do its allocation there. Therefore, the named-semaphore code
+ * *does not cope with EXEC_BACKEND*. The sem_t structs will just be in the
+ * postmaster's private memory, where they are successfully inherited by
+ * forked backends, but they could not be accessed by exec'd backends.
+ *
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/port/posix_sema.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <semaphore.h>
+#include <signal.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/pg_sema.h"
+#include "storage/shmem.h"
+
+
+/* see file header comment */
+#if defined(USE_NAMED_POSIX_SEMAPHORES) && defined(EXEC_BACKEND)
+#error cannot use named POSIX semaphores with EXEC_BACKEND
+#endif
+
+typedef union SemTPadded /* a sem_t padded to at least PG_CACHE_LINE_SIZE bytes */
+{
+ sem_t pgsem; /* the semaphore itself */
+ char pad[PG_CACHE_LINE_SIZE]; /* only present to enforce the minimum size */
+} SemTPadded;
+
+/*
+ * typedef PGSemaphore is equivalent to pointer to sem_t
+ *
+ * The sem_t is wrapped in the padded union above; code in this file reaches
+ * it through the PG_SEM_REF() macro.
+ */
+typedef struct PGSemaphoreData
+{
+ SemTPadded sem_padded; /* padded semaphore storage */
+} PGSemaphoreData;
+
+#define PG_SEM_REF(x) (&(x)->sem_padded.pgsem)
+
+#define IPCProtection (0600) /* access/modify by user only */
+
+#ifdef USE_NAMED_POSIX_SEMAPHORES
+static sem_t **mySemPointers; /* keep track of created semaphores */
+#else
+static PGSemaphore sharedSemas; /* array of PGSemaphoreData in shared memory */
+#endif
+static int numSems; /* number of semas acquired so far */
+static int maxSems; /* allocated size of above arrays */
+static int nextSemKey; /* next name to try */
+
+
+static void ReleaseSemaphores(int status, Datum arg);
+
+
+#ifdef USE_NAMED_POSIX_SEMAPHORES
+
+/*
+ * PosixSemaphoreCreate
+ *
+ * Create a new named semaphore with initial count 1 and return it.
+ *
+ * Names of the form "/pgsql-<key>" are tried, consuming keys from
+ * nextSemKey.  A failure that indicates a collision with an existing
+ * semaphore (EEXIST, EACCES) or an interrupted call (EINTR) simply moves on
+ * to the next key.  Any other failure suggests a nonrecoverable problem and
+ * is reported at FATAL.
+ */
+static sem_t *
+PosixSemaphoreCreate(void)
+{
+	char		semname[64];
+	sem_t	   *mySem;
+	bool		created;
+
+	for (;;)
+	{
+		int			semKey = nextSemKey++;
+
+		snprintf(semname, sizeof(semname), "/pgsql-%d", semKey);
+
+		mySem = sem_open(semname, O_CREAT | O_EXCL,
+						 (mode_t) IPCProtection, (unsigned) 1);
+
+#ifdef SEM_FAILED
+		created = (mySem != (sem_t *) SEM_FAILED);
+#else
+		created = (mySem != (sem_t *) (-1));
+#endif
+		if (created)
+			break;
+
+		/* Anything but a collision-style error is fatal */
+		if (errno != EEXIST && errno != EACCES && errno != EINTR)
+			elog(FATAL, "sem_open(\"%s\") failed: %m", semname);
+	}
+
+	/*
+	 * Remove the name right away: nobody outside can then open the
+	 * semaphore, and it will disappear by itself should we crash.
+	 */
+	sem_unlink(semname);
+
+	return mySem;
+}
+#else /* !USE_NAMED_POSIX_SEMAPHORES */
+
+/*
+ * PosixSemaphoreCreate
+ *
+ * Initialize *sem as an unnamed semaphore with pshared = 1 and an initial
+ * count of 1.  Failure is reported at FATAL.
+ */
+static void
+PosixSemaphoreCreate(sem_t *sem)
+{
+	int			rc = sem_init(sem, 1, 1);
+
+	if (rc < 0)
+		elog(FATAL, "sem_init failed: %m");
+}
+#endif /* USE_NAMED_POSIX_SEMAPHORES */
+
+
+/*
+ * PosixSemaphoreKill - dispose of one semaphore
+ *
+ * Unnamed semaphores are released with sem_destroy(), named ones with
+ * sem_close().  A failure is only logged.
+ */
+static void
+PosixSemaphoreKill(sem_t *sem)
+{
+#ifndef USE_NAMED_POSIX_SEMAPHORES
+	if (sem_destroy(sem) < 0)
+		elog(LOG, "sem_destroy failed: %m");
+#else
+	if (sem_close(sem) < 0)
+		elog(LOG, "sem_close failed: %m");
+#endif
+}
+
+
+/*
+ * PGSemaphoreShmemSize
+ *
+ * Amount of shared memory needed to hold maxSemas semaphores: one
+ * PGSemaphoreData apiece for unnamed semaphores, nothing at all for named
+ * ones.
+ */
+Size
+PGSemaphoreShmemSize(int maxSemas)
+{
+#ifndef USE_NAMED_POSIX_SEMAPHORES
+	return mul_size(maxSemas, sizeof(PGSemaphoreData));
+#else
+	return 0;
+#endif
+}
+
+/*
+ * PGReserveSemaphores --- initialize semaphore support
+ *
+ * Called during postmaster start or shared memory reinitialization.  It
+ * must make it possible to serve up to maxSemas later PGSemaphoreCreate
+ * calls, and, since system resources get acquired, it registers an
+ * on_shmem_exit callback that releases them.
+ *
+ * In the Posix implementation semaphores are acquired on demand, so
+ * maxSemas only sizes the bookkeeping arrays: an array of PGSemaphoreData
+ * in shared memory for unnamed semaphores, or a postmaster-local array of
+ * sem_t pointers for named ones, used to release them when done.
+ * (Keeping the latter local minimizes how much postmaster shutdown depends
+ * on shared memory, which a failed backend might have clobbered.  We can't
+ * do much about the possibility of sem_destroy() crashing, but we don't
+ * have to expose the counters to other processes.)
+ */
+void
+PGReserveSemaphores(int maxSemas)
+{
+	struct stat statbuf;
+
+	/*
+	 * The data directory's inode number seeds the search for free semaphore
+	 * keys.  That makes collisions with other postmasters unlikely, while
+	 * making it likely that we detect and clean up semaphores left over
+	 * from a crashed postmaster in our own directory.
+	 */
+	if (stat(DataDir, &statbuf) < 0)
+		ereport(FATAL,
+				(errcode_for_file_access(),
+				 errmsg("could not stat data directory \"%s\": %m",
+						DataDir)));
+
+#ifndef USE_NAMED_POSIX_SEMAPHORES
+
+	/*
+	 * ShmemAllocUnlocked() is required because the spinlock protecting
+	 * ShmemAlloc() isn't ready yet.  (That ordering is needed when
+	 * spinlocks are emulated with semaphores.)
+	 */
+	sharedSemas = (PGSemaphore)
+		ShmemAllocUnlocked(PGSemaphoreShmemSize(maxSemas));
+#else
+	mySemPointers = (sem_t **) malloc(maxSemas * sizeof(sem_t *));
+	if (mySemPointers == NULL)
+		elog(PANIC, "out of memory");
+#endif
+
+	nextSemKey = statbuf.st_ino;
+	maxSems = maxSemas;
+	numSems = 0;
+
+	on_shmem_exit(ReleaseSemaphores, 0);
+}
+
+/*
+ * Release semaphores at shutdown or shmem reinitialization
+ *
+ * (called as an on_shmem_exit callback, hence funny argument list; neither
+ * status nor arg is used)
+ *
+ * Every semaphore created so far (numSems of them) is disposed of with
+ * PosixSemaphoreKill.  For named semaphores the postmaster-local pointer
+ * array is freed as well.
+ *
+ * The unnamed case is the plain #else of USE_NAMED_POSIX_SEMAPHORES, exactly
+ * like the code that creates the semaphores, so that creation and teardown
+ * can never be selected by different preprocessor symbols.
+ */
+static void
+ReleaseSemaphores(int status, Datum arg)
+{
+	int			i;
+
+#ifdef USE_NAMED_POSIX_SEMAPHORES
+	for (i = 0; i < numSems; i++)
+		PosixSemaphoreKill(mySemPointers[i]);
+	free(mySemPointers);
+#else
+	for (i = 0; i < numSems; i++)
+		PosixSemaphoreKill(PG_SEM_REF(sharedSemas + i));
+#endif
+}
+
+/*
+ * PGSemaphoreCreate
+ *
+ * Hand out the next PGSemaphore, created with initial count 1.
+ *
+ * Exceeding the number reserved via PGReserveSemaphores is a PANIC.
+ */
+PGSemaphore
+PGSemaphoreCreate(void)
+{
+	PGSemaphore sema;
+	sem_t	   *newsem;
+
+	/* The bookkeeping state is the postmaster's; backends can't do this */
+	Assert(!IsUnderPostmaster);
+
+	if (numSems >= maxSems)
+		elog(PANIC, "too many semaphores created");
+
+#ifndef USE_NAMED_POSIX_SEMAPHORES
+	/* use the next free slot of the shared array */
+	sema = &sharedSemas[numSems];
+	newsem = PG_SEM_REF(sema);
+	PosixSemaphoreCreate(newsem);
+#else
+	newsem = PosixSemaphoreCreate();
+	/* ReleaseSemaphores needs to find it again later */
+	mySemPointers[numSems] = newsem;
+	sema = (PGSemaphore) newsem;
+#endif
+
+	/* count it only once it really exists */
+	numSems++;
+
+	return sema;
+}
+
+/*
+ * PGSemaphoreReset
+ *
+ * Bring a previously-initialized PGSemaphore down to count 0.
+ *
+ * POSIX has no call for this, so the count is drained by calling
+ * sem_trywait() until it reports that nothing is left.
+ */
+void
+PGSemaphoreReset(PGSemaphore sema)
+{
+	for (;;)
+	{
+		if (sem_trywait(PG_SEM_REF(sema)) >= 0)
+			continue;			/* took one unit, keep draining */
+
+		if (errno == EAGAIN || errno == EDEADLK)
+			return;				/* got it down to 0 */
+
+		/* EINTR (can this happen?) just retries; the rest is fatal */
+		if (errno != EINTR)
+			elog(FATAL, "sem_trywait failed: %m");
+	}
+}
+
+/*
+ * PGSemaphoreLock
+ *
+ * Decrement the semaphore's count, blocking while that would take it
+ * below zero.  Waits interrupted by a signal (EINTR) are restarted; any
+ * other failure is FATAL.
+ */
+void
+PGSemaphoreLock(PGSemaphore sema)
+{
+	int			rc;
+
+	/* See notes in sysv_sema.c's implementation of PGSemaphoreLock. */
+	while ((rc = sem_wait(PG_SEM_REF(sema))) < 0 && errno == EINTR)
+		continue;
+
+	if (rc < 0)
+		elog(FATAL, "sem_wait failed: %m");
+}
+
+/*
+ * PGSemaphoreUnlock
+ *
+ * Increment the semaphore's count.
+ */
+void
+PGSemaphoreUnlock(PGSemaphore sema)
+{
+	int			rc;
+
+	/*
+	 * A result of -1 with errno == EINTR means a signal made us return
+	 * early, so the post is simply repeated.  It is not clear that this can
+	 * really happen, but coping costs nothing.
+	 */
+	while ((rc = sem_post(PG_SEM_REF(sema))) < 0 && errno == EINTR)
+		continue;
+
+	if (rc < 0)
+		elog(FATAL, "sem_post failed: %m");
+}
+
+/*
+ * PGSemaphoreTryLock
+ *
+ * Take the semaphore only if that is possible without blocking.
+ * Returns true if it was taken, false if it was not available.
+ */
+bool
+PGSemaphoreTryLock(PGSemaphore sema)
+{
+	int			rc;
+
+	/*
+	 * A result of -1 with errno == EINTR means a signal made us return
+	 * early, so the attempt is simply repeated.
+	 */
+	while ((rc = sem_trywait(PG_SEM_REF(sema))) < 0 && errno == EINTR)
+		continue;
+
+	if (rc >= 0)
+		return true;
+
+	if (errno == EAGAIN || errno == EDEADLK)
+		return false;			/* failed to lock it */
+
+	/* Otherwise we got trouble */
+	elog(FATAL, "sem_trywait failed: %m");
+
+	return true;
+}
diff --git a/src/backend/port/sysv_sema.c b/src/backend/port/sysv_sema.c
new file mode 100644
index 0000000..88c2862
--- /dev/null
+++ b/src/backend/port/sysv_sema.c
@@ -0,0 +1,517 @@
+/*-------------------------------------------------------------------------
+ *
+ * sysv_sema.c
+ * Implement PGSemaphores using SysV semaphore facilities
+ *
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/port/sysv_sema.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#ifdef HAVE_SYS_IPC_H
+#include <sys/ipc.h>
+#endif
+#ifdef HAVE_SYS_SEM_H
+#include <sys/sem.h>
+#endif
+
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/pg_sema.h"
+#include "storage/shmem.h"
+
+
+typedef struct PGSemaphoreData /* names one SysV semaphore as (set, index) */
+{
+ int semId; /* semaphore set identifier */
+ int semNum; /* semaphore number within set */
+} PGSemaphoreData;
+
+#ifndef HAVE_UNION_SEMUN
+union semun /* semctl() argument; declared here if the platform lacks it */
+{
+ int val; /* integer argument, e.g. the value for SETVAL */
+ struct semid_ds *buf; /* not used in the code shown; per semctl(2), for IPC_STAT/IPC_SET */
+ unsigned short *array; /* not used in the code shown; per semctl(2), for GETALL/SETALL */
+};
+#endif
+
+typedef key_t IpcSemaphoreKey; /* semaphore key passed to semget(2) */
+typedef int IpcSemaphoreId; /* semaphore ID returned by semget(2) */
+
+/*
+ * SEMAS_PER_SET is the number of useful semaphores in each semaphore set
+ * we allocate. It must be *less than* your kernel's SEMMSL (max semaphores
+ * per set) parameter, which is often around 25. (Less than, because we
+ * allocate one extra sema in each set for identification purposes.)
+ */
+#define SEMAS_PER_SET 16
+
+#define IPCProtection (0600) /* access/modify by user only */
+
+#define PGSemaMagic 537 /* must be less than SEMVMX */
+
+
+static PGSemaphore sharedSemas; /* array of PGSemaphoreData in shared memory */
+static int numSharedSemas; /* number of PGSemaphoreDatas used so far */
+static int maxSharedSemas; /* allocated size of PGSemaphoreData array */
+static IpcSemaphoreId *mySemaSets; /* IDs of sema sets acquired so far */
+static int numSemaSets; /* number of sema sets acquired so far */
+static int maxSemaSets; /* allocated size of mySemaSets array */
+static IpcSemaphoreKey nextSemaKey; /* next key to try using */
+static int nextSemaNumber; /* next free sem num in last sema set */
+
+
+static IpcSemaphoreId InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey,
+ int numSems);
+static void IpcSemaphoreInitialize(IpcSemaphoreId semId, int semNum,
+ int value);
+static void IpcSemaphoreKill(IpcSemaphoreId semId);
+static int IpcSemaphoreGetValue(IpcSemaphoreId semId, int semNum);
+static pid_t IpcSemaphoreGetLastPID(IpcSemaphoreId semId, int semNum);
+static IpcSemaphoreId IpcSemaphoreCreate(int numSems);
+static void ReleaseSemaphores(int status, Datum arg);
+
+
+/*
+ * InternalIpcSemaphoreCreate
+ *
+ * Try to create a brand-new semaphore set of numSems semaphores under the
+ * key semKey, via semget() with IPC_CREAT | IPC_EXCL.
+ *
+ * Returns the new set's identifier, or -1 if the failure looks like a
+ * collision with an already-existing set.  Any other failure suggests a
+ * nonrecoverable problem: an error is printed and we abort.
+ */
+static IpcSemaphoreId
+InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey, int numSems)
+{
+	int			semId;
+	int			saved_errno;
+	bool		collision;
+
+	semId = semget(semKey, numSems, IPC_CREAT | IPC_EXCL | IPCProtection);
+	if (semId >= 0)
+		return semId;
+
+	/* capture the reason before anything else can disturb errno */
+	saved_errno = errno;
+
+	/*
+	 * Collisions are reported quietly.  EEXIST is what IPC_EXCL should
+	 * produce, but a permission violation (EACCES) seems possible too, and
+	 * EIDRM might show up when an old set is slated for destruction but
+	 * not gone yet.
+	 */
+	collision = (saved_errno == EEXIST || saved_errno == EACCES);
+#ifdef EIDRM
+	if (saved_errno == EIDRM)
+		collision = true;
+#endif
+	if (collision)
+		return -1;
+
+	/* Anything else: complain and abort */
+	ereport(FATAL,
+			(errmsg("could not create semaphores: %m"),
+			 errdetail("Failed system call was semget(%lu, %d, 0%o).",
+					   (unsigned long) semKey, numSems,
+					   IPC_CREAT | IPC_EXCL | IPCProtection),
+			 (saved_errno == ENOSPC) ?
+			 errhint("This error does *not* mean that you have run out of disk space.  "
+					 "It occurs when either the system limit for the maximum number of "
+					 "semaphore sets (SEMMNI), or the system wide maximum number of "
+					 "semaphores (SEMMNS), would be exceeded.  You need to raise the "
+					 "respective kernel parameter.  Alternatively, reduce PostgreSQL's "
+					 "consumption of semaphores by reducing its max_connections parameter.\n"
+					 "The PostgreSQL documentation contains more information about "
+					 "configuring your system for PostgreSQL.") : 0));
+
+	return semId;
+}
+
+/*
+ * IpcSemaphoreInitialize
+ *
+ * Set semaphore semNum of set semId to the given value with
+ * semctl(SETVAL).  Failure is FATAL; for ERANGE a hint about SEMVMX is
+ * attached.
+ */
+static void
+IpcSemaphoreInitialize(IpcSemaphoreId semId, int semNum, int value)
+{
+	union semun arg;
+	int			saved_errno;
+
+	arg.val = value;
+	if (semctl(semId, semNum, SETVAL, arg) >= 0)
+		return;
+
+	saved_errno = errno;
+
+	ereport(FATAL,
+			(errmsg_internal("semctl(%d, %d, SETVAL, %d) failed: %m",
+							 semId, semNum, value),
+			 (saved_errno == ERANGE) ?
+			 errhint("You possibly need to raise your kernel's SEMVMX value to be at least "
+					 "%d.  Look into the PostgreSQL documentation for details.",
+					 value) : 0));
+}
+
+/*
+ * IpcSemaphoreKill(semId) - remove a whole semaphore set
+ *
+ * A failing semctl(IPC_RMID) is only logged.
+ */
+static void
+IpcSemaphoreKill(IpcSemaphoreId semId)
+{
+	union semun unused;
+	int			rc;
+
+	unused.val = 0;				/* not looked at, but keeps compilers quiet */
+
+	rc = semctl(semId, 0, IPC_RMID, unused);
+	if (rc < 0)
+		elog(LOG, "semctl(%d, 0, IPC_RMID, ...) failed: %m", semId);
+}
+
+/*
+ * IpcSemaphoreGetValue
+ *
+ * Return the current value (semval) of semaphore semNum in set semId, i.e.
+ * whatever semctl(GETVAL) returns.
+ */
+static int
+IpcSemaphoreGetValue(IpcSemaphoreId semId, int semNum)
+{
+	union semun unused;			/* passed for Solaris' sake */
+	int			semval;
+
+	unused.val = 0;
+	semval = semctl(semId, semNum, GETVAL, unused);
+
+	return semval;
+}
+
+/*
+ * IpcSemaphoreGetLastPID
+ *
+ * Return the PID of the last process that did semop() on semaphore semNum
+ * of set semId, i.e. whatever semctl(GETPID) returns.
+ */
+static pid_t
+IpcSemaphoreGetLastPID(IpcSemaphoreId semId, int semNum)
+{
+	union semun unused;			/* passed for Solaris' sake */
+	pid_t		lastPid;
+
+	unused.val = 0;
+	lastPid = semctl(semId, semNum, GETPID, unused);
+
+	return lastPid;
+}
+
+
+/*
+ * Create a semaphore set with the given number of useful semaphores
+ * (an additional sema is actually allocated to serve as identifier).
+ * Dead Postgres sema sets are recycled if found, but we do not fail
+ * upon collision with non-Postgres sema sets.
+ *
+ * The idea here is to detect and re-use keys that may have been assigned
+ * by a crashed postmaster or backend.
+ *
+ * numSems is the number of useful semaphores; the set is created with
+ * numSems + 1 members and the extra one (index numSems) carries the
+ * PGSemaMagic marker. Returns the identifier of the set that was created.
+ * Keys are taken from the file-level nextSemaKey, which is advanced before
+ * every attempt, so the key last used is never tried again.
+ */
+static IpcSemaphoreId
+IpcSemaphoreCreate(int numSems)
+{
+ IpcSemaphoreId semId;
+ union semun semun;
+ PGSemaphoreData mysema;
+
+ /* Loop till we find a free IPC key */
+ for (nextSemaKey++;; nextSemaKey++)
+ {
+ pid_t creatorPID;
+
+ /* Try to create new semaphore set */
+ semId = InternalIpcSemaphoreCreate(nextSemaKey, numSems + 1);
+ if (semId >= 0)
+ break; /* successful create */
+
+ /* See if it looks to be leftover from a dead Postgres process */
+ semId = semget(nextSemaKey, numSems + 1, 0);
+ if (semId < 0)
+ continue; /* failed: must be some other app's */
+ if (IpcSemaphoreGetValue(semId, numSems) != PGSemaMagic)
+ continue; /* sema belongs to a non-Postgres app */
+
+ /*
+ * If the creator PID is my own PID or does not belong to any extant
+ * process, it's safe to zap it.
+ *
+ * The "creator" is taken to be the last process that did semop() on
+ * the marker semaphore; kill(pid, 0) is used purely as an existence
+ * probe, and only ESRCH is accepted as proof that the process is gone.
+ */
+ creatorPID = IpcSemaphoreGetLastPID(semId, numSems);
+ if (creatorPID <= 0)
+ continue; /* oops, GETPID failed */
+ if (creatorPID != getpid())
+ {
+ if (kill(creatorPID, 0) == 0 || errno != ESRCH)
+ continue; /* sema belongs to a live process */
+ }
+
+ /*
+ * The sema set appears to be from a dead Postgres process, or from a
+ * previous cycle of life in this same process. Zap it, if possible.
+ * This probably shouldn't fail, but if it does, assume the sema set
+ * belongs to someone else after all, and continue quietly.
+ */
+ semun.val = 0; /* unused, but keep compiler quiet */
+ if (semctl(semId, 0, IPC_RMID, semun) < 0)
+ continue;
+
+ /*
+ * Now try again to create the sema set.
+ */
+ semId = InternalIpcSemaphoreCreate(nextSemaKey, numSems + 1);
+ if (semId >= 0)
+ break; /* successful create */
+
+ /*
+ * Can only get here if some other process managed to create the same
+ * sema key before we did. Let him have that one, loop around to try
+ * next key.
+ */
+ }
+
+ /*
+ * OK, we created a new sema set. Mark it as created by this process. We
+ * do this by setting the spare semaphore to PGSemaMagic-1 and then
+ * incrementing it with semop(). That leaves it with value PGSemaMagic
+ * and sempid referencing this process.
+ */
+ IpcSemaphoreInitialize(semId, numSems, PGSemaMagic - 1);
+ mysema.semId = semId;
+ mysema.semNum = numSems;
+ PGSemaphoreUnlock(&mysema); /* the increment described above */
+
+ return semId;
+}
+
+
+/*
+ * PGSemaphoreShmemSize
+ *
+ * Amount of shared memory needed for maxSemas semaphores: one
+ * PGSemaphoreData apiece, computed with mul_size().
+ */
+Size
+PGSemaphoreShmemSize(int maxSemas)
+{
+	Size		perSema = sizeof(PGSemaphoreData);
+
+	return mul_size(maxSemas, perSema);
+}
+
+/*
+ * PGReserveSemaphores --- initialize semaphore support
+ *
+ * Called during postmaster start or shared memory reinitialization.  It
+ * must make it possible to serve up to maxSemas later PGSemaphoreCreate
+ * calls, and, since system resources get acquired, it registers an
+ * on_shmem_exit callback that releases them.
+ *
+ * In the SysV implementation semaphore sets are acquired on demand, so
+ * maxSemas only sizes two arrays: the PGSemaphoreData array in shared
+ * memory, and a postmaster-local array holding one identifier per SysV
+ * semaphore set, used to remove the sets when done.  (Keeping the latter
+ * local means postmaster shutdown does not rely on the contents of shared
+ * memory, which a failed backend might have clobbered.)
+ */
+void
+PGReserveSemaphores(int maxSemas)
+{
+	struct stat statbuf;
+
+	/*
+	 * The data directory's inode number seeds the search for free semaphore
+	 * keys.  That makes collisions with other postmasters unlikely, while
+	 * making it likely that we detect and clean up semaphores left over
+	 * from a crashed postmaster in our own directory.
+	 */
+	if (stat(DataDir, &statbuf) < 0)
+		ereport(FATAL,
+				(errcode_for_file_access(),
+				 errmsg("could not stat data directory \"%s\": %m",
+						DataDir)));
+
+	/*
+	 * ShmemAllocUnlocked() is required because the spinlock protecting
+	 * ShmemAlloc() isn't ready yet.  (That ordering is needed when
+	 * spinlocks are emulated with semaphores.)
+	 */
+	sharedSemas = (PGSemaphore)
+		ShmemAllocUnlocked(PGSemaphoreShmemSize(maxSemas));
+	maxSharedSemas = maxSemas;
+	numSharedSemas = 0;
+
+	/* one slot per set, rounding the number of sets up */
+	maxSemaSets = (maxSemas + SEMAS_PER_SET - 1) / SEMAS_PER_SET;
+	mySemaSets = (IpcSemaphoreId *)
+		malloc(maxSemaSets * sizeof(IpcSemaphoreId));
+	if (mySemaSets == NULL)
+		elog(PANIC, "out of memory");
+
+	numSemaSets = 0;
+	nextSemaNumber = SEMAS_PER_SET; /* force sema set alloc on 1st call */
+	nextSemaKey = statbuf.st_ino;
+
+	on_shmem_exit(ReleaseSemaphores, 0);
+}
+
+/*
+ * ReleaseSemaphores --- remove every semaphore set this process created
+ *
+ * Runs at shutdown or shmem reinitialization as an on_shmem_exit callback
+ * (registered by PGReserveSemaphores), which dictates the (status, arg)
+ * signature; neither argument is used.
+ *
+ * Each set recorded in mySemaSets[] is handed to IpcSemaphoreKill(), then
+ * the array itself is freed.  The bookkeeping is reset afterwards so the
+ * module is never left with a dangling pointer or a stale set count: a
+ * repeated invocation is a harmless no-op rather than a double free.
+ */
+static void
+ReleaseSemaphores(int status, Datum arg)
+{
+	int			i;
+
+	for (i = 0; i < numSemaSets; i++)
+		IpcSemaphoreKill(mySemaSets[i]);
+	numSemaSets = 0;
+
+	free(mySemaSets);
+	mySemaSets = NULL;			/* don't keep a pointer to freed memory */
+}
+
+/*
+ * PGSemaphoreCreate
+ *
+ * Hand out the next unused PGSemaphoreData struct from the shared array,
+ * bind it to the next free semaphore of the newest SysV set, and initialize
+ * it to count 1.  A new set is created whenever the current one is used up.
+ * Exceeding either limit established by PGReserveSemaphores is a PANIC.
+ */
+PGSemaphore
+PGSemaphoreCreate(void)
+{
+	PGSemaphore slot;
+
+	/* The static bookkeeping belongs to the postmaster, not to backends */
+	Assert(!IsUnderPostmaster);
+
+	/* Current set exhausted (or none allocated yet)?  Start a fresh one. */
+	if (nextSemaNumber >= SEMAS_PER_SET)
+	{
+		if (numSemaSets >= maxSemaSets)
+			elog(PANIC, "too many semaphores created");
+		mySemaSets[numSemaSets] = IpcSemaphoreCreate(SEMAS_PER_SET);
+		numSemaSets += 1;
+		nextSemaNumber = 0;
+	}
+
+	/* Claim the next struct in the shared array */
+	if (numSharedSemas >= maxSharedSemas)
+		elog(PANIC, "too many semaphores created");
+	slot = &sharedSemas[numSharedSemas];
+	numSharedSemas += 1;
+
+	/* Bind it to the next semaphore number within the newest set */
+	slot->semNum = nextSemaNumber;
+	nextSemaNumber += 1;
+	slot->semId = mySemaSets[numSemaSets - 1];
+
+	/* A new semaphore starts out with count 1 */
+	IpcSemaphoreInitialize(slot->semId, slot->semNum, 1);
+
+	return slot;
+}
+
+/*
+ * PGSemaphoreReset
+ *
+ * Force the count of an already-initialized PGSemaphore back to zero.
+ */
+void
+PGSemaphoreReset(PGSemaphore sema)
+{
+	const int	resetCount = 0;
+
+	IpcSemaphoreInitialize(sema->semId, sema->semNum, resetCount);
+}
+
+/*
+ * PGSemaphoreLock
+ *
+ * Decrement the semaphore's count, waiting if that would make it negative.
+ * Any semop() failure other than EINTR is reported with elog(FATAL).
+ */
+void
+PGSemaphoreLock(PGSemaphore sema)
+{
+	struct sembuf op;
+
+	op.sem_num = sema->semNum;
+	op.sem_op = -1;				/* take one unit */
+	op.sem_flg = 0;
+
+	/*
+	 * semop() failing with EINTR only means a signal cut the wait short, so
+	 * simply go back to waiting.
+	 *
+	 * Interrupts are deliberately not serviced here: doing so used to require
+	 * servicing them directly from signal handlers, which is hard to do
+	 * safely and portably.
+	 */
+	for (;;)
+	{
+		if (semop(sema->semId, &op, 1) >= 0)
+			return;				/* got it */
+		if (errno != EINTR)
+			break;				/* genuine failure */
+	}
+
+	elog(FATAL, "semop(id=%d) failed: %m", sema->semId);
+}
+
+/*
+ * PGSemaphoreUnlock
+ *
+ * Increment the semaphore's count.  Any semop() failure other than EINTR is
+ * reported with elog(FATAL).
+ */
+void
+PGSemaphoreUnlock(PGSemaphore sema)
+{
+	struct sembuf op;
+	int			rc;
+
+	op.sem_num = sema->semNum;
+	op.sem_op = 1;				/* give back one unit */
+	op.sem_flg = 0;
+
+	/*
+	 * Retry for as long as semop() reports EINTR, i.e. a signal arrived
+	 * before the operation completed.  It is unclear whether that can really
+	 * happen for an increment, but coping with it costs nothing.
+	 */
+	while ((rc = semop(sema->semId, &op, 1)) < 0 && errno == EINTR)
+		continue;
+
+	if (rc < 0)
+		elog(FATAL, "semop(id=%d) failed: %m", sema->semId);
+}
+
+/*
+ * PGSemaphoreTryLock
+ *
+ * Non-blocking variant of PGSemaphoreLock: decrement the count only if that
+ * is possible right away.  Returns true if the semaphore was locked, false
+ * if locking it would have required waiting.  Any other semop() failure is
+ * reported with elog(FATAL).
+ */
+bool
+PGSemaphoreTryLock(PGSemaphore sema)
+{
+	struct sembuf op;
+	int			rc;
+
+	op.sem_num = sema->semNum;
+	op.sem_op = -1;				/* take one unit ... */
+	op.sem_flg = IPC_NOWAIT;	/* ... but never wait for it */
+
+	/* EINTR means a signal interrupted the call; just try again */
+	do
+	{
+		rc = semop(sema->semId, &op, 1);
+	} while (rc < 0 && errno == EINTR);
+
+	if (rc >= 0)
+		return true;
+
+	/* "Would block" shows up as EAGAIN or EWOULDBLOCK, depending on platform */
+#ifdef EAGAIN
+	if (errno == EAGAIN)
+		return false;			/* semaphore is busy */
+#endif
+#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN))
+	if (errno == EWOULDBLOCK)
+		return false;			/* semaphore is busy */
+#endif
+
+	/* Anything else is real trouble */
+	elog(FATAL, "semop(id=%d) failed: %m", sema->semId);
+
+	return true;
+}
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
new file mode 100644
index 0000000..198a698
--- /dev/null
+++ b/src/backend/port/sysv_shmem.c
@@ -0,0 +1,902 @@
+/*-------------------------------------------------------------------------
+ *
+ * sysv_shmem.c
+ * Implement shared memory using SysV facilities
+ *
+ * These routines used to be a fairly thin layer on top of SysV shared
+ * memory functionality. With the addition of anonymous-shmem logic,
+ * they're a bit fatter now. We still require a SysV shmem block to
+ * exist, though, because mmap'd shmem provides no way to find out how
+ * many processes are attached, which we need for interlocking purposes.
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/port/sysv_shmem.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+#include <sys/file.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#ifdef HAVE_SYS_IPC_H
+#include <sys/ipc.h>
+#endif
+#ifdef HAVE_SYS_SHM_H
+#include <sys/shm.h>
+#endif
+
+#include "miscadmin.h"
+#include "portability/mem.h"
+#include "storage/dsm.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/pg_shmem.h"
+#include "utils/guc.h"
+#include "utils/pidfile.h"
+
+
+/*
+ * As of PostgreSQL 9.3, we normally allocate only a very small amount of
+ * System V shared memory, and only for the purposes of providing an
+ * interlock to protect the data directory. The real shared memory block
+ * is allocated using mmap(). This works around the problem that many
+ * systems have very low limits on the amount of System V shared memory
+ * that can be allocated. Even a limit of a few megabytes will be enough
+ * to run many copies of PostgreSQL without needing to adjust system settings.
+ *
+ * We assume that no one will attempt to run PostgreSQL 9.3 or later on
+ * systems that are ancient enough that anonymous shared memory is not
+ * supported, such as pre-2.4 versions of Linux. If that turns out to be
+ * false, we might need to add compile and/or run-time tests here and do this
+ * only if the running kernel supports it.
+ *
+ * However, we must always disable this logic in the EXEC_BACKEND case, and
+ * fall back to the old method of allocating the entire segment using System V
+ * shared memory, because there's no way to attach an anonymous mmap'd segment
+ * to a process after exec(). Since EXEC_BACKEND is intended only for
+ * developer use, this shouldn't be a big problem. Because of this, we do
+ * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below.
+ *
+ * As of PostgreSQL 12, we regained the ability to use a large System V shared
+ * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set
+ * to sysv (though this is not the default).
+ */
+
+
+typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */
+typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */
+
+/*
+ * How does a given IpcMemoryId relate to this PostgreSQL process?
+ *
+ * PGSharedMemoryAttach() produces this classification for a segment ID.
+ *
+ * One could recycle unattached segments of different data directories if we
+ * distinguished that case from other SHMSTATE_FOREIGN cases. Doing so would
+ * cause us to visit less of the key space, making us less likely to detect a
+ * SHMSTATE_ATTACHED key. It would also complicate the concurrency analysis,
+ * in that postmasters of different data directories could simultaneously
+ * attempt to recycle a given key. We'll waste keys longer in some cases, but
+ * avoiding the problems of the alternative justifies that loss.
+ */
+typedef enum
+{
+ SHMSTATE_ANALYSIS_FAILURE, /* unexpected failure to analyze the ID */
+ SHMSTATE_ATTACHED, /* pertinent to DataDir, has attached PIDs */
+ SHMSTATE_ENOENT, /* no segment of that ID */
+ SHMSTATE_FOREIGN, /* exists, but not pertinent to DataDir */
+ SHMSTATE_UNATTACHED /* pertinent to DataDir, no attached PIDs */
+} IpcMemoryState;
+
+
+unsigned long UsedShmemSegID = 0; /* key of the SysV segment in use; set by PGSharedMemoryCreate */
+void *UsedShmemSegAddr = NULL; /* address at which that segment is attached, or NULL */
+
+static Size AnonymousShmemSize; /* size in bytes of the anonymous mapping */
+static void *AnonymousShmem = NULL; /* anonymous mmap'd block, or NULL if there is none */
+
+static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
+static void IpcMemoryDetach(int status, Datum shmaddr);
+static void IpcMemoryDelete(int status, Datum shmId);
+static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId,
+ void *attachAt,
+ PGShmemHeader **addr);
+
+
+/*
+ * InternalIpcMemoryCreate(memKey, size)
+ *
+ * Attempt to create a new shared memory segment with the specified key.
+ * Will fail (return NULL) if such a segment already exists. If successful,
+ * attach the segment to the current process and return its attached address.
+ * On success, callbacks are registered with on_shmem_exit to detach and
+ * delete the segment when on_shmem_exit is called.
+ *
+ * If we fail with a failure code other than collision-with-existing-segment,
+ * print out an error and abort. Other types of errors are not recoverable.
+ *
+ * memKey: System V IPC key to pass to shmget().
+ * size: requested segment size in bytes.
+ *
+ * Returns the address at which the new segment is attached, or NULL on a
+ * key collision. As a side effect of success, the key and the segment ID
+ * are recorded in the data directory lock file.
+ */
+static void *
+InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
+{
+ IpcMemoryId shmid;
+ void *requestedAddress = NULL;
+ void *memAddress;
+
+ /*
+ * Normally we just pass requestedAddress = NULL to shmat(), allowing the
+ * system to choose where the segment gets mapped. But in an EXEC_BACKEND
+ * build, it's possible for whatever is chosen in the postmaster to not
+ * work for backends, due to variations in address space layout. As a
+ * rather klugy workaround, allow the user to specify the address to use
+ * via setting the environment variable PG_SHMEM_ADDR. (If this were of
+ * interest for anything except debugging, we'd probably create a cleaner
+ * and better-documented way to set it, such as a GUC.)
+ */
+#ifdef EXEC_BACKEND
+ {
+ char *pg_shmem_addr = getenv("PG_SHMEM_ADDR");
+
+ if (pg_shmem_addr)
+ requestedAddress = (void *) strtoul(pg_shmem_addr, NULL, 0);
+ }
+#endif
+
+ shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
+
+ if (shmid < 0)
+ {
+ int shmget_errno = errno; /* saved: the calls below may overwrite errno */
+
+ /*
+ * Fail quietly if error indicates a collision with existing segment.
+ * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
+ * we could get a permission violation instead? Also, EIDRM might
+ * occur if an old seg is slated for destruction but not gone yet.
+ */
+ if (shmget_errno == EEXIST || shmget_errno == EACCES
+#ifdef EIDRM
+ || shmget_errno == EIDRM
+#endif
+ )
+ return NULL;
+
+ /*
+ * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if
+ * there is an existing segment but it's smaller than "size" (this is
+ * a result of poorly-thought-out ordering of error tests). To
+ * distinguish between collision and invalid size in such cases, we
+ * make a second try with size = 0. These kernels do not test size
+ * against SHMMIN in the preexisting-segment case, so we will not get
+ * EINVAL a second time if there is such a segment.
+ */
+ if (shmget_errno == EINVAL)
+ {
+ shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection);
+
+ if (shmid < 0)
+ {
+ /* As above, fail quietly if we verify a collision */
+ if (errno == EEXIST || errno == EACCES
+#ifdef EIDRM
+ || errno == EIDRM
+#endif
+ )
+ return NULL;
+ /* Otherwise, fall through to report the original error */
+ }
+ else
+ {
+ /*
+ * On most platforms we cannot get here because SHMMIN is
+ * greater than zero. However, if we do succeed in creating a
+ * zero-size segment, free it and then fall through to report
+ * the original error.
+ */
+ if (shmctl(shmid, IPC_RMID, NULL) < 0)
+ elog(LOG, "shmctl(%d, %d, 0) failed: %m",
+ (int) shmid, IPC_RMID);
+ }
+ }
+
+ /*
+ * Else complain and abort.
+ *
+ * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX
+ * is violated. SHMALL violation might be reported as either ENOMEM
+ * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which
+ * it should be. SHMMNI violation is ENOSPC, per spec. Just plain
+ * not-enough-RAM is ENOMEM.
+ */
+ errno = shmget_errno; /* make %m describe the first shmget() failure */
+ ereport(FATAL,
+ (errmsg("could not create shared memory segment: %m"),
+ errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o).",
+ (unsigned long) memKey, size,
+ IPC_CREAT | IPC_EXCL | IPCProtection),
+ (shmget_errno == EINVAL) ?
+ errhint("This error usually means that PostgreSQL's request for a shared memory "
+ "segment exceeded your kernel's SHMMAX parameter, or possibly that "
+ "it is less than "
+ "your kernel's SHMMIN parameter.\n"
+ "The PostgreSQL documentation contains more information about shared "
+ "memory configuration.") : 0,
+ (shmget_errno == ENOMEM) ?
+ errhint("This error usually means that PostgreSQL's request for a shared "
+ "memory segment exceeded your kernel's SHMALL parameter. You might need "
+ "to reconfigure the kernel with larger SHMALL.\n"
+ "The PostgreSQL documentation contains more information about shared "
+ "memory configuration.") : 0,
+ (shmget_errno == ENOSPC) ?
+ errhint("This error does *not* mean that you have run out of disk space. "
+ "It occurs either if all available shared memory IDs have been taken, "
+ "in which case you need to raise the SHMMNI parameter in your kernel, "
+ "or because the system's overall limit for shared memory has been "
+ "reached.\n"
+ "The PostgreSQL documentation contains more information about shared "
+ "memory configuration.") : 0));
+ }
+
+ /* Register on-exit routine to delete the new segment */
+ on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid));
+
+ /* OK, should be able to attach to the segment */
+ memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS);
+
+ if (memAddress == (void *) -1)
+ elog(FATAL, "shmat(id=%d, addr=%p, flags=0x%x) failed: %m",
+ shmid, requestedAddress, PG_SHMAT_FLAGS);
+
+ /* Register on-exit routine to detach new segment before deleting */
+ on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress));
+
+ /*
+ * Store shmem key and ID in data directory lockfile. Format to try to
+ * keep it the same length always (trailing junk in the lockfile won't
+ * hurt, but might confuse humans).
+ */
+ {
+ char line[64];
+
+ sprintf(line, "%9lu %9lu",
+ (unsigned long) memKey, (unsigned long) shmid);
+ AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line);
+ }
+
+ return memAddress;
+}
+
+/*
+ * IpcMemoryDetach(status, shmaddr)
+ *
+ * Detach the System V shared memory segment attached at shmaddr from this
+ * process' address space.  A failure is only logged.
+ *
+ * (called as an on_shmem_exit callback, hence funny argument list)
+ */
+static void
+IpcMemoryDetach(int status, Datum shmaddr)
+{
+	void	   *segaddr = DatumGetPointer(shmaddr);
+
+	if (shmdt(segaddr) >= 0)
+		return;
+
+	elog(LOG, "shmdt(%p) failed: %m", segaddr);
+}
+
+/*
+ * IpcMemoryDelete(status, shmId)
+ *
+ * Ask the kernel to remove (IPC_RMID) the System V shared memory segment
+ * identified by shmId.  A failure is only logged.
+ *
+ * (called as an on_shmem_exit callback, hence funny argument list)
+ */
+static void
+IpcMemoryDelete(int status, Datum shmId)
+{
+	int			segid = DatumGetInt32(shmId);
+
+	if (shmctl(segid, IPC_RMID, NULL) >= 0)
+		return;
+
+	elog(LOG, "shmctl(%d, %d, 0) failed: %m",
+		 segid, IPC_RMID);
+}
+
+/*
+ * PGSharedMemoryIsInUse
+ *
+ * Does the shmem segment with ID id2 still exist, belong to our DataDir, and
+ * have processes attached?  (id1 is not examined.)
+ *
+ * This exists to catch the situation where an earlier postmaster crashed
+ * but some of its child backends are still running.  Only segments tied to
+ * the intended DataDir matter for that, which is important because
+ * accidental matches of shmem segment IDs are reasonably common.
+ *
+ * The answer is conservative: if the segment cannot be analyzed, it is
+ * reported as in use.
+ */
+bool
+PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
+{
+	PGShmemHeader *seghdr;
+	IpcMemoryState verdict;
+
+	verdict = PGSharedMemoryAttach((IpcMemoryId) id2, NULL, &seghdr);
+
+	/* Only the classification is wanted; drop the probe attachment again */
+	if (seghdr != NULL)
+	{
+		if (shmdt(seghdr) < 0)
+			elog(LOG, "shmdt(%p) failed: %m", seghdr);
+	}
+
+	/* These three states establish that nobody is using the segment */
+	if (verdict == SHMSTATE_ENOENT ||
+		verdict == SHMSTATE_FOREIGN ||
+		verdict == SHMSTATE_UNATTACHED)
+		return false;
+
+	/* SHMSTATE_ATTACHED, SHMSTATE_ANALYSIS_FAILURE, or anything unexpected */
+	return true;
+}
+
+/*
+ * Test for a segment with id shmId; see comment at IpcMemoryState.
+ *
+ * If the segment exists, we'll attempt to attach to it, using attachAt
+ * if that's not NULL (but it's best to pass NULL if possible).
+ *
+ * *addr is set to the segment memory address if we attached to it, else NULL.
+ *
+ * Returns the IpcMemoryState classification of the segment. Whenever *addr
+ * comes back non-NULL (which includes the SHMSTATE_FOREIGN result for a
+ * header that does not match DataDir), the segment is still attached, and
+ * detaching it with shmdt() is left to the caller.
+ */
+static IpcMemoryState
+PGSharedMemoryAttach(IpcMemoryId shmId,
+ void *attachAt,
+ PGShmemHeader **addr)
+{
+ struct shmid_ds shmStat;
+ struct stat statbuf;
+ PGShmemHeader *hdr;
+
+ *addr = NULL; /* result for every path that returns before attaching */
+
+ /*
+ * First, try to stat the shm segment ID, to see if it exists at all.
+ */
+ if (shmctl(shmId, IPC_STAT, &shmStat) < 0)
+ {
+ /*
+ * EINVAL actually has multiple possible causes documented in the
+ * shmctl man page, but we assume it must mean the segment no longer
+ * exists.
+ */
+ if (errno == EINVAL)
+ return SHMSTATE_ENOENT;
+
+ /*
+ * EACCES implies we have no read permission, which means it is not a
+ * Postgres shmem segment (or at least, not one that is relevant to
+ * our data directory).
+ */
+ if (errno == EACCES)
+ return SHMSTATE_FOREIGN;
+
+ /*
+ * Some Linux kernel versions (in fact, all of them as of July 2007)
+ * sometimes return EIDRM when EINVAL is correct. The Linux kernel
+ * actually does not have any internal state that would justify
+ * returning EIDRM, so we can get away with assuming that EIDRM is
+ * equivalent to EINVAL on that platform.
+ */
+#ifdef HAVE_LINUX_EIDRM_BUG
+ if (errno == EIDRM)
+ return SHMSTATE_ENOENT;
+#endif
+
+ /*
+ * Otherwise, we had better assume that the segment is in use. The
+ * only likely case is (non-Linux, assumed spec-compliant) EIDRM,
+ * which implies that the segment has been IPC_RMID'd but there are
+ * still processes attached to it.
+ */
+ return SHMSTATE_ANALYSIS_FAILURE;
+ }
+
+ /*
+ * Try to attach to the segment and see if it matches our data directory.
+ * This avoids any risk of duplicate-shmem-key conflicts on machines that
+ * are running several postmasters under the same userid.
+ *
+ * (When we're called from PGSharedMemoryCreate, this stat call is
+ * duplicative; but since this isn't a high-traffic case it's not worth
+ * trying to optimize.)
+ */
+ if (stat(DataDir, &statbuf) < 0)
+ return SHMSTATE_ANALYSIS_FAILURE; /* can't stat; be conservative */
+
+ hdr = (PGShmemHeader *) shmat(shmId, attachAt, PG_SHMAT_FLAGS);
+ if (hdr == (PGShmemHeader *) -1)
+ {
+ /*
+ * Attachment failed. The cases we're interested in are the same as
+ * for the shmctl() call above. In particular, note that the owning
+ * postmaster could have terminated and removed the segment between
+ * shmctl() and shmat().
+ *
+ * If attachAt isn't NULL, it's possible that EINVAL reflects a
+ * problem with that address not a vanished segment, so it's best to
+ * pass NULL when probing for conflicting segments.
+ */
+ if (errno == EINVAL)
+ return SHMSTATE_ENOENT; /* segment disappeared */
+ if (errno == EACCES)
+ return SHMSTATE_FOREIGN; /* must be non-Postgres */
+#ifdef HAVE_LINUX_EIDRM_BUG
+ if (errno == EIDRM)
+ return SHMSTATE_ENOENT; /* segment disappeared */
+#endif
+ /* Otherwise, be conservative. */
+ return SHMSTATE_ANALYSIS_FAILURE;
+ }
+ *addr = hdr; /* from here on the caller has an attachment to release */
+
+ if (hdr->magic != PGShmemMagic ||
+ hdr->device != statbuf.st_dev ||
+ hdr->inode != statbuf.st_ino)
+ {
+ /*
+ * It's either not a Postgres segment, or not one for my data
+ * directory.
+ */
+ return SHMSTATE_FOREIGN;
+ }
+
+ /*
+ * It does match our data directory, so now test whether any processes are
+ * still attached to it. (We are, now, but the shm_nattch result is from
+ * before we attached to it.)
+ */
+ return shmStat.shm_nattch == 0 ? SHMSTATE_UNATTACHED : SHMSTATE_ATTACHED;
+}
+
+#ifdef MAP_HUGETLB
+
+/*
+ * GetHugePageSize --- determine the huge page size and mmap flags to use
+ *
+ * Outputs:
+ *	*hugepagesize	the (real or assumed) huge page size in bytes
+ *	*mmap_flags	the huge-page-related flags to pass to mmap()
+ *
+ * Why the caller needs the size: some Linux kernel versions have a bug that
+ * makes mmap() fail for requests that are not a multiple of the hugepage
+ * size, while versions without that bug silently round the request up ---
+ * after which munmap() fails if given the unrounded size.  Requests must
+ * therefore be rounded up to a multiple of the actual hugepage size.
+ *
+ * Rounding up ourselves has the further benefit that the extra memory is
+ * usable rather than wasted.  At present the extra is merely added to the
+ * available space recorded in the shmem header, where it can serve purposes
+ * such as additional locktable entries.  For very large hugepage sizes, more
+ * invasive strategies (such as enlarging shared_buffers) may someday be
+ * worth considering.
+ *
+ * *mmap_flags is currently always just MAP_HUGETLB.  On systems supporting
+ * it, additional bits selecting a particular non-default huge page size
+ * might be ORed in someday.
+ */
+static void
+GetHugePageSize(Size *hugepagesize, int *mmap_flags)
+{
+	/*
+	 * Fallback for when the system's default huge page size cannot be
+	 * determined: assume 2MB.  That is fine if the real size is smaller.  If
+	 * it is larger, unaligned requests might make mmap() or munmap() fail;
+	 * but at this writing, no non-Linux system has been reported to be picky
+	 * about that.
+	 */
+	*mmap_flags = MAP_HUGETLB;
+	*hugepagesize = 2 * 1024 * 1024;
+
+	/*
+	 * System-dependent lookup of the default huge page size.
+	 *
+	 * Linux: scan /proc/meminfo for a line of the form "Hugepagesize: nnnn
+	 * kB".  Every failure is ignored, leaving the fallback above in place.
+	 */
+#ifdef __linux__
+	{
+		FILE	   *meminfo = AllocateFile("/proc/meminfo", "r");
+		char		line[128];
+
+		if (meminfo == NULL)
+			return;
+
+		while (fgets(line, sizeof(line), meminfo) != NULL)
+		{
+			unsigned int amount;
+			char		unit;
+
+			if (sscanf(line, "Hugepagesize: %u %c", &amount, &unit) != 2)
+				continue;		/* some other line */
+
+			/* Only kB is understood; other units could be added if needed */
+			if (unit != 'k')
+				continue;
+
+			*hugepagesize = amount * (Size) 1024;
+			break;
+		}
+
+		FreeFile(meminfo);
+	}
+#endif							/* __linux__ */
+}
+
+#endif /* MAP_HUGETLB */
+
+/*
+ * Creates an anonymous mmap()ed shared memory segment.
+ *
+ * Pass the requested size in *size. This function will modify *size to the
+ * actual size of the allocation, if it ends up allocating a segment that is
+ * larger than requested.
+ *
+ * When MAP_HUGETLB is available and huge_pages is HUGE_PAGES_ON or
+ * HUGE_PAGES_TRY, a huge-page mapping is attempted first, with the size
+ * rounded up to a multiple of the huge page size. Unless huge_pages is
+ * HUGE_PAGES_ON, a failed (or skipped) attempt is followed by an ordinary
+ * mapping of the original size.
+ *
+ * Returns the address of the mapping. If no mapping can be established,
+ * the failure is reported with ereport(FATAL).
+ */
+static void *
+CreateAnonymousSegment(Size *size)
+{
+ Size allocsize = *size;
+ void *ptr = MAP_FAILED;
+ int mmap_errno = 0;
+
+#ifndef MAP_HUGETLB
+ /* PGSharedMemoryCreate should have dealt with this case */
+ Assert(huge_pages != HUGE_PAGES_ON);
+#else
+ if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
+ {
+ /*
+ * Round up the request size to a suitable large value.
+ */
+ Size hugepagesize;
+ int mmap_flags;
+
+ GetHugePageSize(&hugepagesize, &mmap_flags);
+
+ if (allocsize % hugepagesize != 0)
+ allocsize += hugepagesize - (allocsize % hugepagesize);
+
+ ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
+ PG_MMAP_FLAGS | mmap_flags, -1, 0);
+ mmap_errno = errno; /* kept for the final error report below */
+ if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
+ elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
+ allocsize);
+ }
+#endif
+
+ if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON)
+ {
+ /*
+ * Use the original size, not the rounded-up value, when falling back
+ * to non-huge pages.
+ */
+ allocsize = *size;
+ ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
+ PG_MMAP_FLAGS, -1, 0);
+ mmap_errno = errno; /* supersedes the errno of the huge-page attempt */
+ }
+
+ if (ptr == MAP_FAILED)
+ {
+ errno = mmap_errno; /* make %m describe the last mmap() failure */
+ ereport(FATAL,
+ (errmsg("could not map anonymous shared memory: %m"),
+ (mmap_errno == ENOMEM) ?
+ errhint("This error usually means that PostgreSQL's request "
+ "for a shared memory segment exceeded available memory, "
+ "swap space, or huge pages. To reduce the request size "
+ "(currently %zu bytes), reduce PostgreSQL's shared "
+ "memory usage, perhaps by reducing shared_buffers or "
+ "max_connections.",
+ *size) : 0));
+ }
+
+ *size = allocsize;
+ return ptr;
+}
+
+/*
+ * AnonymousShmemDetach --- unmap the anonymous mmap'd block, if there is one
+ *
+ * A munmap() failure is only logged; either way the block is then marked
+ * as gone by clearing AnonymousShmem.
+ *
+ * (called as an on_shmem_exit callback, hence funny argument list)
+ */
+static void
+AnonymousShmemDetach(int status, Datum arg)
+{
+	/* Nothing mapped, nothing to do */
+	if (AnonymousShmem == NULL)
+		return;
+
+	if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
+		elog(LOG, "munmap(%p, %zu) failed: %m",
+			 AnonymousShmem, AnonymousShmemSize);
+
+	AnonymousShmem = NULL;
+}
+
+/*
+ * PGSharedMemoryCreate
+ *
+ * Create a shared memory segment of the given size and initialize its
+ * standard header. Also, register an on_shmem_exit callback to release
+ * the storage.
+ *
+ * Dead Postgres segments pertinent to this DataDir are recycled if found, but
+ * we do not fail upon collision with foreign shmem segments. The idea here
+ * is to detect and re-use keys that may have been assigned by a crashed
+ * postmaster or backend.
+ *
+ * size: total number of bytes of shared memory wanted; it must exceed the
+ * MAXALIGN'd size of PGShmemHeader.
+ * shim: output; *shim is set to the header of the System V segment.
+ *
+ * Returns the header of the main shared memory block: the anonymous mapping
+ * if one was created, otherwise the System V segment itself.
+ */
+PGShmemHeader *
+PGSharedMemoryCreate(Size size,
+ PGShmemHeader **shim)
+{
+ IpcMemoryKey NextShmemSegID;
+ void *memAddress;
+ PGShmemHeader *hdr;
+ struct stat statbuf;
+ Size sysvsize;
+
+ /*
+ * We use the data directory's ID info (inode and device numbers) to
+ * positively identify shmem segments associated with this data dir, and
+ * also as seeds for searching for a free shmem key.
+ */
+ if (stat(DataDir, &statbuf) < 0)
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not stat data directory \"%s\": %m",
+ DataDir)));
+
+ /* Complain if hugepages demanded but we can't possibly support them */
+#if !defined(MAP_HUGETLB)
+ if (huge_pages == HUGE_PAGES_ON)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("huge pages not supported on this platform")));
+#endif
+
+ /* Room for a header? */
+ Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
+
+ if (shared_memory_type == SHMEM_TYPE_MMAP)
+ {
+ AnonymousShmem = CreateAnonymousSegment(&size);
+ AnonymousShmemSize = size;
+
+ /* Register on-exit routine to unmap the anonymous segment */
+ on_shmem_exit(AnonymousShmemDetach, (Datum) 0);
+
+ /* Now we need only allocate a minimal-sized SysV shmem block. */
+ sysvsize = sizeof(PGShmemHeader);
+ }
+ else
+ sysvsize = size;
+
+ /*
+ * Loop till we find a free IPC key. Trust CreateDataDirLockFile() to
+ * ensure no more than one postmaster per data directory can enter this
+ * loop simultaneously. (CreateDataDirLockFile() does not entirely ensure
+ * that, but prefer fixing it over coping here.)
+ */
+ NextShmemSegID = statbuf.st_ino; /* first key to try */
+
+ for (;;)
+ {
+ IpcMemoryId shmid;
+ PGShmemHeader *oldhdr;
+ IpcMemoryState state;
+
+ /* Try to create new segment */
+ memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
+ if (memAddress)
+ break; /* successful create and attach */
+
+ /* Check shared memory and possibly remove and recreate */
+
+ /*
+ * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN.
+ * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can
+ * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN.
+ */
+ shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0);
+ if (shmid < 0)
+ {
+ oldhdr = NULL;
+ state = SHMSTATE_FOREIGN;
+ }
+ else
+ state = PGSharedMemoryAttach(shmid, NULL, &oldhdr);
+
+ switch (state)
+ {
+ case SHMSTATE_ANALYSIS_FAILURE:
+ case SHMSTATE_ATTACHED:
+ ereport(FATAL,
+ (errcode(ERRCODE_LOCK_FILE_EXISTS),
+ errmsg("pre-existing shared memory block (key %lu, ID %lu) is still in use",
+ (unsigned long) NextShmemSegID,
+ (unsigned long) shmid),
+ errhint("Terminate any old server processes associated with data directory \"%s\".",
+ DataDir)));
+ break;
+ case SHMSTATE_ENOENT:
+
+ /*
+ * To our surprise, some other process deleted since our last
+ * InternalIpcMemoryCreate(). Moments earlier, we would have
+ * seen SHMSTATE_FOREIGN. Try that same ID again.
+ */
+ elog(LOG,
+ "shared memory block (key %lu, ID %lu) deleted during startup",
+ (unsigned long) NextShmemSegID,
+ (unsigned long) shmid);
+ break;
+ case SHMSTATE_FOREIGN:
+ NextShmemSegID++;
+ break;
+ case SHMSTATE_UNATTACHED:
+
+ /*
+ * The segment pertains to DataDir, and every process that had
+ * used it has died or detached. Zap it, if possible, and any
+ * associated dynamic shared memory segments, as well. This
+ * shouldn't fail, but if it does, assume the segment belongs
+ * to someone else after all, and try the next candidate.
+ * Otherwise, try again to create the segment. That may fail
+ * if some other process creates the same shmem key before we
+ * do, in which case we'll try the next key.
+ */
+ if (oldhdr->dsm_control != 0)
+ dsm_cleanup_using_control_segment(oldhdr->dsm_control);
+ if (shmctl(shmid, IPC_RMID, NULL) < 0)
+ NextShmemSegID++;
+ break;
+ }
+
+ if (oldhdr && shmdt(oldhdr) < 0) /* release the probe attachment, if any */
+ elog(LOG, "shmdt(%p) failed: %m", oldhdr);
+ }
+
+ /* Initialize new segment. */
+ hdr = (PGShmemHeader *) memAddress;
+ hdr->creatorPID = getpid();
+ hdr->magic = PGShmemMagic;
+ hdr->dsm_control = 0;
+
+ /* Fill in the data directory ID info, too */
+ hdr->device = statbuf.st_dev;
+ hdr->inode = statbuf.st_ino;
+
+ /*
+ * Initialize space allocation status for segment.
+ */
+ hdr->totalsize = size;
+ hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
+ *shim = hdr;
+
+ /* Save info for possible future use */
+ UsedShmemSegAddr = memAddress;
+ UsedShmemSegID = (unsigned long) NextShmemSegID;
+
+ /*
+ * If AnonymousShmem is NULL here, then we're not using anonymous shared
+ * memory, and should return a pointer to the System V shared memory
+ * block. Otherwise, the System V shared memory block is only a shim, and
+ * we must return a pointer to the real block.
+ */
+ if (AnonymousShmem == NULL)
+ return hdr;
+ memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader)); /* the real block starts with a copy of the header */
+ return (PGShmemHeader *) AnonymousShmem;
+}
+
+#ifdef EXEC_BACKEND
+
+/*
+ * PGSharedMemoryReAttach
+ *
+ * This is called during startup of a postmaster child process to re-attach to
+ * an already existing shared memory segment. This is needed only in the
+ * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory
+ * segment attachment via fork().
+ *
+ * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
+ * routine. The caller must have already restored them to the postmaster's
+ * values.
+ *
+ * Any failure to re-attach, including ending up attached at an address other
+ * than the expected one, is reported with elog(FATAL). On success the DSM
+ * control handle stored in the segment header is passed to
+ * dsm_set_control_handle().
+ */
+void
+PGSharedMemoryReAttach(void)
+{
+ IpcMemoryId shmid;
+ PGShmemHeader *hdr;
+ IpcMemoryState state;
+ void *origUsedShmemSegAddr = UsedShmemSegAddr; /* the address we must end up attached at */
+
+ Assert(UsedShmemSegAddr != NULL);
+ Assert(IsUnderPostmaster);
+
+#ifdef __CYGWIN__
+ /* cygipc (currently) appears to not detach on exec. */
+ PGSharedMemoryDetach();
+ UsedShmemSegAddr = origUsedShmemSegAddr; /* PGSharedMemoryDetach() reset it to NULL */
+#endif
+
+ elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
+ shmid = shmget(UsedShmemSegID, sizeof(PGShmemHeader), 0);
+ if (shmid < 0)
+ state = SHMSTATE_FOREIGN; /* any non-ATTACHED state leads to the FATAL below */
+ else
+ state = PGSharedMemoryAttach(shmid, UsedShmemSegAddr, &hdr);
+ if (state != SHMSTATE_ATTACHED)
+ elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m",
+ (int) UsedShmemSegID, UsedShmemSegAddr);
+ if (hdr != origUsedShmemSegAddr)
+ elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
+ hdr, origUsedShmemSegAddr);
+ dsm_set_control_handle(hdr->dsm_control);
+
+ UsedShmemSegAddr = hdr; /* probably redundant */
+}
+
+/*
+ * PGSharedMemoryNoReAttach
+ *
+ * Counterpart of PGSharedMemoryReAttach for a postmaster child process that
+ * chooses *not* to re-attach to the existing shared memory segment: tidy up
+ * so that the process state reflects "not attached".  Like
+ * PGSharedMemoryReAttach, this exists only in EXEC_BACKEND builds.
+ *
+ * Child startup code may or may not call PGSharedMemoryDetach afterwards,
+ * so that call has to end up being a no-op.
+ *
+ * UsedShmemSegID and UsedShmemSegAddr are implicit parameters; the caller
+ * must already have restored them to the postmaster's values.
+ */
+void
+PGSharedMemoryNoReAttach(void)
+{
+	Assert(UsedShmemSegAddr != NULL);
+	Assert(IsUnderPostmaster);
+
+#ifdef __CYGWIN__
+	/* cygipc (currently) appears to keep the attachment across exec */
+	PGSharedMemoryDetach();
+#endif
+
+	/* For cleanliness, forget both the segment's ID and its address */
+	UsedShmemSegID = 0;
+	UsedShmemSegAddr = NULL;
+}
+
+#endif /* EXEC_BACKEND */
+
+/*
+ * PGSharedMemoryDetach
+ *
+ * Drop whatever shared memory attachments this process still has: the
+ * System V segment at UsedShmemSegAddr and the anonymous mapping at
+ * AnonymousShmem (of AnonymousShmemSize bytes).  Failures are only logged,
+ * and each pointer is reset to NULL afterwards.
+ *
+ * The process that created the segment is not expected to call this
+ * explicitly, since it has on_shmem_exit callback(s) registered for that
+ * purpose.  It is meant for subprocesses that inherited an attachment and
+ * want to be rid of it.
+ */
+void
+PGSharedMemoryDetach(void)
+{
+	/* System V segment */
+	if (UsedShmemSegAddr != NULL)
+	{
+		int			rc = shmdt(UsedShmemSegAddr);
+
+#if defined(EXEC_BACKEND) && defined(__CYGWIN__)
+		/* Work-around for cygipc exec bug: retry with a NULL address */
+		if (rc < 0)
+			rc = shmdt(NULL);
+#endif
+		if (rc < 0)
+			elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
+
+		UsedShmemSegAddr = NULL;
+	}
+
+	/* Anonymous mmap'd block */
+	if (AnonymousShmem != NULL)
+	{
+		int			rc = munmap(AnonymousShmem, AnonymousShmemSize);
+
+		if (rc < 0)
+			elog(LOG, "munmap(%p, %zu) failed: %m",
+				 AnonymousShmem, AnonymousShmemSize);
+
+		AnonymousShmem = NULL;
+	}
+}
diff --git a/src/backend/port/tas/dummy.s b/src/backend/port/tas/dummy.s
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/backend/port/tas/dummy.s
diff --git a/src/backend/port/tas/hpux_hppa.s b/src/backend/port/tas/hpux_hppa.s
new file mode 100644
index 0000000..d978a7c
--- /dev/null
+++ b/src/backend/port/tas/hpux_hppa.s
@@ -0,0 +1,28 @@
+
+; tas -- test-and-set primitive for HP-UX on PA-RISC.
+; The argument arrives in %r26 and the result is returned in %r28 (see the
+; "in=26;out=28" note on .PROCEND and ARGW0=GR/RTNVAL=GR on .EXPORT).
+; NOTE(review): presumably returns 0 when the lock word was nonzero (free)
+; before the atomic clear and 1 when it was already zero (held) -- confirm
+; against the C-side caller.
+ .SPACE $TEXT$,SORT=8
+ .SUBSPA $CODE$,QUAD=0,ALIGN=4,ACCESS=44,CODE_ONLY,SORT=24
+tas
+ .PROC
+ .CALLINFO CALLER,FRAME=0,ENTRY_SR=3
+ .ENTRY
+; Round the address in %r26 up to a 16-byte boundary: add 15, then clear
+; the low four bits.
+ LDO 15(%r26),%r31 ;offset 0x0
+ DEPI 0,31,4,%r31 ;offset 0x4
+; Load-and-clear: fetch the word at the aligned address into %r23 and zero
+; it in memory.
+ LDCWX 0(0,%r31),%r23 ;offset 0x8
+; NOTE(review): the next two instructions appear to rely on nullification:
+; if %r23 == 0 the DEP is skipped and LDI sets the result to 1; otherwise
+; DEP zeroes %r28 and (via ,TR) skips the LDI.  Confirm against the PA-RISC
+; architecture manual.
+ COMICLR,= 0,%r23,%r0 ;offset 0xc
+ DEP,TR %r0,31,32,%r28 ;offset 0x10
+$00000001
+ LDI 1,%r28 ;offset 0x14
+$L0
+ .EXIT
+; Return to the caller through %r2.
+ BV,N %r0(%r2) ;offset 0x18
+ .PROCEND ;in=26;out=28;
+
+
+ .SPACE $TEXT$
+ .SUBSPA $CODE$
+ .SPACE $PRIVATE$,SORT=16
+ .SUBSPA $DATA$,QUAD=1,ALIGN=8,ACCESS=31,SORT=16
+ .SPACE $TEXT$
+ .SUBSPA $CODE$
+ .EXPORT tas,ENTRY,PRIV_LEV=3,ARGW0=GR,RTNVAL=GR
+ .END
diff --git a/src/backend/port/tas/sunstudio_sparc.s b/src/backend/port/tas/sunstudio_sparc.s
new file mode 100644
index 0000000..4bebf07
--- /dev/null
+++ b/src/backend/port/tas/sunstudio_sparc.s
@@ -0,0 +1,53 @@
+!-------------------------------------------------------------------------
+!
+! sunstudio_sparc.s
+! compare and swap for Sun Studio on Sparc
+!
+! Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+! Portions Copyright (c) 1994, Regents of the University of California
+!
+! IDENTIFICATION
+! src/backend/port/tas/sunstudio_sparc.s
+!
+!-------------------------------------------------------------------------
+
+! Fortunately the Sun compiler can process cpp conditionals with -P
+
+! '/' is the comment for x86, while '!' is the comment for Sparc
+
+#if defined(__sparcv9) || defined(__sparc)
+
+ .section ".text"
+ .align 8
+ .skip 24
+ .align 4
+
+ .global pg_atomic_cas
+pg_atomic_cas:
+
+ ! "cas" only works on sparcv9 and sparcv8plus chips, and
+ ! requires a compiler targeting these CPUs. It will fail
+ ! on a compiler targeting sparcv8, and of course will not
+ ! be understood by a sparcv8 CPU. gcc continues to use
+ ! "ldstub" because it targets sparcv7.
+ !
+ ! There is actually a trick for embedding "cas" in a
+ ! sparcv8-targeted compiler, but it can only be run
+ ! on a sparcv8plus/v9 cpus:
+ !
+ ! http://cvs.opensolaris.org/source/xref/on/usr/src/lib/libc/sparc/threads/sparc.il
+ !
+ ! NB: We're assuming we're running on a TSO system here - solaris
+ ! userland luckily always has done so.
+
+ ! Register use in this routine: %o0 holds the address operated on,
+ ! %o2 the value compared against, %o1 the value to store. The
+ ! result is handed back in %o0.
+ ! NOTE(review): presumably this matches a C prototype of the form
+ ! (ptr, newval, expected) - confirm against the caller.
+
+#if defined(__sparcv9) || defined(__sparcv8plus)
+ ! Compare the word at [%o0] with %o2; if equal, store %o1 there.
+ ! Either way %o1 ends up holding the old memory contents.
+ cas [%o0],%o2,%o1
+#else
+ ! Fallback: ldstub fetches the byte at [%o0] into %o1 and sets
+ ! that byte to all ones. %o2 is not consulted on this path.
+ ldstub [%o0],%o1
+#endif
+ ! Return the previous memory contents; the nop fills the delay
+ ! slot of retl.
+ mov %o1,%o0
+ retl
+ nop
+ .type pg_atomic_cas,2
+ .size pg_atomic_cas,(.-pg_atomic_cas)
+#endif
diff --git a/src/backend/port/tas/sunstudio_x86.s b/src/backend/port/tas/sunstudio_x86.s
new file mode 100644
index 0000000..d95e173
--- /dev/null
+++ b/src/backend/port/tas/sunstudio_x86.s
@@ -0,0 +1,43 @@
+/-------------------------------------------------------------------------
+/
+/ sunstudio_x86.s
+/ compare and swap for Sun Studio on x86
+/
+/ Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+/ Portions Copyright (c) 1994, Regents of the University of California
+/
+/ IDENTIFICATION
+/ src/backend/port/tas/sunstudio_x86.s
+/
+/-------------------------------------------------------------------------
+
+/ Fortunately the Sun compiler can process cpp conditionals with -P
+
+/ '/' is the comment for x86, while '!' is the comment for Sparc
+
+ .file "tas.s"
+
+#if defined(__amd64)
+ .code64
+#endif
+
+ .globl pg_atomic_cas
+ .type pg_atomic_cas, @function
+
+ .section .text, "ax"
+ .align 16
+
+/ pg_atomic_cas: locked compare-and-exchange on a 32-bit word.
+/ cmpxchgl compares %eax with the memory operand; on a match it stores the
+/ source register there, otherwise it loads the memory value into %eax.
+/ In both cases %eax holds the previous memory contents on return.
+/ NOTE(review): presumably the C prototype is (ptr, newval, expected) -
+/ confirm against the caller.
+pg_atomic_cas:
+#if defined(__amd64)
+ / 64-bit: address in %rdi, value to store in %esi, comparand in %edx.
+ movl %edx,%eax
+ lock
+ cmpxchgl %esi,(%rdi)
+#else
+ / 32-bit: the three arguments are read from the stack at 4, 8 and 12
+ / bytes above %esp (address, value to store, comparand).
+ movl 4(%esp), %edx
+ movl 8(%esp), %ecx
+ movl 12(%esp), %eax
+ lock
+ cmpxchgl %ecx, (%edx)
+#endif
+ ret
+ .size pg_atomic_cas, . - pg_atomic_cas
diff --git a/src/backend/port/win32/Makefile b/src/backend/port/win32/Makefile
new file mode 100644
index 0000000..90126f6
--- /dev/null
+++ b/src/backend/port/win32/Makefile
@@ -0,0 +1,23 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for backend/port/win32
+#
+# IDENTIFICATION
+# src/backend/port/win32/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/port/win32
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+# Objects that are always part of the win32 port layer.
+OBJS = \
+ signal.o \
+ socket.o \
+ timer.o
+# crashdump.o is added only when have_win32_dbghelp is "yes".
+# NOTE(review): presumably set by configure when dbghelp.h is usable - confirm.
+ifeq ($(have_win32_dbghelp), yes)
+OBJS += crashdump.o
+endif
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/port/win32/crashdump.c b/src/backend/port/win32/crashdump.c
new file mode 100644
index 0000000..e6c6837
--- /dev/null
+++ b/src/backend/port/win32/crashdump.c
@@ -0,0 +1,183 @@
+/*-------------------------------------------------------------------------
+ *
+ * crashdump.c
+ * Automatic crash dump creation for PostgreSQL on Windows
+ *
+ * The crashdump feature traps unhandled win32 exceptions produced by the
+ * backend, and tries to produce a Windows MiniDump crash
+ * dump for later debugging and analysis. The machine performing the dump
+ * doesn't need any special debugging tools; the user only needs to send
+ * the dump to somebody who has the same version of PostgreSQL and has debugging
+ * tools.
+ *
+ * crashdump module originally by Craig Ringer <ringerc@ringerc.id.au>
+ *
+ * LIMITATIONS
+ * ===========
+ * This *won't* work in hard OOM situations or stack overflows.
+ *
+ * For those, it'd be necessary to take a much more complicated approach where
+ * the handler switches to a new stack (if it can) and forks a helper process
+ * to debug itself.
+ *
+ * POSSIBLE FUTURE WORK
+ * ====================
+ * For bonus points, the crash dump format permits embedding of user-supplied
+ * data. If there's anything else that should always be supplied with a crash
+ * dump (postgresql.conf? Last few lines of a log file?), it could potentially
+ * be added, though at the cost of a greater chance of the crash dump failing.
+ *
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/port/win32/crashdump.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#define WIN32_LEAN_AND_MEAN
+
+/*
+ * Some versions of the MS SDK contain "typedef enum { ... } ;" which the MS
+ * compiler quite sanely complains about. Well done, Microsoft.
+ * This pragma disables the warning just while we include the header.
+ * The pragma is known to work with all (as at the time of writing) supported
+ * versions of MSVC.
+ */
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4091)
+#endif
+#include <dbghelp.h>
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+/*
+ * Much of the following code is based on CodeProject and MSDN examples,
+ * particularly
+ * http://www.codeproject.com/KB/debug/postmortemdebug_standalone1.aspx
+ *
+ * Useful MSDN articles:
+ *
+ * http://msdn.microsoft.com/en-us/library/ff805116(v=VS.85).aspx
+ * http://msdn.microsoft.com/en-us/library/ms679294(VS.85).aspx
+ *
+ * Other useful articles on working with minidumps:
+ * http://www.debuginfo.com/articles/effminidumps.html
+ */
+
+/*
+ * Function-pointer type used for the "MiniDumpWriteDump" entry point that
+ * crashDumpHandler looks up in dbghelp.dll at run time with GetProcAddress().
+ */
+typedef BOOL (WINAPI * MINIDUMPWRITEDUMP) (HANDLE hProcess, DWORD dwPid, HANDLE hFile, MINIDUMP_TYPE DumpType,
+ CONST PMINIDUMP_EXCEPTION_INFORMATION ExceptionParam,
+ CONST PMINIDUMP_USER_STREAM_INFORMATION UserStreamParam,
+ CONST PMINIDUMP_CALLBACK_INFORMATION CallbackParam
+);
+
+
+/*
+ * This function is the exception handler passed to SetUnhandledExceptionFilter.
+ * It's invoked only if there's an unhandled exception. The handler will use
+ * dbghelp.dll to generate a crash dump, then resume the normal unhandled
+ * exception process, which will generally exit with an error message from
+ * the runtime.
+ *
+ * This function is run under the unhandled exception handler, effectively
+ * in a crash context, so it should be careful with memory and avoid using
+ * any PostgreSQL functions.
+ */
+static LONG WINAPI
+crashDumpHandler(struct _EXCEPTION_POINTERS *pExceptionInfo)
+{
+	DWORD		dirAttribs;
+	HMODULE		dbghelp = NULL;
+	MINIDUMPWRITEDUMP writeDump = NULL;
+	MINIDUMP_TYPE dumpFlags;
+	char		dumpPath[_MAX_PATH];
+	HANDLE		thisProcess;
+	DWORD		thisPid;
+	HANDLE		dumpFile;
+	DWORD		ticks;
+	struct _MINIDUMP_EXCEPTION_INFORMATION excInfo;
+
+	/*
+	 * Crash dumps are opt-in: do nothing unless a "crashdumps" directory
+	 * exists (the path is relative, i.e. resolved against the current
+	 * working directory).
+	 */
+	dirAttribs = GetFileAttributesA("crashdumps");
+	if (dirAttribs == INVALID_FILE_ATTRIBUTES ||
+		!(dirAttribs & FILE_ATTRIBUTE_DIRECTORY))
+		return EXCEPTION_CONTINUE_SEARCH;
+
+	/* The directory is there, so try to write a dump. */
+	thisProcess = GetCurrentProcess();
+	thisPid = GetProcessId(thisProcess);
+
+	excInfo.ThreadId = GetCurrentThreadId();
+	excInfo.ExceptionPointers = pExceptionInfo;
+	excInfo.ClientPointers = FALSE;
+
+	/* Locate MiniDumpWriteDump in dbghelp.dll at run time. */
+	dbghelp = LoadLibrary("dbghelp.dll");
+	if (dbghelp == NULL)
+	{
+		write_stderr("could not load dbghelp.dll, cannot write crash dump\n");
+		return EXCEPTION_CONTINUE_SEARCH;
+	}
+
+	writeDump = (MINIDUMPWRITEDUMP) GetProcAddress(dbghelp, "MiniDumpWriteDump");
+	if (writeDump == NULL)
+	{
+		write_stderr("could not load required functions in dbghelp.dll, cannot write crash dump\n");
+		return EXCEPTION_CONTINUE_SEARCH;
+	}
+
+	/*
+	 * Dump as much as we can, except shared memory, code segments, and
+	 * memory mapped files. Exactly what we can dump depends on the version
+	 * of dbghelp.dll, see:
+	 * http://msdn.microsoft.com/en-us/library/ms680519(v=VS.85).aspx
+	 */
+	dumpFlags = MiniDumpNormal | MiniDumpWithHandleData |
+		MiniDumpWithDataSegs;
+
+	/* EnumDirTree being exported means dbghelp is version 5.2 or newer. */
+	if (GetProcAddress(dbghelp, "EnumDirTree") != NULL)
+		dumpFlags |= MiniDumpWithIndirectlyReferencedMemory |
+			MiniDumpWithPrivateReadWriteMemory;
+
+	/* Build a file name from our PID and the current tick count. */
+	ticks = GetTickCount();
+	snprintf(dumpPath, _MAX_PATH,
+			 "crashdumps\\postgres-pid%0i-%0i.mdmp",
+			 (int) thisPid, (int) ticks);
+	dumpPath[_MAX_PATH - 1] = '\0';
+
+	dumpFile = CreateFile(dumpPath, GENERIC_WRITE, FILE_SHARE_WRITE,
+						  NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL,
+						  NULL);
+	if (dumpFile == INVALID_HANDLE_VALUE)
+	{
+		write_stderr("could not open crash dump file \"%s\" for writing: error code %lu\n",
+					 dumpPath, GetLastError());
+		return EXCEPTION_CONTINUE_SEARCH;
+	}
+
+	if (!(*writeDump) (thisProcess, thisPid, dumpFile, dumpFlags, &excInfo,
+					   NULL, NULL))
+		write_stderr("could not write crash dump to file \"%s\": error code %lu\n",
+					 dumpPath, GetLastError());
+	else
+		write_stderr("wrote crash dump to file \"%s\"\n", dumpPath);
+
+	CloseHandle(dumpFile);
+
+	/* Always let the normal unhandled-exception processing continue. */
+	return EXCEPTION_CONTINUE_SEARCH;
+}
+
+
+/*
+ * Register crashDumpHandler as the unhandled-exception filter.
+ */
+void
+pgwin32_install_crashdump_handler(void)
+{
+	/* The return value (the previously installed filter) is not used. */
+	(void) SetUnhandledExceptionFilter(crashDumpHandler);
+}
diff --git a/src/backend/port/win32/signal.c b/src/backend/port/win32/signal.c
new file mode 100644
index 0000000..3218b38
--- /dev/null
+++ b/src/backend/port/win32/signal.c
@@ -0,0 +1,344 @@
+/*-------------------------------------------------------------------------
+ *
+ * signal.c
+ * Microsoft Windows Win32 Signal Emulation Functions
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/port/win32/signal.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "libpq/pqsignal.h"
+
+/*
+ * These are exported for use by the UNBLOCKED_SIGNAL_QUEUE() macro.
+ * pg_signal_queue must be volatile since it is changed by the signal
+ * handling thread and inspected without any lock by the main thread.
+ * pg_signal_mask is only changed by main thread so shouldn't need it.
+ */
+volatile int pg_signal_queue;
+int pg_signal_mask;
+
+HANDLE pgwin32_signal_event;
+HANDLE pgwin32_initial_signal_pipe = INVALID_HANDLE_VALUE;
+
+/*
+ * pg_signal_crit_sec is used to protect only pg_signal_queue. That is the only
+ * variable that can be accessed from the signal sending threads!
+ */
+static CRITICAL_SECTION pg_signal_crit_sec;
+
+/* Note that array elements 0 are unused since they correspond to signal 0 */
+static pqsigfunc pg_signal_array[PG_SIGNAL_COUNT];
+static pqsigfunc pg_signal_defaults[PG_SIGNAL_COUNT];
+
+
+/* Signal handling thread functions */
+static DWORD WINAPI pg_signal_thread(LPVOID param);
+static BOOL WINAPI pg_console_handler(DWORD dwCtrlType);
+
+
+/*
+ * pg_usleep --- delay the specified number of microseconds, but
+ * stop waiting if a signal arrives.
+ *
+ * This replaces the non-signal-aware version provided by src/port/pgsleep.c.
+ */
+void
+pg_usleep(long microsec)
+{
+	DWORD		wait_ms;
+
+	Assert(pgwin32_signal_event != NULL);
+
+	/* Convert to milliseconds, rounding to nearest but never below 1 ms. */
+	if (microsec < 500)
+		wait_ms = 1;
+	else
+		wait_ms = (microsec + 500) / 1000;
+
+	/* Anything other than the signal event firing ends the sleep quietly. */
+	if (WaitForSingleObject(pgwin32_signal_event, wait_ms) != WAIT_OBJECT_0)
+		return;
+
+	/* Woken by a signal: run the handlers and report the interruption. */
+	pgwin32_dispatch_queued_signals();
+	errno = EINTR;
+}
+
+
+/* Initialization */
+void
+pgwin32_signal_initialize(void)
+{
+	HANDLE		thread;
+	int			signo;
+
+	InitializeCriticalSection(&pg_signal_crit_sec);
+
+	/* Start with nothing blocked, nothing queued, and no handlers set. */
+	pg_signal_queue = 0;
+	pg_signal_mask = 0;
+	for (signo = 0; signo < PG_SIGNAL_COUNT; signo++)
+	{
+		pg_signal_defaults[signo] = SIG_IGN;
+		pg_signal_array[signo] = SIG_DFL;
+	}
+
+	/* Manual-reset event, initially unsignaled, used to flag queued signals */
+	pgwin32_signal_event = CreateEvent(NULL, TRUE, FALSE, NULL);
+	if (pgwin32_signal_event == NULL)
+		ereport(FATAL,
+				(errmsg_internal("could not create signal event: error code %lu", GetLastError())));
+
+	/* Thread that listens on the signal pipe */
+	thread = CreateThread(NULL, 0, pg_signal_thread, NULL, 0, NULL);
+	if (thread == NULL)
+		ereport(FATAL,
+				(errmsg_internal("could not create signal handler thread")));
+
+	/* Console control handler, to pick up Ctrl-C etc */
+	if (!SetConsoleCtrlHandler(pg_console_handler, TRUE))
+		ereport(FATAL,
+				(errmsg_internal("could not set console control handler")));
+}
+
+/*
+ * Dispatch all signals currently queued and not blocked
+ * Blocked signals are ignored, and will be fired at the time of
+ * the pqsigsetmask() call.
+ */
+void
+pgwin32_dispatch_queued_signals(void)
+{
+ int exec_mask;
+
+ Assert(pgwin32_signal_event != NULL);
+ /* pg_signal_queue is only touched while holding the critical section */
+ EnterCriticalSection(&pg_signal_crit_sec);
+ while ((exec_mask = UNBLOCKED_SIGNAL_QUEUE()) != 0)
+ {
+ /* One or more unblocked signals queued for execution */
+ int i;
+
+ /* Index 0 is skipped: it corresponds to signal 0, which is unused */
+ for (i = 1; i < PG_SIGNAL_COUNT; i++)
+ {
+ if (exec_mask & sigmask(i))
+ {
+ /* Execute this signal */
+ pqsigfunc sig = pg_signal_array[i];
+
+ /* No handler installed: fall back to the per-signal default */
+ if (sig == SIG_DFL)
+ sig = pg_signal_defaults[i];
+ /* Clear the queue bit before running the handler */
+ pg_signal_queue &= ~sigmask(i);
+ if (sig != SIG_ERR && sig != SIG_IGN && sig != SIG_DFL)
+ {
+ /*
+ * Run the handler outside the critical section, then
+ * re-acquire it before looking at the queue again.
+ */
+ LeaveCriticalSection(&pg_signal_crit_sec);
+ sig(i);
+ EnterCriticalSection(&pg_signal_crit_sec);
+ break; /* Restart outer loop, in case signal mask or
+ * queue has been modified inside signal
+ * handler */
+ }
+ }
+ }
+ }
+ /* Nothing unblocked is left queued, so un-signal the event */
+ ResetEvent(pgwin32_signal_event);
+ LeaveCriticalSection(&pg_signal_crit_sec);
+}
+
+/* signal masking. Only called on main thread, no sync required */
+int
+pqsigsetmask(int mask)
+{
+	int			oldmask = pg_signal_mask;
+
+	pg_signal_mask = mask;
+
+	/*
+	 * The new mask may have unblocked signals that were queued earlier, so
+	 * give them a chance to run right now.
+	 */
+	pgwin32_dispatch_queued_signals();
+
+	return oldmask;
+}
+
+
+/*
+ * Unix-like signal handler installation
+ *
+ * Only called on main thread, no sync required
+ */
+pqsigfunc
+pqsignal(int signum, pqsigfunc handler)
+{
+	pqsigfunc	old;
+
+	/* Reject signal numbers that fall outside the handler table. */
+	if (signum < 0 || signum >= PG_SIGNAL_COUNT)
+		return SIG_ERR;
+
+	/* Swap in the new handler and hand back the one it replaces. */
+	old = pg_signal_array[signum];
+	pg_signal_array[signum] = handler;
+	return old;
+}
+
+/* Create the signal listener pipe for specified PID */
+HANDLE
+pgwin32_create_signal_listener(pid_t pid)
+{
+	char		pipename[128];
+	HANDLE		pipe;
+
+	/*
+	 * Build the per-process pipe name.  "%u" requires an unsigned int
+	 * argument, so cast accordingly; the resulting name is unchanged for any
+	 * valid (non-negative) PID.
+	 */
+	snprintf(pipename, sizeof(pipename), "\\\\.\\pipe\\pgsignal_%u", (unsigned int) pid);
+
+	/* Same pipe parameters as the ones pg_signal_thread uses to re-create it */
+	pipe = CreateNamedPipe(pipename, PIPE_ACCESS_DUPLEX,
+						   PIPE_TYPE_MESSAGE | PIPE_READMODE_MESSAGE | PIPE_WAIT,
+						   PIPE_UNLIMITED_INSTANCES, 16, 16, 1000, NULL);
+
+	if (pipe == INVALID_HANDLE_VALUE)
+		ereport(ERROR,
+				(errmsg("could not create signal listener pipe for PID %d: error code %lu",
+						(int) pid, GetLastError())));
+
+	return pipe;
+}
+
+
+/*
+ * All functions below execute on the signal handler thread
+ * and must be synchronized as such!
+ * NOTE! The only global variable that can be used is
+ * pg_signal_queue!
+ */
+
+
+/*
+ * Queue a signal for the main thread, by setting the flag bit and event.
+ */
+void
+pg_queue_signal(int signum)
+{
+	Assert(pgwin32_signal_event != NULL);
+
+	/* Bad signal numbers are silently ignored. */
+	if (signum > 0 && signum < PG_SIGNAL_COUNT)
+	{
+		/* Set the signal's bit in the queue under the lock ... */
+		EnterCriticalSection(&pg_signal_crit_sec);
+		pg_signal_queue |= sigmask(signum);
+		LeaveCriticalSection(&pg_signal_crit_sec);
+
+		/* ... then set the event that waiters are watching. */
+		SetEvent(pgwin32_signal_event);
+	}
+}
+
+/*
+ * Signal handling thread
+ *
+ * Loops forever serving the named pipe "\\.\pipe\pgsignal_<pid>": each
+ * client connection delivers one byte holding a signal number, which is
+ * queued with pg_queue_signal() and then echoed back to the client. The
+ * first pipe instance is taken from pgwin32_initial_signal_pipe; if that is
+ * INVALID_HANDLE_VALUE, or after a failed connect, a new instance is
+ * created here. "param" is not used.
+ */
+static DWORD WINAPI
+pg_signal_thread(LPVOID param)
+{
+ char pipename[128];
+ HANDLE pipe = pgwin32_initial_signal_pipe;
+
+ /* Set up pipe name, in case we have to re-create the pipe. */
+ snprintf(pipename, sizeof(pipename), "\\\\.\\pipe\\pgsignal_%lu", GetCurrentProcessId());
+
+ for (;;)
+ {
+ BOOL fConnected;
+
+ /* Create a new pipe instance if we don't have one. */
+ if (pipe == INVALID_HANDLE_VALUE)
+ {
+ pipe = CreateNamedPipe(pipename, PIPE_ACCESS_DUPLEX,
+ PIPE_TYPE_MESSAGE | PIPE_READMODE_MESSAGE | PIPE_WAIT,
+ PIPE_UNLIMITED_INSTANCES, 16, 16, 1000, NULL);
+
+ if (pipe == INVALID_HANDLE_VALUE)
+ {
+ /* Report, pause half a second, and try again */
+ write_stderr("could not create signal listener pipe: error code %lu; retrying\n", GetLastError());
+ SleepEx(500, FALSE);
+ continue;
+ }
+ }
+
+ /*
+ * Wait for a client to connect. If something connects before we
+ * reach here, we'll get back a "failure" with ERROR_PIPE_CONNECTED,
+ * which is actually a success (way to go, Microsoft).
+ */
+ fConnected = ConnectNamedPipe(pipe, NULL) ? TRUE : (GetLastError() == ERROR_PIPE_CONNECTED);
+ if (fConnected)
+ {
+ /*
+ * We have a connection from a would-be signal sender. Process it.
+ */
+ BYTE sigNum;
+ DWORD bytes;
+
+ /* The request is exactly one byte: the signal number */
+ if (ReadFile(pipe, &sigNum, 1, &bytes, NULL) &&
+ bytes == 1)
+ {
+ /*
+ * Queue the signal before responding to the client. In this
+ * way, it's guaranteed that once kill() has returned in the
+ * signal sender, the next CHECK_FOR_INTERRUPTS() in the
+ * signal recipient will see the signal. (This is a stronger
+ * guarantee than POSIX makes; maybe we don't need it? But
+ * without it, we've seen timing bugs on Windows that do not
+ * manifest on any known Unix.)
+ */
+ pg_queue_signal(sigNum);
+
+ /*
+ * Write something back to the client, allowing its
+ * CallNamedPipe() call to terminate.
+ */
+ WriteFile(pipe, &sigNum, 1, &bytes, NULL); /* Don't care if it
+ * works or not */
+
+ /*
+ * We must wait for the client to read the data before we can
+ * disconnect, else the data will be lost. (If the WriteFile
+ * call failed, there'll be nothing in the buffer, so this
+ * shouldn't block.)
+ */
+ FlushFileBuffers(pipe);
+ }
+ else
+ {
+ /*
+ * If we fail to read a byte from the client, assume it's the
+ * client's problem and do nothing. Perhaps it'd be better to
+ * force a pipe close and reopen?
+ */
+ }
+
+ /* Disconnect from client so that we can re-use the pipe. */
+ DisconnectNamedPipe(pipe);
+ }
+ else
+ {
+ /*
+ * Connection failed. Cleanup and try again.
+ *
+ * This should never happen. If it does, there's a window where
+ * we'll miss signals until we manage to re-create the pipe.
+ * However, just trying to use the same pipe again is probably not
+ * going to work, so we have little choice.
+ */
+ CloseHandle(pipe);
+ pipe = INVALID_HANDLE_VALUE;
+ }
+ }
+ /* Not reached: the loop above has no exit */
+ return 0;
+}
+
+
+/*
+ * Console control handler.  It executes on a thread created by the OS at
+ * the time of invocation.
+ *
+ * Ctrl-C, Ctrl-Break, console close and system shutdown are all turned into
+ * a queued SIGINT and reported as handled (TRUE); any other control type is
+ * reported as not handled (FALSE).
+ */
+static BOOL WINAPI
+pg_console_handler(DWORD dwCtrlType)
+{
+	switch (dwCtrlType)
+	{
+		case CTRL_C_EVENT:
+		case CTRL_BREAK_EVENT:
+		case CTRL_CLOSE_EVENT:
+		case CTRL_SHUTDOWN_EVENT:
+			pg_queue_signal(SIGINT);
+			return TRUE;
+		default:
+			return FALSE;
+	}
+}
diff --git a/src/backend/port/win32/socket.c b/src/backend/port/win32/socket.c
new file mode 100644
index 0000000..6fbd1ed
--- /dev/null
+++ b/src/backend/port/win32/socket.c
@@ -0,0 +1,692 @@
+/*-------------------------------------------------------------------------
+ *
+ * socket.c
+ * Microsoft Windows Win32 Socket Functions
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/port/win32/socket.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+/*
+ * Indicate if pgwin32_recv() and pgwin32_send() should operate
+ * in non-blocking mode.
+ *
+ * Since the socket emulation layer always sets the actual socket to
+ * non-blocking mode in order to be able to deliver signals, we must
+ * specify this in a separate flag if we actually need non-blocking
+ * operation.
+ *
+ * This flag changes the behaviour *globally* for all socket operations,
+ * so it should only be set for very short periods of time.
+ */
+int pgwin32_noblock = 0;
+
+/* Undef the macros defined in win32.h, so we can access system functions */
+#undef socket
+#undef bind
+#undef listen
+#undef accept
+#undef connect
+#undef select
+#undef recv
+#undef send
+
+/*
+ * Blocking socket functions implemented so they listen on both
+ * the socket and the signal event, required for signal handling.
+ */
+
+/*
+ * Convert the last socket error code into errno
+ *
+ * Note: where there is a direct correspondence between a WSAxxx error code
+ * and a Berkeley error symbol, this mapping is actually a no-op, because
+ * in win32.h we redefine the network-related Berkeley error symbols to have
+ * the values of their WSAxxx counterparts. The point of the switch is
+ * mostly to translate near-miss error codes into something that's sensible
+ * in the Berkeley universe.
+ */
+static void
+TranslateSocketError(void)
+{
+ /*
+ * Takes no arguments and returns nothing: the input is the calling
+ * thread's current WSAGetLastError() value and the output is errno.
+ */
+ switch (WSAGetLastError())
+ {
+ /* Several Winsock-specific failures collapse to EINVAL */
+ case WSAEINVAL:
+ case WSANOTINITIALISED:
+ case WSAEINVALIDPROVIDER:
+ case WSAEINVALIDPROCTABLE:
+ case WSAEDESTADDRREQ:
+ errno = EINVAL;
+ break;
+ case WSAEINPROGRESS:
+ errno = EINPROGRESS;
+ break;
+ case WSAEFAULT:
+ errno = EFAULT;
+ break;
+ case WSAEISCONN:
+ errno = EISCONN;
+ break;
+ case WSAEMSGSIZE:
+ errno = EMSGSIZE;
+ break;
+ case WSAEAFNOSUPPORT:
+ errno = EAFNOSUPPORT;
+ break;
+ case WSAEMFILE:
+ errno = EMFILE;
+ break;
+ case WSAENOBUFS:
+ errno = ENOBUFS;
+ break;
+ case WSAEPROTONOSUPPORT:
+ case WSAEPROTOTYPE:
+ case WSAESOCKTNOSUPPORT:
+ errno = EPROTONOSUPPORT;
+ break;
+ case WSAECONNABORTED:
+ errno = ECONNABORTED;
+ break;
+ case WSAECONNREFUSED:
+ errno = ECONNREFUSED;
+ break;
+ case WSAECONNRESET:
+ errno = ECONNRESET;
+ break;
+ case WSAEINTR:
+ errno = EINTR;
+ break;
+ case WSAENOTSOCK:
+ errno = ENOTSOCK;
+ break;
+ case WSAEOPNOTSUPP:
+ errno = EOPNOTSUPP;
+ break;
+ case WSAEWOULDBLOCK:
+ errno = EWOULDBLOCK;
+ break;
+ case WSAEACCES:
+ errno = EACCES;
+ break;
+ case WSAEADDRINUSE:
+ errno = EADDRINUSE;
+ break;
+ case WSAEADDRNOTAVAIL:
+ errno = EADDRNOTAVAIL;
+ break;
+ /* All host/network reachability failures map to EHOSTUNREACH */
+ case WSAEHOSTUNREACH:
+ case WSAEHOSTDOWN:
+ case WSAHOST_NOT_FOUND:
+ case WSAENETDOWN:
+ case WSAENETUNREACH:
+ case WSAENETRESET:
+ errno = EHOSTUNREACH;
+ break;
+ /* Connection already gone in one way or another */
+ case WSAENOTCONN:
+ case WSAESHUTDOWN:
+ case WSAEDISCON:
+ errno = ENOTCONN;
+ break;
+ default:
+ /* Unknown code: log it at NOTICE level and fall back to EINVAL */
+ ereport(NOTICE,
+ (errmsg_internal("unrecognized win32 socket error code: %d", WSAGetLastError())));
+ errno = EINVAL;
+ }
+}
+
+/*
+ * Run any queued, unblocked signal handlers.
+ *
+ * Returns 1 with errno set to EINTR if there was something to dispatch,
+ * otherwise 0 (errno untouched).
+ */
+static int
+pgwin32_poll_signals(void)
+{
+	if (!UNBLOCKED_SIGNAL_QUEUE())
+		return 0;
+
+	pgwin32_dispatch_queued_signals();
+	errno = EINTR;
+	return 1;
+}
+
+/*
+ * Report whether socket s is of type SOCK_DGRAM (1) or not (0).
+ *
+ * If the socket type cannot be queried, 1 is returned as well.
+ */
+static int
+isDataGram(SOCKET s)
+{
+	int			socktype;
+	int			optlen = sizeof(socktype);
+
+	/* getsockopt() returns nonzero on failure */
+	if (getsockopt(s, SOL_SOCKET, SO_TYPE, (char *) &socktype, &optlen) != 0)
+		return 1;
+
+	if (socktype == SOCK_DGRAM)
+		return 1;
+	return 0;
+}
+
+/*
+ * Wait until one of the network events in "what" (FD_xxx bits) is signaled
+ * on socket s, a signal arrives, or "timeout" milliseconds elapse.
+ *
+ * Returns 1 if the socket event fired (or, on the UDP write path, a dummy
+ * zero-length send succeeded).  Returns 0 otherwise, with errno set: EINTR
+ * if a signal or APC interrupted the wait (queued signals are dispatched
+ * first), EWOULDBLOCK on timeout, or a translated socket error.
+ *
+ * Note that on the UDP write path the wait is done in 100 ms slices and
+ * "timeout" is not consulted.
+ */
+int
+pgwin32_waitforsinglesocket(SOCKET s, int what, int timeout)
+{
+	static HANDLE waitevent = INVALID_HANDLE_VALUE;
+	static SOCKET current_socket = INVALID_SOCKET;
+	static int	isUDP = 0;
+	HANDLE		events[2];
+	int			r;
+
+	/* Create an event object just once and use it on all future calls */
+	if (waitevent == INVALID_HANDLE_VALUE)
+	{
+		waitevent = CreateEvent(NULL, TRUE, FALSE, NULL);
+
+		/*
+		 * CreateEvent() reports failure by returning NULL, not
+		 * INVALID_HANDLE_VALUE.  On failure, put the sentinel back so that
+		 * the next call retries the creation instead of using a NULL handle.
+		 */
+		if (waitevent == NULL)
+		{
+			DWORD		err = GetLastError();
+
+			waitevent = INVALID_HANDLE_VALUE;
+			ereport(ERROR,
+					(errmsg_internal("could not create socket waiting event: error code %lu", err)));
+		}
+	}
+	else if (!ResetEvent(waitevent))
+		ereport(ERROR,
+				(errmsg_internal("could not reset socket waiting event: error code %lu", GetLastError())));
+
+	/*
+	 * Track whether socket is UDP or not.  (NB: most likely, this is both
+	 * useless and wrong; there is no reason to think that the behavior of
+	 * WSAEventSelect is different for TCP and UDP.)
+	 */
+	if (current_socket != s)
+		isUDP = isDataGram(s);
+	current_socket = s;
+
+	/*
+	 * Attach event to socket.  NOTE: we must detach it again before
+	 * returning, since other bits of code may try to attach other events to
+	 * the socket.
+	 */
+	if (WSAEventSelect(s, waitevent, what) != 0)
+	{
+		TranslateSocketError();
+		return 0;
+	}
+
+	/* Index 0 is the signal event, index 1 the socket event */
+	events[0] = pgwin32_signal_event;
+	events[1] = waitevent;
+
+	/*
+	 * Just a workaround of unknown locking problem with writing in UDP socket
+	 * under high load: Client's pgsql backend sleeps infinitely in
+	 * WaitForMultipleObjectsEx, pgstat process sleeps in pgwin32_select().
+	 * So, we will wait with small timeout(0.1 sec) and if socket is still
+	 * blocked, try WSASend (see comments in pgwin32_select) and wait again.
+	 */
+	if ((what & FD_WRITE) && isUDP)
+	{
+		for (;;)
+		{
+			r = WaitForMultipleObjectsEx(2, events, FALSE, 100, TRUE);
+
+			if (r == WAIT_TIMEOUT)
+			{
+				char		c;
+				WSABUF		buf;
+				DWORD		sent;
+
+				/* Zero-length dummy send, just to probe writability */
+				buf.buf = &c;
+				buf.len = 0;
+
+				r = WSASend(s, &buf, 1, &sent, 0, NULL, NULL);
+				if (r == 0)		/* Completed - means things are fine! */
+				{
+					WSAEventSelect(s, NULL, 0);
+					return 1;
+				}
+				else if (WSAGetLastError() != WSAEWOULDBLOCK)
+				{
+					TranslateSocketError();
+					WSAEventSelect(s, NULL, 0);
+					return 0;
+				}
+			}
+			else
+				break;
+		}
+	}
+	else
+		r = WaitForMultipleObjectsEx(2, events, FALSE, timeout, TRUE);
+
+	/* Detach the event from the socket again */
+	WSAEventSelect(s, NULL, 0);
+
+	if (r == WAIT_OBJECT_0 || r == WAIT_IO_COMPLETION)
+	{
+		pgwin32_dispatch_queued_signals();
+		errno = EINTR;
+		return 0;
+	}
+	if (r == WAIT_OBJECT_0 + 1)
+		return 1;
+	if (r == WAIT_TIMEOUT)
+	{
+		errno = EWOULDBLOCK;
+		return 0;
+	}
+	ereport(ERROR,
+			(errmsg_internal("unrecognized return value from WaitForMultipleObjects: %d (error code %lu)", r, GetLastError())));
+	return 0;
+}
+
+/*
+ * Create a socket, setting it to overlapped and non-blocking
+ */
+SOCKET
+pgwin32_socket(int af, int type, int protocol)
+{
+	SOCKET		s;
+	unsigned long on = 1;
+
+	s = WSASocket(af, type, protocol, NULL, 0, WSA_FLAG_OVERLAPPED);
+	if (s == INVALID_SOCKET)
+	{
+		TranslateSocketError();
+		return INVALID_SOCKET;
+	}
+
+	/* Switch the new socket to non-blocking mode */
+	if (ioctlsocket(s, FIONBIO, &on))
+	{
+		/*
+		 * Translate the error first, then close the socket so that it is not
+		 * leaked: the caller only sees INVALID_SOCKET and cannot clean up.
+		 */
+		TranslateSocketError();
+		closesocket(s);
+		return INVALID_SOCKET;
+	}
+	errno = 0;
+
+	return s;
+}
+
+/*
+ * bind() wrapper: on failure, set errno from the Winsock error code.
+ * The return value of bind() is passed through unchanged.
+ */
+int
+pgwin32_bind(SOCKET s, struct sockaddr *addr, int addrlen)
+{
+	int			rc = bind(s, addr, addrlen);
+
+	if (rc < 0)
+		TranslateSocketError();
+
+	return rc;
+}
+
+/*
+ * listen() wrapper: on failure, set errno from the Winsock error code.
+ * The return value of listen() is passed through unchanged.
+ */
+int
+pgwin32_listen(SOCKET s, int backlog)
+{
+	int			rc = listen(s, backlog);
+
+	if (rc < 0)
+		TranslateSocketError();
+
+	return rc;
+}
+
+/*
+ * accept() wrapper.  Returns the new socket, or INVALID_SOCKET with errno
+ * set from the Winsock error code.
+ */
+SOCKET
+pgwin32_accept(SOCKET s, struct sockaddr *addr, int *addrlen)
+{
+	SOCKET		newsock;
+
+	/*
+	 * Give pending signals a chance to run, but ignore the result: we must
+	 * not return EINTR here, since pqcomm.c doesn't handle that.
+	 */
+	(void) pgwin32_poll_signals();
+
+	newsock = WSAAccept(s, addr, addrlen, NULL, 0);
+	if (newsock != INVALID_SOCKET)
+		return newsock;
+
+	TranslateSocketError();
+	return INVALID_SOCKET;
+}
+
+
+/* No signal delivery during connect. */
+int
+pgwin32_connect(SOCKET s, const struct sockaddr *addr, int addrlen)
+{
+	/* Connected right away? */
+	if (WSAConnect(s, addr, addrlen, NULL, NULL, NULL, NULL) == 0)
+		return 0;
+
+	/* Anything other than "would block" is a hard failure */
+	if (WSAGetLastError() != WSAEWOULDBLOCK)
+	{
+		TranslateSocketError();
+		return -1;
+	}
+
+	/*
+	 * The connect is in progress: wait for FD_CONNECT, going around again
+	 * whenever the wait returns 0.
+	 *
+	 * NOTE(review): the wait also returns 0 when WSAEventSelect() fails, in
+	 * which case this keeps retrying; and the outcome of the connect attempt
+	 * is not examined once the event fires.  Confirm this is intended.
+	 */
+	for (;;)
+	{
+		if (pgwin32_waitforsinglesocket(s, FD_CONNECT, INFINITE) != 0)
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * recv() replacement that lets queued signals run while waiting.
+ *
+ * Returns the number of bytes received, or -1 with errno set: EINTR if
+ * signals were dispatched, EWOULDBLOCK if no data is available and
+ * pgwin32_noblock is set (or the retries below are exhausted), or a
+ * translated socket error.
+ */
+int
+pgwin32_recv(SOCKET s, char *buf, int len, int f)
+{
+ WSABUF wbuf;
+ int r;
+ DWORD b;
+ DWORD flags = f;
+ int n;
+
+ /* Pending signals take priority: report EINTR without touching the socket */
+ if (pgwin32_poll_signals())
+ return -1;
+
+ wbuf.len = len;
+ wbuf.buf = buf;
+
+ /* First attempt; the socket itself is always non-blocking */
+ r = WSARecv(s, &wbuf, 1, &b, &flags, NULL, NULL);
+ if (r != SOCKET_ERROR)
+ return b; /* success */
+
+ if (WSAGetLastError() != WSAEWOULDBLOCK)
+ {
+ TranslateSocketError();
+ return -1;
+ }
+
+ if (pgwin32_noblock)
+ {
+ /*
+ * No data received, and we are in "emulated non-blocking mode", so
+ * return indicating that we'd block if we were to continue.
+ */
+ errno = EWOULDBLOCK;
+ return -1;
+ }
+
+ /* We're in blocking mode, so wait for data */
+
+ for (n = 0; n < 5; n++)
+ {
+ if (pgwin32_waitforsinglesocket(s, FD_READ | FD_CLOSE | FD_ACCEPT,
+ INFINITE) == 0)
+ return -1; /* errno already set */
+
+ r = WSARecv(s, &wbuf, 1, &b, &flags, NULL, NULL);
+ if (r != SOCKET_ERROR)
+ return b; /* success */
+ if (WSAGetLastError() != WSAEWOULDBLOCK)
+ {
+ TranslateSocketError();
+ return -1;
+ }
+
+ /*
+ * There seem to be cases on win2k (at least) where WSARecv can return
+ * WSAEWOULDBLOCK even when pgwin32_waitforsinglesocket claims the
+ * socket is readable. In this case, just sleep for a moment and try
+ * again. We try up to 5 times - if it fails more than that it's not
+ * likely to ever come back.
+ */
+ pg_usleep(10000);
+ }
+ /* All five attempts came back with WSAEWOULDBLOCK: give up */
+ ereport(NOTICE,
+ (errmsg_internal("could not read from ready socket (after retries)")));
+ errno = EWOULDBLOCK;
+ return -1;
+}
+
+/*
+ * The second argument to send() is defined by SUS to be a "const void *"
+ * and so we use the same signature here to keep compilers happy when
+ * handling callers.
+ *
+ * But the buf member of a WSABUF struct is defined as "char *", so we cast
+ * the second argument to that here when assigning it, also to keep compilers
+ * happy.
+ */
+
+int
+pgwin32_send(SOCKET s, const void *buf, int len, int flags)
+{
+ WSABUF wbuf;
+ int r;
+ DWORD b;
+
+ /* Pending signals take priority: report EINTR without touching the socket */
+ if (pgwin32_poll_signals())
+ return -1;
+
+ wbuf.len = len;
+ wbuf.buf = (char *) buf;
+
+ /*
+ * Readiness of socket to send data to UDP socket may be not true: socket
+ * can become busy again! So loop until send or error occurs.
+ */
+ for (;;)
+ {
+ r = WSASend(s, &wbuf, 1, &b, flags, NULL, NULL);
+ if (r != SOCKET_ERROR && b > 0)
+ /* Write succeeded right away */
+ return b;
+
+ if (r == SOCKET_ERROR &&
+ WSAGetLastError() != WSAEWOULDBLOCK)
+ {
+ TranslateSocketError();
+ return -1;
+ }
+
+ if (pgwin32_noblock)
+ {
+ /*
+ * No data sent, and we are in "emulated non-blocking mode", so
+ * return indicating that we'd block if we were to continue.
+ */
+ errno = EWOULDBLOCK;
+ return -1;
+ }
+
+ /* No error, zero bytes (win2000+) or error+WSAEWOULDBLOCK (<=nt4) */
+
+ /* Wait for writability; a 0 result means errno is already set */
+ if (pgwin32_waitforsinglesocket(s, FD_WRITE | FD_CLOSE, INFINITE) == 0)
+ return -1;
+ }
+
+ /* Not reached: the loop above only exits via return */
+ return -1;
+}
+
+
+/*
+ * Wait for activity on one or more sockets.
+ * While waiting, allow signals to run
+ *
+ * NOTE! Currently does not implement exceptfds check,
+ * since it is not used in postgresql!
+ */
+/*
+ * Returns the number of ready entries (a socket that is both readable and
+ * writable is counted twice), 0 on timeout, or -1 with errno set (EINTR
+ * when signals were dispatched, or a translated socket error). On return
+ * readfds/writefds are overwritten with the ready sets. nfds is not used.
+ */
+int
+pgwin32_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timeval *timeout)
+{
+ WSAEVENT events[FD_SETSIZE * 2]; /* worst case is readfds totally
+ * different from writefds, so
+ * 2*FD_SETSIZE sockets */
+ SOCKET sockets[FD_SETSIZE * 2];
+ int numevents = 0;
+ int i;
+ int r;
+ DWORD timeoutval = WSA_INFINITE;
+ FD_SET outreadfds;
+ FD_SET outwritefds;
+ int nummatches = 0;
+
+ Assert(exceptfds == NULL);
+
+ /* Pending signals take priority: report EINTR before doing anything */
+ if (pgwin32_poll_signals())
+ return -1;
+
+ FD_ZERO(&outreadfds);
+ FD_ZERO(&outwritefds);
+
+ /*
+ * Windows does not guarantee to log an FD_WRITE network event indicating
+ * that more data can be sent unless the previous send() failed with
+ * WSAEWOULDBLOCK. While our caller might well have made such a call, we
+ * cannot assume that here. Therefore, if waiting for write-ready, force
+ * the issue by doing a dummy send(). If the dummy send() succeeds,
+ * assume that the socket is in fact write-ready, and return immediately.
+ * Also, if it fails with something other than WSAEWOULDBLOCK, return a
+ * write-ready indication to let our caller deal with the error condition.
+ */
+ if (writefds != NULL)
+ {
+ for (i = 0; i < writefds->fd_count; i++)
+ {
+ char c;
+ WSABUF buf;
+ DWORD sent;
+
+ /* Zero-length dummy send */
+ buf.buf = &c;
+ buf.len = 0;
+
+ r = WSASend(writefds->fd_array[i], &buf, 1, &sent, 0, NULL, NULL);
+ if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
+ FD_SET(writefds->fd_array[i], &outwritefds);
+ }
+
+ /* If we found any write-ready sockets, just return them immediately */
+ if (outwritefds.fd_count > 0)
+ {
+ memcpy(writefds, &outwritefds, sizeof(fd_set));
+ /* Read readiness is not checked on this path */
+ if (readfds)
+ FD_ZERO(readfds);
+ return outwritefds.fd_count;
+ }
+ }
+
+
+ /* Now set up for an actual select */
+
+ if (timeout != NULL)
+ {
+ /* timeoutval is in milliseconds */
+ /* (tv_usec is truncated, so a sub-millisecond remainder is dropped) */
+ timeoutval = timeout->tv_sec * 1000 + timeout->tv_usec / 1000;
+ }
+
+ /* One event object per distinct socket: all readers first ... */
+ if (readfds != NULL)
+ {
+ for (i = 0; i < readfds->fd_count; i++)
+ {
+ events[numevents] = WSACreateEvent();
+ sockets[numevents] = readfds->fd_array[i];
+ numevents++;
+ }
+ }
+ /* ... then writers that are not already listed as readers */
+ if (writefds != NULL)
+ {
+ for (i = 0; i < writefds->fd_count; i++)
+ {
+ if (!readfds ||
+ !FD_ISSET(writefds->fd_array[i], readfds))
+ {
+ /* If the socket is not in the read list */
+ events[numevents] = WSACreateEvent();
+ sockets[numevents] = writefds->fd_array[i];
+ numevents++;
+ }
+ }
+ }
+
+ /* Attach each event to its socket with the matching FD_xxx interest set */
+ for (i = 0; i < numevents; i++)
+ {
+ int flags = 0;
+
+ if (readfds && FD_ISSET(sockets[i], readfds))
+ flags |= FD_READ | FD_ACCEPT | FD_CLOSE;
+
+ if (writefds && FD_ISSET(sockets[i], writefds))
+ flags |= FD_WRITE | FD_CLOSE;
+
+ if (WSAEventSelect(sockets[i], events[i], flags) != 0)
+ {
+ TranslateSocketError();
+ /* release already-assigned event objects */
+ while (--i >= 0)
+ WSAEventSelect(sockets[i], NULL, 0);
+ for (i = 0; i < numevents; i++)
+ WSACloseEvent(events[i]);
+ return -1;
+ }
+ }
+
+ /* The signal event goes in the slot after the last socket event */
+ events[numevents] = pgwin32_signal_event;
+ r = WaitForMultipleObjectsEx(numevents + 1, events, FALSE, timeoutval, TRUE);
+ if (r != WAIT_TIMEOUT && r != WAIT_IO_COMPLETION && r != (WAIT_OBJECT_0 + numevents))
+ {
+ /*
+ * We scan all events, even those not signaled, in case more than one
+ * event has been tagged but Wait.. can only return one.
+ */
+ WSANETWORKEVENTS resEvents;
+
+ for (i = 0; i < numevents; i++)
+ {
+ ZeroMemory(&resEvents, sizeof(resEvents));
+ if (WSAEnumNetworkEvents(sockets[i], events[i], &resEvents) != 0)
+ elog(ERROR, "failed to enumerate network events: error code %u",
+ WSAGetLastError());
+ /* Read activity? */
+ if (readfds && FD_ISSET(sockets[i], readfds))
+ {
+ if ((resEvents.lNetworkEvents & FD_READ) ||
+ (resEvents.lNetworkEvents & FD_ACCEPT) ||
+ (resEvents.lNetworkEvents & FD_CLOSE))
+ {
+ FD_SET(sockets[i], &outreadfds);
+
+ nummatches++;
+ }
+ }
+ /* Write activity? */
+ if (writefds && FD_ISSET(sockets[i], writefds))
+ {
+ if ((resEvents.lNetworkEvents & FD_WRITE) ||
+ (resEvents.lNetworkEvents & FD_CLOSE))
+ {
+ FD_SET(sockets[i], &outwritefds);
+
+ nummatches++;
+ }
+ }
+ }
+ }
+
+ /* Clean up all the event objects */
+ for (i = 0; i < numevents; i++)
+ {
+ WSAEventSelect(sockets[i], NULL, 0);
+ WSACloseEvent(events[i]);
+ }
+
+ /* Timeout: nothing is ready, so hand back empty sets */
+ if (r == WSA_WAIT_TIMEOUT)
+ {
+ if (readfds)
+ FD_ZERO(readfds);
+ if (writefds)
+ FD_ZERO(writefds);
+ return 0;
+ }
+
+ /* Signal-like events. */
+ if (r == WAIT_OBJECT_0 + numevents || r == WAIT_IO_COMPLETION)
+ {
+ pgwin32_dispatch_queued_signals();
+ errno = EINTR;
+ if (readfds)
+ FD_ZERO(readfds);
+ if (writefds)
+ FD_ZERO(writefds);
+ return -1;
+ }
+
+ /* Overwrite socket sets with our resulting values */
+ if (readfds)
+ memcpy(readfds, &outreadfds, sizeof(fd_set));
+ if (writefds)
+ memcpy(writefds, &outwritefds, sizeof(fd_set));
+ return nummatches;
+}
diff --git a/src/backend/port/win32/timer.c b/src/backend/port/win32/timer.c
new file mode 100644
index 0000000..bb98178
--- /dev/null
+++ b/src/backend/port/win32/timer.c
@@ -0,0 +1,121 @@
+/*-------------------------------------------------------------------------
+ *
+ * timer.c
+ * Microsoft Windows Win32 Timer Implementation
+ *
+ * Limitations of this implementation:
+ *
+ * - Does not support interval timer (value->it_interval)
+ * - Only supports ITIMER_REAL
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/port/win32/timer.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+
+/*
+ * Communication area for inter-thread communication.
+ *
+ * setitimer() stores the requested timer setting in "value" and then sets
+ * "event"; pg_timer_thread() waits on "event" and re-reads "value" when it
+ * fires. Both sides touch "value" only while holding "crit_sec".
+ */
+typedef struct timerCA
+{
+ struct itimerval value; /* most recent setting passed to setitimer() */
+ HANDLE event; /* manual-reset event: "value" has been changed */
+ CRITICAL_SECTION crit_sec; /* serializes access to "value" */
+} timerCA;
+
+static timerCA timerCommArea;
+/* Handle of the timer thread; INVALID_HANDLE_VALUE until setitimer() creates it */
+static HANDLE timerThreadHandle = INVALID_HANDLE_VALUE;
+
+
+/*
+ * pg_timer_thread
+ *
+ * Body of the timer management thread.  The thread sleeps on
+ * timerCommArea.event using the currently armed timeout (INFINITE when no
+ * timer is armed).  When the event fires it recomputes the timeout from
+ * timerCommArea.value; when the wait times out it queues SIGALRM and disarms
+ * itself.  The loop never exits.
+ */
+static DWORD WINAPI
+pg_timer_thread(LPVOID param)
+{
+	DWORD		timeout_ms = INFINITE;
+
+	Assert(param == NULL);
+
+	for (;;)
+	{
+		int			rc;
+
+		rc = WaitForSingleObjectEx(timerCommArea.event, timeout_ms, FALSE);
+
+		switch (rc)
+		{
+			case WAIT_OBJECT_0:
+				/* The main thread changed the setting: pick it up */
+				EnterCriticalSection(&timerCommArea.crit_sec);
+				if (timerCommArea.value.it_value.tv_sec != 0 ||
+					timerCommArea.value.it_value.tv_usec != 0)
+				{
+					/* The wait API counts milliseconds; round usec up */
+					timeout_ms = (timerCommArea.value.it_value.tv_usec + 999) / 1000 +
+						timerCommArea.value.it_value.tv_sec * 1000;
+				}
+				else
+				{
+					/* An all-zero value cancels the pending interrupt */
+					timeout_ms = INFINITE;
+				}
+				ResetEvent(timerCommArea.event);
+				LeaveCriticalSection(&timerCommArea.crit_sec);
+				break;
+
+			case WAIT_TIMEOUT:
+				/* The timer expired: raise SIGALRM and go back to idle */
+				pg_queue_signal(SIGALRM);
+				timeout_ms = INFINITE;
+				break;
+
+			default:
+				/* Should never happen */
+				Assert(false);
+				break;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Win32 setitimer emulation by creating a persistent thread
+ * to handle the timer setting and notification upon timeout.
+ *
+ * which:  must be ITIMER_REAL (asserted).
+ * value:  new timer setting; must not be NULL and it_interval must be zero
+ *		   (asserted).  An all-zero it_value cancels a pending timer.
+ * ovalue: if not NULL, receives the previously stored setting (the value
+ *		   last passed in, not the time remaining).
+ *
+ * Always returns 0.  Failure to create the event or the thread on the first
+ * call is reported with ereport(FATAL).
+ */
+int
+setitimer(int which, const struct itimerval *value, struct itimerval *ovalue)
+{
+	Assert(value != NULL);
+	Assert(value->it_interval.tv_sec == 0 && value->it_interval.tv_usec == 0);
+	Assert(which == ITIMER_REAL);
+
+	if (timerThreadHandle == INVALID_HANDLE_VALUE)
+	{
+		HANDLE		thread;
+
+		/* First call in this backend, create event and the timer thread */
+		timerCommArea.event = CreateEvent(NULL, TRUE, FALSE, NULL);
+		if (timerCommArea.event == NULL)
+			ereport(FATAL,
+					(errmsg_internal("could not create timer event: error code %lu",
+									 GetLastError())));
+
+		MemSet(&timerCommArea.value, 0, sizeof(struct itimerval));
+
+		InitializeCriticalSection(&timerCommArea.crit_sec);
+
+		/*
+		 * CreateThread() reports failure by returning NULL, not
+		 * INVALID_HANDLE_VALUE.  Check for NULL, and publish the handle only
+		 * once we know it is valid, so that timerThreadHandle never holds a
+		 * value that looks like "thread already running" when it isn't.
+		 */
+		thread = CreateThread(NULL, 0, pg_timer_thread, NULL, 0, NULL);
+		if (thread == NULL)
+			ereport(FATAL,
+					(errmsg_internal("could not create timer thread: error code %lu",
+									 GetLastError())));
+		timerThreadHandle = thread;
+	}
+
+	/* Request the timer thread to change settings */
+	EnterCriticalSection(&timerCommArea.crit_sec);
+	if (ovalue)
+		*ovalue = timerCommArea.value;
+	timerCommArea.value = *value;
+	LeaveCriticalSection(&timerCommArea.crit_sec);
+
+	/* Wake the timer thread so that it re-reads the setting */
+	SetEvent(timerCommArea.event);
+
+	return 0;
+}
diff --git a/src/backend/port/win32_sema.c b/src/backend/port/win32_sema.c
new file mode 100644
index 0000000..d15c4c1
--- /dev/null
+++ b/src/backend/port/win32_sema.c
@@ -0,0 +1,235 @@
+/*-------------------------------------------------------------------------
+ *
+ * win32_sema.c
+ * Microsoft Windows Win32 Semaphores Emulation
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/port/win32_sema.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/pg_sema.h"
+
+static HANDLE *mySemSet; /* handles of semaphores created so far */
+static int numSems; /* number of semaphores created so far */
+static int maxSems; /* allocated size of mySemSet array */
+
+static void ReleaseSemaphores(int code, Datum arg);
+
+
+/*
+ * PGSemaphoreShmemSize
+ *
+ * Tell the caller how much shared memory the semaphore machinery wants.
+ * This implementation keeps nothing in shared memory, so the answer is
+ * zero regardless of maxSemas.
+ */
+Size
+PGSemaphoreShmemSize(int maxSemas)
+{
+	return (Size) 0;
+}
+
+/*
+ * PGReserveSemaphores --- set up semaphore bookkeeping
+ *
+ * Semaphores themselves are created lazily by PGSemaphoreCreate(); all we do
+ * here is allocate the array (maxSemas entries) that remembers their handles
+ * so that ReleaseSemaphores() can close them later, and register that
+ * cleanup routine.  The semaphores are anonymous, hence they are
+ * automatically freed when the last referencing process exits.
+ */
+void
+PGReserveSemaphores(int maxSemas)
+{
+	mySemSet = (HANDLE *) malloc(maxSemas * sizeof(HANDLE));
+	if (!mySemSet)
+		elog(PANIC, "out of memory");
+
+	/* Nothing created yet; remember the capacity for overflow checks */
+	maxSems = maxSemas;
+	numSems = 0;
+
+	on_shmem_exit(ReleaseSemaphores, 0);
+}
+
+/*
+ * Release semaphores at shutdown or shmem reinitialization
+ *
+ * (called as an on_shmem_exit callback, hence funny argument list)
+ *
+ * Closes every handle recorded in mySemSet and frees the array.  The
+ * bookkeeping variables are reset afterwards so that they never describe
+ * freed memory: a stray PGSemaphoreCreate() before the next
+ * PGReserveSemaphores() then fails its capacity check instead of writing
+ * through a dangling pointer.
+ */
+static void
+ReleaseSemaphores(int code, Datum arg)
+{
+	int			i;
+
+	for (i = 0; i < numSems; i++)
+		CloseHandle(mySemSet[i]);
+	free(mySemSet);
+
+	mySemSet = NULL;
+	numSems = 0;
+	maxSems = 0;
+}
+
+/*
+ * PGSemaphoreCreate
+ *
+ * Create a new anonymous, inheritable semaphore whose count starts at 1,
+ * record its handle in mySemSet, and hand it back as a PGSemaphore.
+ * Running out of reserved slots or failing to create the semaphore is a
+ * PANIC.
+ */
+PGSemaphore
+PGSemaphoreCreate(void)
+{
+	SECURITY_ATTRIBUTES attrs;
+	HANDLE		sem;
+
+	/* Can't do this in a backend, because static state is postmaster's */
+	Assert(!IsUnderPostmaster);
+
+	if (numSems >= maxSems)
+		elog(PANIC, "too many semaphores created");
+
+	/* Default security, but let child processes inherit the handle */
+	ZeroMemory(&attrs, sizeof(attrs));
+	attrs.nLength = sizeof(attrs);
+	attrs.lpSecurityDescriptor = NULL;
+	attrs.bInheritHandle = TRUE;
+
+	/* No name is needed: initial count 1, maximum count 32767 */
+	sem = CreateSemaphore(&attrs, 1, 32767, NULL);
+	if (sem == NULL)
+		ereport(PANIC,
+				(errmsg("could not create semaphore: error code %lu",
+						GetLastError())));
+
+	/* Success: remember the handle for ReleaseSemaphores() */
+	mySemSet[numSems] = sem;
+	numSems++;
+
+	return (PGSemaphore) sem;
+}
+
+/*
+ * PGSemaphoreReset
+ *
+ * Bring a previously-initialized PGSemaphore back to count 0.
+ *
+ * Win32 offers no direct call for this, so we simply keep taking the
+ * semaphore with non-blocking try-locks until one of them fails.
+ */
+void
+PGSemaphoreReset(PGSemaphore sema)
+{
+	for (;;)
+	{
+		if (!PGSemaphoreTryLock(sema))
+			break;
+	}
+}
+
+/*
+ * PGSemaphoreLock
+ *
+ * Lock a semaphore (decrement count), blocking if count would be < 0.
+ *
+ * The wait is performed on two objects at once: the signal event and the
+ * semaphore.  Whenever the signal event wins, queued signals are dispatched
+ * and the wait is restarted; the function returns only once the semaphore
+ * itself has been acquired.
+ */
+void
+PGSemaphoreLock(PGSemaphore sema)
+{
+	HANDLE		waitset[2];
+
+	/*
+	 * The signal event goes in slot 0 so that it is the one reported when
+	 * both objects are ready; that guarantees pending signals get serviced.
+	 */
+	waitset[0] = pgwin32_signal_event;
+	waitset[1] = sema;
+
+	/*
+	 * Like the other PGSemaphoreLock implementations, check for cancel/die
+	 * interrupts on every iteration.  Nothing services signals behind our
+	 * back here --- we do it explicitly below.
+	 */
+	for (;;)
+	{
+		DWORD		rc;
+
+		CHECK_FOR_INTERRUPTS();
+
+		rc = WaitForMultipleObjectsEx(2, waitset, FALSE, INFINITE, TRUE);
+
+		if (rc == WAIT_OBJECT_0 + 1)
+		{
+			/* The semaphore is ours */
+			return;
+		}
+		else if (rc == WAIT_OBJECT_0)
+		{
+			/* Signal event is set - deliver what is queued, then retry */
+			pgwin32_dispatch_queued_signals();
+		}
+		else if (rc == WAIT_IO_COMPLETION)
+		{
+			/*
+			 * The wait was interrupted so the system could run an I/O
+			 * completion routine or asynchronous procedure call in this
+			 * thread.  PostgreSQL does not provoke either of these, but
+			 * atypical loaded DLLs or even other processes might do so.
+			 * Just go around and wait again.
+			 */
+		}
+		else if (rc == WAIT_FAILED)
+		{
+			ereport(FATAL,
+					(errmsg("could not lock semaphore: error code %lu",
+							GetLastError())));
+		}
+		else
+		{
+			elog(FATAL, "unexpected return code from WaitForMultipleObjectsEx(): %lu", rc);
+		}
+	}
+}
+
+/*
+ * PGSemaphoreUnlock
+ *
+ * Unlock a semaphore by raising its count by one.  A failure of the
+ * underlying release call is reported as FATAL.
+ */
+void
+PGSemaphoreUnlock(PGSemaphore sema)
+{
+	if (ReleaseSemaphore(sema, 1, NULL))
+		return;
+
+	ereport(FATAL,
+			(errmsg("could not unlock semaphore: error code %lu",
+					GetLastError())));
+}
+
+/*
+ * PGSemaphoreTryLock
+ *
+ * Take the semaphore only if that can be done without blocking.
+ *
+ * Returns true when the semaphore was acquired.  Returns false, with errno
+ * set to EAGAIN, when it is currently unavailable.  Any other outcome of
+ * the wait is reported as FATAL.
+ */
+bool
+PGSemaphoreTryLock(PGSemaphore sema)
+{
+	/* A zero timeout makes the wait return immediately */
+	switch (WaitForSingleObject(sema, 0))
+	{
+		case WAIT_OBJECT_0:
+			/* Acquired */
+			return true;
+
+		case WAIT_TIMEOUT:
+			/* Somebody else holds it */
+			errno = EAGAIN;
+			return false;
+
+		default:
+			break;
+	}
+
+	/* Anything else means trouble */
+	ereport(FATAL,
+			(errmsg("could not try-lock semaphore: error code %lu",
+					GetLastError())));
+
+	/* keep compiler quiet */
+	return false;
+}
diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c
new file mode 100644
index 0000000..30b0730
--- /dev/null
+++ b/src/backend/port/win32_shmem.c
@@ -0,0 +1,599 @@
+/*-------------------------------------------------------------------------
+ *
+ * win32_shmem.c
+ * Implement shared memory using win32 facilities
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/port/win32_shmem.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "storage/dsm.h"
+#include "storage/ipc.h"
+#include "storage/pg_shmem.h"
+
+/*
+ * Early in a process's life, Windows asynchronously creates threads for the
+ * process's "default thread pool"
+ * (https://docs.microsoft.com/en-us/windows/desktop/ProcThread/thread-pools).
+ * Occasionally, thread creation allocates a stack after
+ * PGSharedMemoryReAttach() has released UsedShmemSegAddr and before it has
+ * mapped shared memory at UsedShmemSegAddr. This would cause mapping to fail
+ * if the allocator preferred the just-released region for allocating the new
+ * thread stack. We observed such failures in some Windows Server 2016
+ * configurations. To give the system another region to prefer, reserve and
+ * release an additional, protective region immediately before reserving or
+ * releasing shared memory. The idea is that, if the allocator handed out
+ * REGION1 pages before REGION2 pages at one occasion, it will do so whenever
+ * both regions are free. Windows Server 2016 exhibits that behavior, and a
+ * system behaving differently would have less need to protect
+ * UsedShmemSegAddr. The protective region must be at least large enough for
+ * one thread stack. However, ten times as much is less than 2% of the 32-bit
+ * address space and is negligible relative to the 64-bit address space.
+ */
+#define PROTECTIVE_REGION_SIZE (10 * WIN32_STACK_RLIMIT)
+/* Base address of the reserved protective region, or NULL if none is held */
+void *ShmemProtectiveRegion = NULL;
+
+/* Inheritable handle of the file mapping, or INVALID_HANDLE_VALUE if none */
+HANDLE UsedShmemSegID = INVALID_HANDLE_VALUE;
+/* Address at which the segment is (to be) mapped, or NULL if not attached */
+void *UsedShmemSegAddr = NULL;
+/* Size of the segment as created; used when reserving the range in a child */
+static Size UsedShmemSegSize = 0;
+
+static bool EnableLockPagesPrivilege(int elevel);
+static void pgwin32_SharedMemoryDelete(int status, Datum shmId);
+
+/*
+ * GetSharedMemName
+ *
+ * Build the name of the shared memory segment for this data directory.
+ *
+ * The name is a fixed prefix followed by the fully expanded path of DataDir,
+ * which makes it unique per data directory.  Every backslash in the result
+ * is then turned into a forward slash, since backslashes aren't permitted in
+ * global object names.
+ *
+ * The prefix is written as "Global\", which would place the object in the
+ * Global\ namespace (so that postmasters in different sessions could not be
+ * opened against the same data directory).  However, the backslash
+ * replacement below deliberately rewrites that backslash too; see the
+ * comment there.
+ *
+ * XXX: What happens with junctions? It's only someone breaking things on
+ * purpose, and this is still better than before, but we might want to do
+ * something about that sometime in the future.
+ *
+ * The result is malloc'd; the caller is responsible for free()ing it.
+ * Any failure is reported with elog(FATAL).
+ */
+static char *
+GetSharedMemName(void)
+{
+	static const char prefix[] = "Global\\PostgreSQL:";
+	const int	prefix_len = (int) sizeof(prefix) - 1;	/* excludes the NUL */
+	char	   *name;
+	char	   *p;
+	DWORD		path_size;
+	DWORD		written;
+
+	/* First ask how large a buffer the expanded path needs */
+	path_size = GetFullPathName(DataDir, 0, NULL, NULL);
+	if (path_size == 0)
+		elog(FATAL, "could not get size for full pathname of datadir %s: error code %lu",
+			 DataDir, GetLastError());
+
+	name = malloc(path_size + prefix_len);
+	if (name == NULL)
+		elog(FATAL, "could not allocate memory for shared memory name");
+
+	/* Prefix first, then the expanded path right behind it */
+	strcpy(name, prefix);
+	written = GetFullPathName(DataDir, path_size, name + prefix_len, NULL);
+	if (written == 0 || written > path_size)
+		elog(FATAL, "could not generate full pathname for datadir %s: error code %lu",
+			 DataDir, GetLastError());
+
+	/*
+	 * XXX: Intentionally overwriting the Global\ part here. This was not the
+	 * original approach, but putting it in the actual Global\ namespace
+	 * causes permission errors in a lot of cases, so we leave it in the
+	 * default namespace for now.
+	 */
+	p = name;
+	while (*p != '\0')
+	{
+		if (*p == '\\')
+			*p = '/';
+		p++;
+	}
+
+	return name;
+}
+
+
+/*
+ * PGSharedMemoryIsInUse
+ *
+ * Does a shared memory segment from an earlier incarnation still exist?
+ *
+ * The purpose is to notice the situation where a previous postmaster
+ * crashed but some of its child backends are still running.  Only segments
+ * belonging to the intended DataDir are of interest, so the lookup goes by
+ * the name derived from DataDir; the id1/id2 arguments are not consulted.
+ *
+ * Returns true if a mapping with that name can be opened, false otherwise.
+ */
+bool
+PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
+{
+	char	   *name;
+	HANDLE		mapping;
+	bool		in_use;
+
+	name = GetSharedMemName();
+	mapping = OpenFileMapping(FILE_MAP_READ, FALSE, name);
+	free(name);
+
+	in_use = (mapping != NULL);
+
+	/* We only wanted to know whether it exists; drop our reference */
+	if (in_use)
+		CloseHandle(mapping);
+
+	return in_use;
+}
+
+/*
+ * EnableLockPagesPrivilege
+ *
+ * Attempt to turn on SeLockMemoryPrivilege for the current process so that
+ * large pages can be used.
+ *
+ * elevel is the error level used to report any failure.  Returns true if the
+ * privilege was enabled; returns false if some step failed and elevel was
+ * low enough for ereport() to return.
+ */
+static bool
+EnableLockPagesPrivilege(int elevel)
+{
+	HANDLE		token;
+	TOKEN_PRIVILEGES privs;
+	LUID		lock_mem_luid;
+	bool		enabled = false;
+
+	if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token))
+	{
+		ereport(elevel,
+				(errmsg("could not enable Lock Pages in Memory user right: error code %lu", GetLastError()),
+				 errdetail("Failed system call was %s.", "OpenProcessToken")));
+		/* No token was opened, so there is nothing to close */
+		return false;
+	}
+
+	if (!LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &lock_mem_luid))
+	{
+		ereport(elevel,
+				(errmsg("could not enable Lock Pages in Memory user right: error code %lu", GetLastError()),
+				 errdetail("Failed system call was %s.", "LookupPrivilegeValue")));
+	}
+	else
+	{
+		privs.PrivilegeCount = 1;
+		privs.Privileges[0].Luid = lock_mem_luid;
+		privs.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+
+		if (!AdjustTokenPrivileges(token, FALSE, &privs, 0, NULL, NULL))
+		{
+			ereport(elevel,
+					(errmsg("could not enable Lock Pages in Memory user right: error code %lu", GetLastError()),
+					 errdetail("Failed system call was %s.", "AdjustTokenPrivileges")));
+		}
+		else if (GetLastError() == ERROR_SUCCESS)
+		{
+			/* The call succeeded and the privilege really was assigned */
+			enabled = true;
+		}
+		else if (GetLastError() == ERROR_NOT_ALL_ASSIGNED)
+		{
+			/* The call succeeded, but the account lacks the user right */
+			ereport(elevel,
+					(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+					 errmsg("could not enable Lock Pages in Memory user right"),
+					 errhint("Assign Lock Pages in Memory user right to the Windows user account which runs PostgreSQL.")));
+		}
+		else
+		{
+			ereport(elevel,
+					(errmsg("could not enable Lock Pages in Memory user right: error code %lu", GetLastError()),
+					 errdetail("Failed system call was %s.", "AdjustTokenPrivileges")));
+		}
+	}
+
+	CloseHandle(token);
+
+	return enabled;
+}
+
+/*
+ * PGSharedMemoryCreate
+ *
+ * Create a shared memory segment of the given size and initialize its
+ * standard header.
+ *
+ * size: requested segment size in bytes; must exceed the aligned header
+ *       size (asserted). It is rounded up to a multiple of the large page
+ *       size when huge pages end up being used.
+ * shim: output; receives the same header pointer that is returned.
+ *
+ * Returns a pointer to the header at the start of the mapped segment.
+ * Every failure is reported with FATAL.
+ *
+ * Side effects: reserves ShmemProtectiveRegion, sets UsedShmemSegAddr,
+ * UsedShmemSegSize and UsedShmemSegID, and registers
+ * pgwin32_SharedMemoryDelete as an on_shmem_exit callback.
+ */
+PGShmemHeader *
+PGSharedMemoryCreate(Size size,
+ PGShmemHeader **shim)
+{
+ void *memAddress;
+ PGShmemHeader *hdr;
+ HANDLE hmap,
+ hmap2;
+ char *szShareMem;
+ int i;
+ DWORD size_high;
+ DWORD size_low;
+ SIZE_T largePageSize = 0;
+ Size orig_size = size;
+ DWORD flProtect = PAGE_READWRITE;
+
+ /* Reserve the protective region before anything else (see file header) */
+ ShmemProtectiveRegion = VirtualAlloc(NULL, PROTECTIVE_REGION_SIZE,
+ MEM_RESERVE, PAGE_NOACCESS);
+ if (ShmemProtectiveRegion == NULL)
+ elog(FATAL, "could not reserve memory region: error code %lu",
+ GetLastError());
+
+ /* Room for a header? */
+ Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
+
+ szShareMem = GetSharedMemName();
+
+ UsedShmemSegAddr = NULL;
+
+ if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
+ {
+ /* Does the processor support large pages? */
+ largePageSize = GetLargePageMinimum();
+ if (largePageSize == 0)
+ {
+ /* FATAL when huge pages are mandatory, otherwise just a debug note */
+ ereport(huge_pages == HUGE_PAGES_ON ? FATAL : DEBUG1,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("the processor does not support large pages")));
+ ereport(DEBUG1,
+ (errmsg("disabling huge pages")));
+ }
+ else if (!EnableLockPagesPrivilege(huge_pages == HUGE_PAGES_ON ? FATAL : DEBUG1))
+ {
+ ereport(DEBUG1,
+ (errmsg("disabling huge pages")));
+ }
+ else
+ {
+ /* Huge pages available and privilege enabled, so turn on */
+ flProtect = PAGE_READWRITE | SEC_COMMIT | SEC_LARGE_PAGES;
+
+ /* Round size up as appropriate. */
+ if (size % largePageSize != 0)
+ size += largePageSize - (size % largePageSize);
+ }
+ }
+
+ /*
+ * Jumped to when the large-page attempt fails under HUGE_PAGES_TRY; size
+ * and flProtect have been reset by then, and the retry loop below starts
+ * over from its first iteration.
+ */
+retry:
+ /* CreateFileMapping() takes the size as separate high and low 32-bit halves */
+#ifdef _WIN64
+ size_high = size >> 32;
+#else
+ size_high = 0;
+#endif
+ size_low = (DWORD) size;
+
+ /*
+ * When recycling a shared memory segment, it may take a short while
+ * before it gets dropped from the global namespace. So re-try after
+ * sleeping for a second, and continue retrying 10 times. (both the 1
+ * second time and the 10 retries are completely arbitrary)
+ */
+ for (i = 0; i < 10; i++)
+ {
+ /*
+ * In case CreateFileMapping() doesn't set the error code to 0 on
+ * success
+ */
+ SetLastError(0);
+
+ hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */
+ NULL, /* Default security attrs */
+ flProtect,
+ size_high, /* Size Upper 32 Bits */
+ size_low, /* Size Lower 32 bits */
+ szShareMem);
+
+ if (!hmap)
+ {
+ if (GetLastError() == ERROR_NO_SYSTEM_RESOURCES &&
+ huge_pages == HUGE_PAGES_TRY &&
+ (flProtect & SEC_LARGE_PAGES) != 0)
+ {
+ elog(DEBUG1, "CreateFileMapping(%zu) with SEC_LARGE_PAGES failed, "
+ "huge pages disabled",
+ size);
+
+ /*
+ * Use the original size, not the rounded-up value, when
+ * falling back to non-huge pages.
+ */
+ size = orig_size;
+ flProtect = PAGE_READWRITE;
+ goto retry;
+ }
+ else
+ ereport(FATAL,
+ (errmsg("could not create shared memory segment: error code %lu", GetLastError()),
+ errdetail("Failed system call was CreateFileMapping(size=%zu, name=%s).",
+ size, szShareMem)));
+ }
+
+ /*
+ * If the segment already existed, CreateFileMapping() will return a
+ * handle to the existing one and set ERROR_ALREADY_EXISTS.
+ */
+ if (GetLastError() == ERROR_ALREADY_EXISTS)
+ {
+ CloseHandle(hmap); /* Close the handle, since we got a valid one
+ * to the previous segment. */
+ hmap = NULL;
+ Sleep(1000);
+ continue;
+ }
+ /* Got a brand-new segment, stop retrying */
+ break;
+ }
+
+ /*
+ * If the last call in the loop still returned ERROR_ALREADY_EXISTS, this
+ * shared memory segment exists and we assume it belongs to somebody else.
+ */
+ if (!hmap)
+ ereport(FATAL,
+ (errmsg("pre-existing shared memory block is still in use"),
+ errhint("Check if there are any old server processes still running, and terminate them.")));
+
+ free(szShareMem);
+
+ /*
+ * Make the handle inheritable
+ */
+ if (!DuplicateHandle(GetCurrentProcess(), hmap, GetCurrentProcess(), &hmap2, 0, TRUE, DUPLICATE_SAME_ACCESS))
+ ereport(FATAL,
+ (errmsg("could not create shared memory segment: error code %lu", GetLastError()),
+ errdetail("Failed system call was DuplicateHandle.")));
+
+ /*
+ * Close the old, non-inheritable handle. If this fails we don't really
+ * care.
+ */
+ if (!CloseHandle(hmap))
+ elog(LOG, "could not close handle to shared memory: error code %lu", GetLastError());
+
+
+ /*
+ * Get a pointer to the new shared memory segment. Map the whole segment
+ * at once, and let the system decide on the initial address.
+ */
+ memAddress = MapViewOfFileEx(hmap2, FILE_MAP_WRITE | FILE_MAP_READ, 0, 0, 0, NULL);
+ if (!memAddress)
+ ereport(FATAL,
+ (errmsg("could not create shared memory segment: error code %lu", GetLastError()),
+ errdetail("Failed system call was MapViewOfFileEx.")));
+
+
+
+ /*
+ * OK, we created a new segment. Mark it as created by this process. The
+ * order of assignments here is critical so that another Postgres process
+ * can't see the header as valid but belonging to an invalid PID!
+ */
+ hdr = (PGShmemHeader *) memAddress;
+ hdr->creatorPID = getpid();
+ hdr->magic = PGShmemMagic;
+
+ /*
+ * Initialize space allocation status for segment.
+ */
+ hdr->totalsize = size;
+ hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
+ hdr->dsm_control = 0;
+
+ /* Save info for possible future use */
+ UsedShmemSegAddr = memAddress;
+ UsedShmemSegSize = size;
+ UsedShmemSegID = hmap2;
+
+ /* Register on-exit routine to delete the new segment */
+ on_shmem_exit(pgwin32_SharedMemoryDelete, PointerGetDatum(hmap2));
+
+ *shim = hdr;
+ return hdr;
+}
+
+/*
+ * PGSharedMemoryReAttach
+ *
+ * This is called during startup of a postmaster child process to re-attach to
+ * an already existing shared memory segment, using the handle inherited from
+ * the postmaster.
+ *
+ * ShmemProtectiveRegion, UsedShmemSegID and UsedShmemSegAddr are implicit
+ * parameters to this routine. The caller must have already restored them to
+ * the postmaster's values.
+ *
+ * The two address ranges are released and the segment is then mapped at
+ * exactly UsedShmemSegAddr. Any failure, a mapping at a different address,
+ * or a header without the expected magic number is reported with FATAL.
+ */
+void
+PGSharedMemoryReAttach(void)
+{
+ PGShmemHeader *hdr;
+ void *origUsedShmemSegAddr = UsedShmemSegAddr;
+
+ Assert(ShmemProtectiveRegion != NULL);
+ Assert(UsedShmemSegAddr != NULL);
+ Assert(IsUnderPostmaster);
+
+ /*
+ * Release memory region reservations made by the postmaster
+ */
+ if (VirtualFree(ShmemProtectiveRegion, 0, MEM_RELEASE) == 0)
+ elog(FATAL, "failed to release reserved memory region (addr=%p): error code %lu",
+ ShmemProtectiveRegion, GetLastError());
+ if (VirtualFree(UsedShmemSegAddr, 0, MEM_RELEASE) == 0)
+ elog(FATAL, "failed to release reserved memory region (addr=%p): error code %lu",
+ UsedShmemSegAddr, GetLastError());
+
+ /* Map the whole segment at the address that was just released */
+ hdr = (PGShmemHeader *) MapViewOfFileEx(UsedShmemSegID, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, 0, UsedShmemSegAddr);
+ if (!hdr)
+ elog(FATAL, "could not reattach to shared memory (key=%p, addr=%p): error code %lu",
+ UsedShmemSegID, UsedShmemSegAddr, GetLastError());
+ if (hdr != origUsedShmemSegAddr)
+ elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
+ hdr, origUsedShmemSegAddr);
+ /* Sanity check: the mapped memory must start with our header magic */
+ if (hdr->magic != PGShmemMagic)
+ elog(FATAL, "reattaching to shared memory returned non-PostgreSQL memory");
+ dsm_set_control_handle(hdr->dsm_control);
+
+ UsedShmemSegAddr = hdr; /* probably redundant */
+}
+
+/*
+ * PGSharedMemoryNoReAttach
+ *
+ * This is called during startup of a postmaster child process when we choose
+ * *not* to re-attach to the existing shared memory segment. We must clean up
+ * to leave things in the appropriate state.
+ *
+ * The child process startup logic might or might not call PGSharedMemoryDetach
+ * after this; make sure that it will be a no-op if called.
+ *
+ * ShmemProtectiveRegion, UsedShmemSegID and UsedShmemSegAddr are implicit
+ * parameters to this routine. The caller must have already restored them to
+ * the postmaster's values.
+ */
+void
+PGSharedMemoryNoReAttach(void)
+{
+ Assert(ShmemProtectiveRegion != NULL);
+ Assert(UsedShmemSegAddr != NULL);
+ Assert(IsUnderPostmaster);
+
+ /*
+ * Under Windows we will not have mapped the segment, so we don't need to
+ * un-map it. Just reset UsedShmemSegAddr to show we're not attached.
+ * (With it set to NULL, PGSharedMemoryDetach below skips its unmap step.)
+ */
+ UsedShmemSegAddr = NULL;
+
+ /*
+ * We *must* close the inherited shmem segment handle, else Windows will
+ * consider the existence of this process to mean it can't release the
+ * shmem segment yet. We can now use PGSharedMemoryDetach to do that; it
+ * also releases the protective region and resets the remaining globals.
+ */
+ PGSharedMemoryDetach();
+}
+
+/*
+ * PGSharedMemoryDetach
+ *
+ * Let go of the shared memory segment, to whatever extent we still hold it.
+ * The creating process is not expected to call this directly (its
+ * on_shmem_exit callback takes care of it); it is meant for subprocesses
+ * that inherited an attachment and want to drop it.
+ *
+ * ShmemProtectiveRegion, UsedShmemSegID and UsedShmemSegAddr are implicit
+ * parameters to this routine.  Each is reset once handled, so calling the
+ * routine again is harmless.  Failures are only logged.
+ */
+void
+PGSharedMemoryDetach(void)
+{
+	/*
+	 * Freeing the protective region gives back only a trivial amount of
+	 * address space, but do it anyway for tidiness.
+	 */
+	if (ShmemProtectiveRegion != NULL)
+	{
+		if (!VirtualFree(ShmemProtectiveRegion, 0, MEM_RELEASE))
+			elog(LOG, "failed to release reserved memory region (addr=%p): error code %lu",
+				 ShmemProtectiveRegion, GetLastError());
+
+		ShmemProtectiveRegion = NULL;
+	}
+
+	/* Drop the view if one is mapped */
+	if (UsedShmemSegAddr != NULL)
+	{
+		if (UnmapViewOfFile(UsedShmemSegAddr) == 0)
+			elog(LOG, "could not unmap view of shared memory: error code %lu",
+				 GetLastError());
+
+		UsedShmemSegAddr = NULL;
+	}
+
+	/* Finally close the mapping handle if we own one */
+	if (UsedShmemSegID != INVALID_HANDLE_VALUE)
+	{
+		if (CloseHandle(UsedShmemSegID) == 0)
+			elog(LOG, "could not close handle to shared memory: error code %lu",
+				 GetLastError());
+
+		UsedShmemSegID = INVALID_HANDLE_VALUE;
+	}
+}
+
+
+/*
+ * pgwin32_SharedMemoryDelete
+ *
+ * Detach from and delete the shared memory segment
+ * (called as an on_shmem_exit callback, hence funny argument list)
+ *
+ * status is unused. shmId is the handle that was passed when the callback
+ * was registered; it is only cross-checked against UsedShmemSegID, and the
+ * actual work is delegated to PGSharedMemoryDetach().
+ */
+static void
+pgwin32_SharedMemoryDelete(int status, Datum shmId)
+{
+ Assert(DatumGetPointer(shmId) == UsedShmemSegID);
+ PGSharedMemoryDetach();
+}
+
+/*
+ * reserve_child_region
+ *
+ * Reserve "size" bytes at exactly "addr" in the address space of process
+ * hChild, with page protection "protect".  Returns true on success; on
+ * failure, or if the reservation landed at another address, logs the
+ * problem and returns false.
+ *
+ * This runs in the postmaster, so problems are reported at LOG level rather
+ * than FATAL.
+ */
+static bool
+reserve_child_region(HANDLE hChild, void *addr, SIZE_T size, DWORD protect)
+{
+	void	   *got;
+
+	got = VirtualAllocEx(hChild, addr, size, MEM_RESERVE, protect);
+
+	if (got == NULL)
+	{
+		/* Don't use FATAL since we're running in the postmaster */
+		elog(LOG, "could not reserve shared memory region (addr=%p) for child %p: error code %lu",
+			 addr, hChild, GetLastError());
+		return false;
+	}
+
+	if (got != addr)
+	{
+		/*
+		 * Should never happen - in theory if allocation granularity causes
+		 * strange effects it could, so check just in case.
+		 *
+		 * Don't use FATAL since we're running in the postmaster.
+		 */
+		elog(LOG, "reserved shared memory region got incorrect address %p, expected %p",
+			 got, addr);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * pgwin32_ReserveSharedMemoryRegion(hChild)
+ *
+ * Reserve, inside a child process, the address ranges that will later be
+ * used for the protective region and for shared memory.  This is done
+ * before the child starts running so that the ranges are still available.
+ *
+ * Once the child is running, DLLs loading in a different order or threads
+ * being scheduled differently may allocate memory that collides with the
+ * addresses we need.  Holding a reservation from before startup until just
+ * before the child attaches forces those allocations into other ranges.
+ *
+ * Returns true if both reservations succeeded at the expected addresses,
+ * false otherwise.
+ *
+ * NOTE! This function executes in the postmaster, and should for this
+ * reason not use elog(FATAL) since that would take down the postmaster.
+ */
+int
+pgwin32_ReserveSharedMemoryRegion(HANDLE hChild)
+{
+	Assert(ShmemProtectiveRegion != NULL);
+	Assert(UsedShmemSegAddr != NULL);
+	Assert(UsedShmemSegSize != 0);
+
+	/* ShmemProtectiveRegion */
+	if (!reserve_child_region(hChild, ShmemProtectiveRegion,
+							  PROTECTIVE_REGION_SIZE, PAGE_NOACCESS))
+		return false;
+
+	/* UsedShmemSegAddr */
+	if (!reserve_child_region(hChild, UsedShmemSegAddr,
+							  UsedShmemSegSize, PAGE_READWRITE))
+		return false;
+
+	return true;
+}