Diffstat
-rw-r--r-- | src/backend/port/.gitignore            |   3
-rw-r--r-- | src/backend/port/Makefile              |  48
-rwxr-xr-x | src/backend/port/aix/mkldexport.sh     |  61
-rw-r--r-- | src/backend/port/atomics.c             | 239
-rw-r--r-- | src/backend/port/hpux/tas.c.template   |  40
-rw-r--r-- | src/backend/port/posix_sema.c          | 388
-rw-r--r-- | src/backend/port/sysv_sema.c           | 517
-rw-r--r-- | src/backend/port/sysv_shmem.c          | 963
-rw-r--r-- | src/backend/port/tas/dummy.s           |   0
-rw-r--r-- | src/backend/port/tas/hpux_hppa.s       |  28
-rw-r--r-- | src/backend/port/tas/sunstudio_sparc.s |  53
-rw-r--r-- | src/backend/port/tas/sunstudio_x86.s   |  43
-rw-r--r-- | src/backend/port/win32/Makefile        |  23
-rw-r--r-- | src/backend/port/win32/crashdump.c     | 181
-rw-r--r-- | src/backend/port/win32/signal.c        | 354
-rw-r--r-- | src/backend/port/win32/socket.c        | 705
-rw-r--r-- | src/backend/port/win32/timer.c         | 121
-rw-r--r-- | src/backend/port/win32_sema.c          | 235
-rw-r--r-- | src/backend/port/win32_shmem.c         | 621
19 files changed, 4623 insertions, 0 deletions
diff --git a/src/backend/port/.gitignore b/src/backend/port/.gitignore new file mode 100644 index 0000000..4ef36b8 --- /dev/null +++ b/src/backend/port/.gitignore @@ -0,0 +1,3 @@ +/pg_sema.c +/pg_shmem.c +/tas.s diff --git a/src/backend/port/Makefile b/src/backend/port/Makefile new file mode 100644 index 0000000..2d00b4f --- /dev/null +++ b/src/backend/port/Makefile @@ -0,0 +1,48 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for the port-specific subsystem of the backend +# +# We have two different modes of operation: 1) put stuff specific to Port X +# in subdirectory X and have that subdirectory's make file make it all, and +# 2) use conditional statements in the present make file to include what's +# necessary for a specific port in our own output. (1) came first, but (2) +# is superior for many things, like when the same thing needs to be done for +# multiple ports and you don't want to duplicate files in multiple +# subdirectories. Much of the stuff done via Method 1 today should probably +# be converted to Method 2. +# +# IDENTIFICATION +# src/backend/port/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/port +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + $(TAS) \ + atomics.o \ + pg_sema.o \ + pg_shmem.o + +ifeq ($(PORTNAME), win32) +SUBDIRS += win32 +endif + +include $(top_srcdir)/src/backend/common.mk + +tas.o: tas.s +ifeq ($(SUN_STUDIO_CC), yes) +# preprocess assembler file with cpp + $(CC) $(CFLAGS) -c -P $< + mv $*.i $*_cpp.s + $(CC) $(CFLAGS) -c $*_cpp.s -o $@ +else + $(CC) $(CFLAGS) -c $< +endif + +distclean clean: + rm -f tas_cpp.s + $(MAKE) -C win32 clean diff --git a/src/backend/port/aix/mkldexport.sh b/src/backend/port/aix/mkldexport.sh new file mode 100755 index 0000000..adf3793 --- /dev/null +++ b/src/backend/port/aix/mkldexport.sh @@ -0,0 +1,61 @@ +#!/bin/sh +# +# mkldexport +# create an AIX exports file from an object file +# +# src/backend/port/aix/mkldexport.sh +# +# Usage: +# mkldexport objectfile [location] +# where +# objectfile is the current location of the object file. +# location is the eventual (installed) location of the +# object file (if different from the current +# working directory). +# +# [This file comes from the Postgres 4.2 distribution. - ay 7/95] +# +# Header: /usr/local/devel/postgres/src/tools/mkldexport/RCS/mkldexport.sh,v 1.2 1994/03/13 04:59:12 aoki Exp +# + +# setting this to nm -B might be better +# ... due to changes in AIX 4.x ... +# ... let us search in different directories - Gerhard Reithofer +if [ -x /usr/ucb/nm ] +then NM=/usr/ucb/nm +elif [ -x /usr/bin/nm ] +then NM=/usr/bin/nm +elif [ -x /usr/ccs/bin/nm ] +then NM=/usr/ccs/bin/nm +elif [ -x /usr/usg/bin/nm ] +then NM=/usr/usg/bin/nm +else echo "Fatal error: cannot find `nm' ... please check your installation." + exit 1 +fi + +CMDNAME=`basename $0` +if [ -z "$1" ]; then + echo "Usage: $CMDNAME object [location]" + exit 1 +fi +OBJNAME=`basename $1` +if [ "`basename $OBJNAME`" != "`basename $OBJNAME .o`" ]; then + OBJNAME=`basename $OBJNAME .o`.so +fi +if [ -z "$2" ]; then + echo '#!' +else + if [ "$2" = "." ]; then + # for the base executable (AIX 4.2 and up) + echo '#! .' + else + echo '#!' 
$2 + fi +fi +$NM -BCg $1 | \ + egrep ' [TDB] ' | \ + sed -e 's/.* //' | \ + egrep -v '\$' | \ + sed -e 's/^[.]//' | \ + sort | \ + uniq diff --git a/src/backend/port/atomics.c b/src/backend/port/atomics.c new file mode 100644 index 0000000..ba274be --- /dev/null +++ b/src/backend/port/atomics.c @@ -0,0 +1,239 @@ +/*------------------------------------------------------------------------- + * + * atomics.c + * Non-Inline parts of the atomics implementation + * + * Portions Copyright (c) 2013-2022, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/port/atomics.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "miscadmin.h" +#include "port/atomics.h" +#include "storage/spin.h" + +#ifdef PG_HAVE_MEMORY_BARRIER_EMULATION +#ifdef WIN32 +#error "barriers are required (and provided) on WIN32 platforms" +#endif +#include <signal.h> +#endif + +#ifdef PG_HAVE_MEMORY_BARRIER_EMULATION +void +pg_spinlock_barrier(void) +{ + /* + * NB: we have to be reentrant here, some barriers are placed in signal + * handlers. + * + * We use kill(0) for the fallback barrier as we assume that kernels on + * systems old enough to require fallback barrier support will include an + * appropriate barrier while checking the existence of the postmaster pid. + */ + (void) kill(PostmasterPid, 0); +} +#endif + +#ifdef PG_HAVE_COMPILER_BARRIER_EMULATION +void +pg_extern_compiler_barrier(void) +{ + /* do nothing */ +} +#endif + + +#ifdef PG_HAVE_ATOMIC_FLAG_SIMULATION + +void +pg_atomic_init_flag_impl(volatile pg_atomic_flag *ptr) +{ + StaticAssertStmt(sizeof(ptr->sema) >= sizeof(slock_t), + "size mismatch of atomic_flag vs slock_t"); + +#ifndef HAVE_SPINLOCKS + + /* + * NB: If we're using semaphore based TAS emulation, be careful to use a + * separate set of semaphores. Otherwise we'd get in trouble if an atomic + * var would be manipulated while spinlock is held. + */ + s_init_lock_sema((slock_t *) &ptr->sema, true); +#else + SpinLockInit((slock_t *) &ptr->sema); +#endif + + ptr->value = false; +} + +bool +pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr) +{ + uint32 oldval; + + SpinLockAcquire((slock_t *) &ptr->sema); + oldval = ptr->value; + ptr->value = true; + SpinLockRelease((slock_t *) &ptr->sema); + + return oldval == 0; +} + +void +pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr) +{ + SpinLockAcquire((slock_t *) &ptr->sema); + ptr->value = false; + SpinLockRelease((slock_t *) &ptr->sema); +} + +bool +pg_atomic_unlocked_test_flag_impl(volatile pg_atomic_flag *ptr) +{ + return ptr->value == 0; +} + +#endif /* PG_HAVE_ATOMIC_FLAG_SIMULATION */ + +#ifdef PG_HAVE_ATOMIC_U32_SIMULATION +void +pg_atomic_init_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val_) +{ + StaticAssertStmt(sizeof(ptr->sema) >= sizeof(slock_t), + "size mismatch of atomic_uint32 vs slock_t"); + + /* + * If we're using semaphore based atomic flags, be careful about nested + * usage of atomics while a spinlock is held. + */ +#ifndef HAVE_SPINLOCKS + s_init_lock_sema((slock_t *) &ptr->sema, true); +#else + SpinLockInit((slock_t *) &ptr->sema); +#endif + ptr->value = val_; +} + +void +pg_atomic_write_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val) +{ + /* + * One might think that an unlocked write doesn't need to acquire the + * spinlock, but one would be wrong. Even an unlocked write has to cause a + * concurrent pg_atomic_compare_exchange_u32() (et al) to fail. 
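+ *
+ * A hedged illustration (hypothetical caller, not part of this patch):
+ *
+ *   pg_atomic_uint32 flag;
+ *   pg_atomic_init_u32(&flag, 0);
+ *   pg_atomic_write_u32(&flag, 1);     -- takes ptr->sema internally
+ *   uint32 expected = 1;
+ *   pg_atomic_compare_exchange_u32(&flag, &expected, 2);
+ *
+ * If the write path skipped the lock, the compare-exchange could read
+ * a stale value even while it holds the semaphore.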
+ */ + SpinLockAcquire((slock_t *) &ptr->sema); + ptr->value = val; + SpinLockRelease((slock_t *) &ptr->sema); +} + +bool +pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr, + uint32 *expected, uint32 newval) +{ + bool ret; + + /* + * Do atomic op under a spinlock. It might look like we could just skip + * the cmpxchg if the lock isn't available, but that'd just emulate a + * 'weak' compare and swap. I.e. one that allows spurious failures. Since + * several algorithms rely on a strong variant and that is efficiently + * implementable on most major architectures let's emulate it here as + * well. + */ + SpinLockAcquire((slock_t *) &ptr->sema); + + /* perform compare/exchange logic */ + ret = ptr->value == *expected; + *expected = ptr->value; + if (ret) + ptr->value = newval; + + /* and release lock */ + SpinLockRelease((slock_t *) &ptr->sema); + + return ret; +} + +uint32 +pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_) +{ + uint32 oldval; + + SpinLockAcquire((slock_t *) &ptr->sema); + oldval = ptr->value; + ptr->value += add_; + SpinLockRelease((slock_t *) &ptr->sema); + return oldval; +} + +#endif /* PG_HAVE_ATOMIC_U32_SIMULATION */ + + +#ifdef PG_HAVE_ATOMIC_U64_SIMULATION + +void +pg_atomic_init_u64_impl(volatile pg_atomic_uint64 *ptr, uint64 val_) +{ + StaticAssertStmt(sizeof(ptr->sema) >= sizeof(slock_t), + "size mismatch of atomic_uint64 vs slock_t"); + + /* + * If we're using semaphore based atomic flags, be careful about nested + * usage of atomics while a spinlock is held. + */ +#ifndef HAVE_SPINLOCKS + s_init_lock_sema((slock_t *) &ptr->sema, true); +#else + SpinLockInit((slock_t *) &ptr->sema); +#endif + ptr->value = val_; +} + +bool +pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr, + uint64 *expected, uint64 newval) +{ + bool ret; + + /* + * Do atomic op under a spinlock. It might look like we could just skip + * the cmpxchg if the lock isn't available, but that'd just emulate a + * 'weak' compare and swap. I.e. one that allows spurious failures. Since + * several algorithms rely on a strong variant and that is efficiently + * implementable on most major architectures let's emulate it here as + * well. + */ + SpinLockAcquire((slock_t *) &ptr->sema); + + /* perform compare/exchange logic */ + ret = ptr->value == *expected; + *expected = ptr->value; + if (ret) + ptr->value = newval; + + /* and release lock */ + SpinLockRelease((slock_t *) &ptr->sema); + + return ret; +} + +uint64 +pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_) +{ + uint64 oldval; + + SpinLockAcquire((slock_t *) &ptr->sema); + oldval = ptr->value; + ptr->value += add_; + SpinLockRelease((slock_t *) &ptr->sema); + return oldval; +} + +#endif /* PG_HAVE_ATOMIC_U64_SIMULATION */ diff --git a/src/backend/port/hpux/tas.c.template b/src/backend/port/hpux/tas.c.template new file mode 100644 index 0000000..5ccbbcd --- /dev/null +++ b/src/backend/port/hpux/tas.c.template @@ -0,0 +1,40 @@ +/* + * tas() for HPPA. + * + * To generate tas.s using this template: + * 1. cc +O2 -S -c tas.c + * 2. edit tas.s: + * - replace the LDW with LDCWX + * 3. install as src/backend/port/tas/hpux_hppa.s. + * + * For details about the LDCWX instruction, see the "Precision + * Architecture and Instruction Reference Manual" (09740-90014 of June + * 1987), p. 5-38. + */ + +int +tas(lock) + int *lock; /* LDCWX is a word instruction */ +{ + /* + * LDCWX requires that we align the "semaphore" to a 16-byte + * boundary. The actual datum is a single word (4 bytes). 
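+ *
+ * Worked example of the round-up below, for two sample addresses:
+ * 0x1003 + 15 = 0x1012, and 0x1012 & ~15 = 0x1010 (the next 16-byte
+ * boundary); an already-aligned 0x1010 maps to itself.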
+ */ + lock = ((uintptr_t) lock + 15) & ~15; + + /* + * The LDCWX instruction atomically clears the target word and + * returns the previous value. Hence, if the instruction returns + * 0, someone else has already acquired the lock before we tested + * it (i.e., we have failed). + * + * Notice that this means that we actually clear the word to set + * the lock and set the word to clear the lock. This is the + * opposite behavior from the SPARC LDSTUB instruction. For some + * reason everything that H-P does is rather baroque... + */ + if (*lock) { /* this generates the LDW */ + return(0); /* success */ + } + return(1); /* failure */ +} diff --git a/src/backend/port/posix_sema.c b/src/backend/port/posix_sema.c new file mode 100644 index 0000000..a97a3ed --- /dev/null +++ b/src/backend/port/posix_sema.c @@ -0,0 +1,388 @@ +/*------------------------------------------------------------------------- + * + * posix_sema.c + * Implement PGSemaphores using POSIX semaphore facilities + * + * We prefer the unnamed style of POSIX semaphore (the kind made with + * sem_init). We can cope with the kind made with sem_open, however. + * + * In either implementation, typedef PGSemaphore is equivalent to "sem_t *". + * With unnamed semaphores, the sem_t structs live in an array in shared + * memory. With named semaphores, that's not true because we cannot persuade + * sem_open to do its allocation there. Therefore, the named-semaphore code + * *does not cope with EXEC_BACKEND*. The sem_t structs will just be in the + * postmaster's private memory, where they are successfully inherited by + * forked backends, but they could not be accessed by exec'd backends. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/port/posix_sema.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <fcntl.h> +#include <semaphore.h> +#include <signal.h> +#include <unistd.h> +#include <sys/stat.h> + +#include "miscadmin.h" +#include "storage/ipc.h" +#include "storage/pg_sema.h" +#include "storage/shmem.h" + + +/* see file header comment */ +#if defined(USE_NAMED_POSIX_SEMAPHORES) && defined(EXEC_BACKEND) +#error cannot use named POSIX semaphores with EXEC_BACKEND +#endif + +typedef union SemTPadded +{ + sem_t pgsem; + char pad[PG_CACHE_LINE_SIZE]; +} SemTPadded; + +/* typedef PGSemaphore is equivalent to pointer to sem_t */ +typedef struct PGSemaphoreData +{ + SemTPadded sem_padded; +} PGSemaphoreData; + +#define PG_SEM_REF(x) (&(x)->sem_padded.pgsem) + +#define IPCProtection (0600) /* access/modify by user only */ + +#ifdef USE_NAMED_POSIX_SEMAPHORES +static sem_t **mySemPointers; /* keep track of created semaphores */ +#else +static PGSemaphore sharedSemas; /* array of PGSemaphoreData in shared memory */ +#endif +static int numSems; /* number of semas acquired so far */ +static int maxSems; /* allocated size of above arrays */ +static int nextSemKey; /* next name to try */ + + +static void ReleaseSemaphores(int status, Datum arg); + + +#ifdef USE_NAMED_POSIX_SEMAPHORES + +/* + * PosixSemaphoreCreate + * + * Attempt to create a new named semaphore. + * + * If we fail with a failure code other than collision-with-existing-sema, + * print out an error and abort. Other types of errors suggest nonrecoverable + * problems. 
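+ *
+ * ("Collision" here means sem_open() failed with EEXIST, EACCES, or
+ * EINTR for the candidate name; the loop below then simply tries the
+ * next key.)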
+ */ +static sem_t * +PosixSemaphoreCreate(void) +{ + int semKey; + char semname[64]; + sem_t *mySem; + + for (;;) + { + semKey = nextSemKey++; + + snprintf(semname, sizeof(semname), "/pgsql-%d", semKey); + + mySem = sem_open(semname, O_CREAT | O_EXCL, + (mode_t) IPCProtection, (unsigned) 1); + +#ifdef SEM_FAILED + if (mySem != (sem_t *) SEM_FAILED) + break; +#else + if (mySem != (sem_t *) (-1)) + break; +#endif + + /* Loop if error indicates a collision */ + if (errno == EEXIST || errno == EACCES || errno == EINTR) + continue; + + /* + * Else complain and abort + */ + elog(FATAL, "sem_open(\"%s\") failed: %m", semname); + } + + /* + * Unlink the semaphore immediately, so it can't be accessed externally. + * This also ensures that it will go away if we crash. + */ + sem_unlink(semname); + + return mySem; +} +#else /* !USE_NAMED_POSIX_SEMAPHORES */ + +/* + * PosixSemaphoreCreate + * + * Attempt to create a new unnamed semaphore. + */ +static void +PosixSemaphoreCreate(sem_t *sem) +{ + if (sem_init(sem, 1, 1) < 0) + elog(FATAL, "sem_init failed: %m"); +} +#endif /* USE_NAMED_POSIX_SEMAPHORES */ + + +/* + * PosixSemaphoreKill - removes a semaphore + */ +static void +PosixSemaphoreKill(sem_t *sem) +{ +#ifdef USE_NAMED_POSIX_SEMAPHORES + /* Got to use sem_close for named semaphores */ + if (sem_close(sem) < 0) + elog(LOG, "sem_close failed: %m"); +#else + /* Got to use sem_destroy for unnamed semaphores */ + if (sem_destroy(sem) < 0) + elog(LOG, "sem_destroy failed: %m"); +#endif +} + + +/* + * Report amount of shared memory needed for semaphores + */ +Size +PGSemaphoreShmemSize(int maxSemas) +{ +#ifdef USE_NAMED_POSIX_SEMAPHORES + /* No shared memory needed in this case */ + return 0; +#else + /* Need a PGSemaphoreData per semaphore */ + return mul_size(maxSemas, sizeof(PGSemaphoreData)); +#endif +} + +/* + * PGReserveSemaphores --- initialize semaphore support + * + * This is called during postmaster start or shared memory reinitialization. + * It should do whatever is needed to be able to support up to maxSemas + * subsequent PGSemaphoreCreate calls. Also, if any system resources + * are acquired here or in PGSemaphoreCreate, register an on_shmem_exit + * callback to release them. + * + * In the Posix implementation, we acquire semaphores on-demand; the + * maxSemas parameter is just used to size the arrays. For unnamed + * semaphores, there is an array of PGSemaphoreData structs in shared memory. + * For named semaphores, we keep a postmaster-local array of sem_t pointers, + * which we use for releasing the semaphores when done. + * (This design minimizes the dependency of postmaster shutdown on the + * contents of shared memory, which a failed backend might have clobbered. + * We can't do much about the possibility of sem_destroy() crashing, but + * we don't have to expose the counters to other processes.) + */ +void +PGReserveSemaphores(int maxSemas) +{ + struct stat statbuf; + + /* + * We use the data directory's inode number to seed the search for free + * semaphore keys. This minimizes the odds of collision with other + * postmasters, while maximizing the odds that we will detect and clean up + * semaphores left over from a crashed postmaster in our own directory. 
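+ *
+ * (For example, a data directory whose inode number is 123456 makes
+ * the named-semaphore code try /pgsql-123456, /pgsql-123457, and so
+ * on, until sem_open() succeeds.)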
+ */ + if (stat(DataDir, &statbuf) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not stat data directory \"%s\": %m", + DataDir))); + +#ifdef USE_NAMED_POSIX_SEMAPHORES + mySemPointers = (sem_t **) malloc(maxSemas * sizeof(sem_t *)); + if (mySemPointers == NULL) + elog(PANIC, "out of memory"); +#else + + /* + * We must use ShmemAllocUnlocked(), since the spinlock protecting + * ShmemAlloc() won't be ready yet. (This ordering is necessary when we + * are emulating spinlocks with semaphores.) + */ + sharedSemas = (PGSemaphore) + ShmemAllocUnlocked(PGSemaphoreShmemSize(maxSemas)); +#endif + + numSems = 0; + maxSems = maxSemas; + nextSemKey = statbuf.st_ino; + + on_shmem_exit(ReleaseSemaphores, 0); +} + +/* + * Release semaphores at shutdown or shmem reinitialization + * + * (called as an on_shmem_exit callback, hence funny argument list) + */ +static void +ReleaseSemaphores(int status, Datum arg) +{ + int i; + +#ifdef USE_NAMED_POSIX_SEMAPHORES + for (i = 0; i < numSems; i++) + PosixSemaphoreKill(mySemPointers[i]); + free(mySemPointers); +#endif + +#ifdef USE_UNNAMED_POSIX_SEMAPHORES + for (i = 0; i < numSems; i++) + PosixSemaphoreKill(PG_SEM_REF(sharedSemas + i)); +#endif +} + +/* + * PGSemaphoreCreate + * + * Allocate a PGSemaphore structure with initial count 1 + */ +PGSemaphore +PGSemaphoreCreate(void) +{ + PGSemaphore sema; + sem_t *newsem; + + /* Can't do this in a backend, because static state is postmaster's */ + Assert(!IsUnderPostmaster); + + if (numSems >= maxSems) + elog(PANIC, "too many semaphores created"); + +#ifdef USE_NAMED_POSIX_SEMAPHORES + newsem = PosixSemaphoreCreate(); + /* Remember new sema for ReleaseSemaphores */ + mySemPointers[numSems] = newsem; + sema = (PGSemaphore) newsem; +#else + sema = &sharedSemas[numSems]; + newsem = PG_SEM_REF(sema); + PosixSemaphoreCreate(newsem); +#endif + + numSems++; + + return sema; +} + +/* + * PGSemaphoreReset + * + * Reset a previously-initialized PGSemaphore to have count 0 + */ +void +PGSemaphoreReset(PGSemaphore sema) +{ + /* + * There's no direct API for this in POSIX, so we have to ratchet the + * semaphore down to 0 with repeated trywait's. + */ + for (;;) + { + if (sem_trywait(PG_SEM_REF(sema)) < 0) + { + if (errno == EAGAIN || errno == EDEADLK) + break; /* got it down to 0 */ + if (errno == EINTR) + continue; /* can this happen? */ + elog(FATAL, "sem_trywait failed: %m"); + } + } +} + +/* + * PGSemaphoreLock + * + * Lock a semaphore (decrement count), blocking if count would be < 0 + */ +void +PGSemaphoreLock(PGSemaphore sema) +{ + int errStatus; + + /* See notes in sysv_sema.c's implementation of PGSemaphoreLock. */ + do + { + errStatus = sem_wait(PG_SEM_REF(sema)); + } while (errStatus < 0 && errno == EINTR); + + if (errStatus < 0) + elog(FATAL, "sem_wait failed: %m"); +} + +/* + * PGSemaphoreUnlock + * + * Unlock a semaphore (increment count) + */ +void +PGSemaphoreUnlock(PGSemaphore sema) +{ + int errStatus; + + /* + * Note: if errStatus is -1 and errno == EINTR then it means we returned + * from the operation prematurely because we were sent a signal. So we + * try and unlock the semaphore again. Not clear this can really happen, + * but might as well cope. 
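+ *
+ * (The do/while below is the standard EINTR-retry idiom: a signal
+ * arriving during sem_post() simply causes one more iteration.)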
+ */ + do + { + errStatus = sem_post(PG_SEM_REF(sema)); + } while (errStatus < 0 && errno == EINTR); + + if (errStatus < 0) + elog(FATAL, "sem_post failed: %m"); +} + +/* + * PGSemaphoreTryLock + * + * Lock a semaphore only if able to do so without blocking + */ +bool +PGSemaphoreTryLock(PGSemaphore sema) +{ + int errStatus; + + /* + * Note: if errStatus is -1 and errno == EINTR then it means we returned + * from the operation prematurely because we were sent a signal. So we + * try and lock the semaphore again. + */ + do + { + errStatus = sem_trywait(PG_SEM_REF(sema)); + } while (errStatus < 0 && errno == EINTR); + + if (errStatus < 0) + { + if (errno == EAGAIN || errno == EDEADLK) + return false; /* failed to lock it */ + /* Otherwise we got trouble */ + elog(FATAL, "sem_trywait failed: %m"); + } + + return true; +} diff --git a/src/backend/port/sysv_sema.c b/src/backend/port/sysv_sema.c new file mode 100644 index 0000000..ea3ad6d --- /dev/null +++ b/src/backend/port/sysv_sema.c @@ -0,0 +1,517 @@ +/*------------------------------------------------------------------------- + * + * sysv_sema.c + * Implement PGSemaphores using SysV semaphore facilities + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/port/sysv_sema.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <unistd.h> +#include <sys/file.h> +#include <sys/stat.h> +#ifdef HAVE_SYS_IPC_H +#include <sys/ipc.h> +#endif +#ifdef HAVE_SYS_SEM_H +#include <sys/sem.h> +#endif + +#include "miscadmin.h" +#include "storage/ipc.h" +#include "storage/pg_sema.h" +#include "storage/shmem.h" + + +typedef struct PGSemaphoreData +{ + int semId; /* semaphore set identifier */ + int semNum; /* semaphore number within set */ +} PGSemaphoreData; + +#ifndef HAVE_UNION_SEMUN +union semun +{ + int val; + struct semid_ds *buf; + unsigned short *array; +}; +#endif + +typedef key_t IpcSemaphoreKey; /* semaphore key passed to semget(2) */ +typedef int IpcSemaphoreId; /* semaphore ID returned by semget(2) */ + +/* + * SEMAS_PER_SET is the number of useful semaphores in each semaphore set + * we allocate. It must be *less than* your kernel's SEMMSL (max semaphores + * per set) parameter, which is often around 25. (Less than, because we + * allocate one extra sema in each set for identification purposes.) 
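+ *
+ * (Sizing example: with SEMAS_PER_SET = 16, a request for 100
+ * semaphores allocates ceil(100 / 16) = 7 sets, each created with 17
+ * semaphores -- 16 useful ones plus the identification sema.)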
+ */ +#define SEMAS_PER_SET 16 + +#define IPCProtection (0600) /* access/modify by user only */ + +#define PGSemaMagic 537 /* must be less than SEMVMX */ + + +static PGSemaphore sharedSemas; /* array of PGSemaphoreData in shared memory */ +static int numSharedSemas; /* number of PGSemaphoreDatas used so far */ +static int maxSharedSemas; /* allocated size of PGSemaphoreData array */ +static IpcSemaphoreId *mySemaSets; /* IDs of sema sets acquired so far */ +static int numSemaSets; /* number of sema sets acquired so far */ +static int maxSemaSets; /* allocated size of mySemaSets array */ +static IpcSemaphoreKey nextSemaKey; /* next key to try using */ +static int nextSemaNumber; /* next free sem num in last sema set */ + + +static IpcSemaphoreId InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey, + int numSems); +static void IpcSemaphoreInitialize(IpcSemaphoreId semId, int semNum, + int value); +static void IpcSemaphoreKill(IpcSemaphoreId semId); +static int IpcSemaphoreGetValue(IpcSemaphoreId semId, int semNum); +static pid_t IpcSemaphoreGetLastPID(IpcSemaphoreId semId, int semNum); +static IpcSemaphoreId IpcSemaphoreCreate(int numSems); +static void ReleaseSemaphores(int status, Datum arg); + + +/* + * InternalIpcSemaphoreCreate + * + * Attempt to create a new semaphore set with the specified key. + * Will fail (return -1) if such a set already exists. + * + * If we fail with a failure code other than collision-with-existing-set, + * print out an error and abort. Other types of errors suggest nonrecoverable + * problems. + */ +static IpcSemaphoreId +InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey, int numSems) +{ + int semId; + + semId = semget(semKey, numSems, IPC_CREAT | IPC_EXCL | IPCProtection); + + if (semId < 0) + { + int saved_errno = errno; + + /* + * Fail quietly if error indicates a collision with existing set. One + * would expect EEXIST, given that we said IPC_EXCL, but perhaps we + * could get a permission violation instead? Also, EIDRM might occur + * if an old set is slated for destruction but not gone yet. + */ + if (saved_errno == EEXIST || saved_errno == EACCES +#ifdef EIDRM + || saved_errno == EIDRM +#endif + ) + return -1; + + /* + * Else complain and abort + */ + ereport(FATAL, + (errmsg("could not create semaphores: %m"), + errdetail("Failed system call was semget(%lu, %d, 0%o).", + (unsigned long) semKey, numSems, + IPC_CREAT | IPC_EXCL | IPCProtection), + (saved_errno == ENOSPC) ? + errhint("This error does *not* mean that you have run out of disk space. " + "It occurs when either the system limit for the maximum number of " + "semaphore sets (SEMMNI), or the system wide maximum number of " + "semaphores (SEMMNS), would be exceeded. You need to raise the " + "respective kernel parameter. Alternatively, reduce PostgreSQL's " + "consumption of semaphores by reducing its max_connections parameter.\n" + "The PostgreSQL documentation contains more information about " + "configuring your system for PostgreSQL.") : 0)); + } + + return semId; +} + +/* + * Initialize a semaphore to the specified value. + */ +static void +IpcSemaphoreInitialize(IpcSemaphoreId semId, int semNum, int value) +{ + union semun semun; + + semun.val = value; + if (semctl(semId, semNum, SETVAL, semun) < 0) + { + int saved_errno = errno; + + ereport(FATAL, + (errmsg_internal("semctl(%d, %d, SETVAL, %d) failed: %m", + semId, semNum, value), + (saved_errno == ERANGE) ? + errhint("You possibly need to raise your kernel's SEMVMX value to be at least " + "%d. 
Look into the PostgreSQL documentation for details.", + value) : 0)); + } +} + +/* + * IpcSemaphoreKill(semId) - removes a semaphore set + */ +static void +IpcSemaphoreKill(IpcSemaphoreId semId) +{ + union semun semun; + + semun.val = 0; /* unused, but keep compiler quiet */ + + if (semctl(semId, 0, IPC_RMID, semun) < 0) + elog(LOG, "semctl(%d, 0, IPC_RMID, ...) failed: %m", semId); +} + +/* Get the current value (semval) of the semaphore */ +static int +IpcSemaphoreGetValue(IpcSemaphoreId semId, int semNum) +{ + union semun dummy; /* for Solaris */ + + dummy.val = 0; /* unused */ + + return semctl(semId, semNum, GETVAL, dummy); +} + +/* Get the PID of the last process to do semop() on the semaphore */ +static pid_t +IpcSemaphoreGetLastPID(IpcSemaphoreId semId, int semNum) +{ + union semun dummy; /* for Solaris */ + + dummy.val = 0; /* unused */ + + return semctl(semId, semNum, GETPID, dummy); +} + + +/* + * Create a semaphore set with the given number of useful semaphores + * (an additional sema is actually allocated to serve as identifier). + * Dead Postgres sema sets are recycled if found, but we do not fail + * upon collision with non-Postgres sema sets. + * + * The idea here is to detect and re-use keys that may have been assigned + * by a crashed postmaster or backend. + */ +static IpcSemaphoreId +IpcSemaphoreCreate(int numSems) +{ + IpcSemaphoreId semId; + union semun semun; + PGSemaphoreData mysema; + + /* Loop till we find a free IPC key */ + for (nextSemaKey++;; nextSemaKey++) + { + pid_t creatorPID; + + /* Try to create new semaphore set */ + semId = InternalIpcSemaphoreCreate(nextSemaKey, numSems + 1); + if (semId >= 0) + break; /* successful create */ + + /* See if it looks to be leftover from a dead Postgres process */ + semId = semget(nextSemaKey, numSems + 1, 0); + if (semId < 0) + continue; /* failed: must be some other app's */ + if (IpcSemaphoreGetValue(semId, numSems) != PGSemaMagic) + continue; /* sema belongs to a non-Postgres app */ + + /* + * If the creator PID is my own PID or does not belong to any extant + * process, it's safe to zap it. + */ + creatorPID = IpcSemaphoreGetLastPID(semId, numSems); + if (creatorPID <= 0) + continue; /* oops, GETPID failed */ + if (creatorPID != getpid()) + { + if (kill(creatorPID, 0) == 0 || errno != ESRCH) + continue; /* sema belongs to a live process */ + } + + /* + * The sema set appears to be from a dead Postgres process, or from a + * previous cycle of life in this same process. Zap it, if possible. + * This probably shouldn't fail, but if it does, assume the sema set + * belongs to someone else after all, and continue quietly. + */ + semun.val = 0; /* unused, but keep compiler quiet */ + if (semctl(semId, 0, IPC_RMID, semun) < 0) + continue; + + /* + * Now try again to create the sema set. + */ + semId = InternalIpcSemaphoreCreate(nextSemaKey, numSems + 1); + if (semId >= 0) + break; /* successful create */ + + /* + * Can only get here if some other process managed to create the same + * sema key before we did. Let him have that one, loop around to try + * next key. + */ + } + + /* + * OK, we created a new sema set. Mark it as created by this process. We + * do this by setting the spare semaphore to PGSemaMagic-1 and then + * incrementing it with semop(). That leaves it with value PGSemaMagic + * and sempid referencing this process. 
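+ *
+ * (A later postmaster can then recognize a dead Postgres set, roughly:
+ *
+ *   if (IpcSemaphoreGetValue(semId, numSems) == PGSemaMagic &&
+ *       kill(creatorPID, 0) < 0 && errno == ESRCH)
+ *       ... safe to recycle ...
+ *
+ * which is the test performed earlier in this function.)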
+ */ + IpcSemaphoreInitialize(semId, numSems, PGSemaMagic - 1); + mysema.semId = semId; + mysema.semNum = numSems; + PGSemaphoreUnlock(&mysema); + + return semId; +} + + +/* + * Report amount of shared memory needed for semaphores + */ +Size +PGSemaphoreShmemSize(int maxSemas) +{ + return mul_size(maxSemas, sizeof(PGSemaphoreData)); +} + +/* + * PGReserveSemaphores --- initialize semaphore support + * + * This is called during postmaster start or shared memory reinitialization. + * It should do whatever is needed to be able to support up to maxSemas + * subsequent PGSemaphoreCreate calls. Also, if any system resources + * are acquired here or in PGSemaphoreCreate, register an on_shmem_exit + * callback to release them. + * + * In the SysV implementation, we acquire semaphore sets on-demand; the + * maxSemas parameter is just used to size the arrays. There is an array + * of PGSemaphoreData structs in shared memory, and a postmaster-local array + * with one entry per SysV semaphore set, which we use for releasing the + * semaphore sets when done. (This design ensures that postmaster shutdown + * doesn't rely on the contents of shared memory, which a failed backend might + * have clobbered.) + */ +void +PGReserveSemaphores(int maxSemas) +{ + struct stat statbuf; + + /* + * We use the data directory's inode number to seed the search for free + * semaphore keys. This minimizes the odds of collision with other + * postmasters, while maximizing the odds that we will detect and clean up + * semaphores left over from a crashed postmaster in our own directory. + */ + if (stat(DataDir, &statbuf) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not stat data directory \"%s\": %m", + DataDir))); + + /* + * We must use ShmemAllocUnlocked(), since the spinlock protecting + * ShmemAlloc() won't be ready yet. (This ordering is necessary when we + * are emulating spinlocks with semaphores.) 
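+ *
+ * (When spinlocks are emulated with semaphores, SpinLockAcquire itself
+ * goes through a PGSemaphore, so the semaphore array must be carved
+ * out before any spinlock-protected allocation can run.)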
+ */ + sharedSemas = (PGSemaphore) + ShmemAllocUnlocked(PGSemaphoreShmemSize(maxSemas)); + numSharedSemas = 0; + maxSharedSemas = maxSemas; + + maxSemaSets = (maxSemas + SEMAS_PER_SET - 1) / SEMAS_PER_SET; + mySemaSets = (IpcSemaphoreId *) + malloc(maxSemaSets * sizeof(IpcSemaphoreId)); + if (mySemaSets == NULL) + elog(PANIC, "out of memory"); + numSemaSets = 0; + nextSemaKey = statbuf.st_ino; + nextSemaNumber = SEMAS_PER_SET; /* force sema set alloc on 1st call */ + + on_shmem_exit(ReleaseSemaphores, 0); +} + +/* + * Release semaphores at shutdown or shmem reinitialization + * + * (called as an on_shmem_exit callback, hence funny argument list) + */ +static void +ReleaseSemaphores(int status, Datum arg) +{ + int i; + + for (i = 0; i < numSemaSets; i++) + IpcSemaphoreKill(mySemaSets[i]); + free(mySemaSets); +} + +/* + * PGSemaphoreCreate + * + * Allocate a PGSemaphore structure with initial count 1 + */ +PGSemaphore +PGSemaphoreCreate(void) +{ + PGSemaphore sema; + + /* Can't do this in a backend, because static state is postmaster's */ + Assert(!IsUnderPostmaster); + + if (nextSemaNumber >= SEMAS_PER_SET) + { + /* Time to allocate another semaphore set */ + if (numSemaSets >= maxSemaSets) + elog(PANIC, "too many semaphores created"); + mySemaSets[numSemaSets] = IpcSemaphoreCreate(SEMAS_PER_SET); + numSemaSets++; + nextSemaNumber = 0; + } + /* Use the next shared PGSemaphoreData */ + if (numSharedSemas >= maxSharedSemas) + elog(PANIC, "too many semaphores created"); + sema = &sharedSemas[numSharedSemas++]; + /* Assign the next free semaphore in the current set */ + sema->semId = mySemaSets[numSemaSets - 1]; + sema->semNum = nextSemaNumber++; + /* Initialize it to count 1 */ + IpcSemaphoreInitialize(sema->semId, sema->semNum, 1); + + return sema; +} + +/* + * PGSemaphoreReset + * + * Reset a previously-initialized PGSemaphore to have count 0 + */ +void +PGSemaphoreReset(PGSemaphore sema) +{ + IpcSemaphoreInitialize(sema->semId, sema->semNum, 0); +} + +/* + * PGSemaphoreLock + * + * Lock a semaphore (decrement count), blocking if count would be < 0 + */ +void +PGSemaphoreLock(PGSemaphore sema) +{ + int errStatus; + struct sembuf sops; + + sops.sem_op = -1; /* decrement */ + sops.sem_flg = 0; + sops.sem_num = sema->semNum; + + /* + * Note: if errStatus is -1 and errno == EINTR then it means we returned + * from the operation prematurely because we were sent a signal. So we + * try and lock the semaphore again. + * + * We used to check interrupts here, but that required servicing + * interrupts directly from signal handlers. Which is hard to do safely + * and portably. + */ + do + { + errStatus = semop(sema->semId, &sops, 1); + } while (errStatus < 0 && errno == EINTR); + + if (errStatus < 0) + elog(FATAL, "semop(id=%d) failed: %m", sema->semId); +} + +/* + * PGSemaphoreUnlock + * + * Unlock a semaphore (increment count) + */ +void +PGSemaphoreUnlock(PGSemaphore sema) +{ + int errStatus; + struct sembuf sops; + + sops.sem_op = 1; /* increment */ + sops.sem_flg = 0; + sops.sem_num = sema->semNum; + + /* + * Note: if errStatus is -1 and errno == EINTR then it means we returned + * from the operation prematurely because we were sent a signal. So we + * try and unlock the semaphore again. Not clear this can really happen, + * but might as well cope. 
+ */ + do + { + errStatus = semop(sema->semId, &sops, 1); + } while (errStatus < 0 && errno == EINTR); + + if (errStatus < 0) + elog(FATAL, "semop(id=%d) failed: %m", sema->semId); +} + +/* + * PGSemaphoreTryLock + * + * Lock a semaphore only if able to do so without blocking + */ +bool +PGSemaphoreTryLock(PGSemaphore sema) +{ + int errStatus; + struct sembuf sops; + + sops.sem_op = -1; /* decrement */ + sops.sem_flg = IPC_NOWAIT; /* but don't block */ + sops.sem_num = sema->semNum; + + /* + * Note: if errStatus is -1 and errno == EINTR then it means we returned + * from the operation prematurely because we were sent a signal. So we + * try and lock the semaphore again. + */ + do + { + errStatus = semop(sema->semId, &sops, 1); + } while (errStatus < 0 && errno == EINTR); + + if (errStatus < 0) + { + /* Expect EAGAIN or EWOULDBLOCK (platform-dependent) */ +#ifdef EAGAIN + if (errno == EAGAIN) + return false; /* failed to lock it */ +#endif +#if defined(EWOULDBLOCK) && (!defined(EAGAIN) || (EWOULDBLOCK != EAGAIN)) + if (errno == EWOULDBLOCK) + return false; /* failed to lock it */ +#endif + /* Otherwise we got trouble */ + elog(FATAL, "semop(id=%d) failed: %m", sema->semId); + } + + return true; +} diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c new file mode 100644 index 0000000..ea287c7 --- /dev/null +++ b/src/backend/port/sysv_shmem.c @@ -0,0 +1,963 @@ +/*------------------------------------------------------------------------- + * + * sysv_shmem.c + * Implement shared memory using SysV facilities + * + * These routines used to be a fairly thin layer on top of SysV shared + * memory functionality. With the addition of anonymous-shmem logic, + * they're a bit fatter now. We still require a SysV shmem block to + * exist, though, because mmap'd shmem provides no way to find out how + * many processes are attached, which we need for interlocking purposes. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/port/sysv_shmem.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <unistd.h> +#include <sys/file.h> +#include <sys/mman.h> +#include <sys/stat.h> +#ifdef HAVE_SYS_IPC_H +#include <sys/ipc.h> +#endif +#ifdef HAVE_SYS_SHM_H +#include <sys/shm.h> +#endif + +#include "miscadmin.h" +#include "port/pg_bitutils.h" +#include "portability/mem.h" +#include "storage/dsm.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/pg_shmem.h" +#include "utils/guc.h" +#include "utils/pidfile.h" + + +/* + * As of PostgreSQL 9.3, we normally allocate only a very small amount of + * System V shared memory, and only for the purposes of providing an + * interlock to protect the data directory. The real shared memory block + * is allocated using mmap(). This works around the problem that many + * systems have very low limits on the amount of System V shared memory + * that can be allocated. Even a limit of a few megabytes will be enough + * to run many copies of PostgreSQL without needing to adjust system settings. + * + * We assume that no one will attempt to run PostgreSQL 9.3 or later on + * systems that are ancient enough that anonymous shared memory is not + * supported, such as pre-2.4 versions of Linux. 
If that turns out to be + * false, we might need to add compile and/or run-time tests here and do this + * only if the running kernel supports it. + * + * However, we must always disable this logic in the EXEC_BACKEND case, and + * fall back to the old method of allocating the entire segment using System V + * shared memory, because there's no way to attach an anonymous mmap'd segment + * to a process after exec(). Since EXEC_BACKEND is intended only for + * developer use, this shouldn't be a big problem. Because of this, we do + * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below. + * + * As of PostgreSQL 12, we regained the ability to use a large System V shared + * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set + * to sysv (though this is not the default). + */ + + +typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */ +typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */ + +/* + * How does a given IpcMemoryId relate to this PostgreSQL process? + * + * One could recycle unattached segments of different data directories if we + * distinguished that case from other SHMSTATE_FOREIGN cases. Doing so would + * cause us to visit less of the key space, making us less likely to detect a + * SHMSTATE_ATTACHED key. It would also complicate the concurrency analysis, + * in that postmasters of different data directories could simultaneously + * attempt to recycle a given key. We'll waste keys longer in some cases, but + * avoiding the problems of the alternative justifies that loss. + */ +typedef enum +{ + SHMSTATE_ANALYSIS_FAILURE, /* unexpected failure to analyze the ID */ + SHMSTATE_ATTACHED, /* pertinent to DataDir, has attached PIDs */ + SHMSTATE_ENOENT, /* no segment of that ID */ + SHMSTATE_FOREIGN, /* exists, but not pertinent to DataDir */ + SHMSTATE_UNATTACHED /* pertinent to DataDir, no attached PIDs */ +} IpcMemoryState; + + +unsigned long UsedShmemSegID = 0; +void *UsedShmemSegAddr = NULL; + +static Size AnonymousShmemSize; +static void *AnonymousShmem = NULL; + +static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size); +static void IpcMemoryDetach(int status, Datum shmaddr); +static void IpcMemoryDelete(int status, Datum shmId); +static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId, + void *attachAt, + PGShmemHeader **addr); + + +/* + * InternalIpcMemoryCreate(memKey, size) + * + * Attempt to create a new shared memory segment with the specified key. + * Will fail (return NULL) if such a segment already exists. If successful, + * attach the segment to the current process and return its attached address. + * On success, callbacks are registered with on_shmem_exit to detach and + * delete the segment when on_shmem_exit is called. + * + * If we fail with a failure code other than collision-with-existing-segment, + * print out an error and abort. Other types of errors are not recoverable. + */ +static void * +InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size) +{ + IpcMemoryId shmid; + void *requestedAddress = NULL; + void *memAddress; + + /* + * Normally we just pass requestedAddress = NULL to shmat(), allowing the + * system to choose where the segment gets mapped. But in an EXEC_BACKEND + * build, it's possible for whatever is chosen in the postmaster to not + * work for backends, due to variations in address space layout. As a + * rather klugy workaround, allow the user to specify the address to use + * via setting the environment variable PG_SHMEM_ADDR. 
(If this were of + * interest for anything except debugging, we'd probably create a cleaner + * and better-documented way to set it, such as a GUC.) + */ +#ifdef EXEC_BACKEND + { + char *pg_shmem_addr = getenv("PG_SHMEM_ADDR"); + + if (pg_shmem_addr) + requestedAddress = (void *) strtoul(pg_shmem_addr, NULL, 0); + else + { +#if defined(__darwin__) && SIZEOF_VOID_P == 8 + /* + * Provide a default value that is believed to avoid problems with + * ASLR on the current macOS release. + */ + requestedAddress = (void *) 0x80000000000; +#endif + } + } +#endif + + shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection); + + if (shmid < 0) + { + int shmget_errno = errno; + + /* + * Fail quietly if error indicates a collision with existing segment. + * One would expect EEXIST, given that we said IPC_EXCL, but perhaps + * we could get a permission violation instead? Also, EIDRM might + * occur if an old seg is slated for destruction but not gone yet. + */ + if (shmget_errno == EEXIST || shmget_errno == EACCES +#ifdef EIDRM + || shmget_errno == EIDRM +#endif + ) + return NULL; + + /* + * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if + * there is an existing segment but it's smaller than "size" (this is + * a result of poorly-thought-out ordering of error tests). To + * distinguish between collision and invalid size in such cases, we + * make a second try with size = 0. These kernels do not test size + * against SHMMIN in the preexisting-segment case, so we will not get + * EINVAL a second time if there is such a segment. + */ + if (shmget_errno == EINVAL) + { + shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection); + + if (shmid < 0) + { + /* As above, fail quietly if we verify a collision */ + if (errno == EEXIST || errno == EACCES +#ifdef EIDRM + || errno == EIDRM +#endif + ) + return NULL; + /* Otherwise, fall through to report the original error */ + } + else + { + /* + * On most platforms we cannot get here because SHMMIN is + * greater than zero. However, if we do succeed in creating a + * zero-size segment, free it and then fall through to report + * the original error. + */ + if (shmctl(shmid, IPC_RMID, NULL) < 0) + elog(LOG, "shmctl(%d, %d, 0) failed: %m", + (int) shmid, IPC_RMID); + } + } + + /* + * Else complain and abort. + * + * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX + * is violated. SHMALL violation might be reported as either ENOMEM + * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which + * it should be. SHMMNI violation is ENOSPC, per spec. Just plain + * not-enough-RAM is ENOMEM. + */ + errno = shmget_errno; + ereport(FATAL, + (errmsg("could not create shared memory segment: %m"), + errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o).", + (unsigned long) memKey, size, + IPC_CREAT | IPC_EXCL | IPCProtection), + (shmget_errno == EINVAL) ? + errhint("This error usually means that PostgreSQL's request for a shared memory " + "segment exceeded your kernel's SHMMAX parameter, or possibly that " + "it is less than " + "your kernel's SHMMIN parameter.\n" + "The PostgreSQL documentation contains more information about shared " + "memory configuration.") : 0, + (shmget_errno == ENOMEM) ? + errhint("This error usually means that PostgreSQL's request for a shared " + "memory segment exceeded your kernel's SHMALL parameter. 
You might need " + "to reconfigure the kernel with larger SHMALL.\n" + "The PostgreSQL documentation contains more information about shared " + "memory configuration.") : 0, + (shmget_errno == ENOSPC) ? + errhint("This error does *not* mean that you have run out of disk space. " + "It occurs either if all available shared memory IDs have been taken, " + "in which case you need to raise the SHMMNI parameter in your kernel, " + "or because the system's overall limit for shared memory has been " + "reached.\n" + "The PostgreSQL documentation contains more information about shared " + "memory configuration.") : 0)); + } + + /* Register on-exit routine to delete the new segment */ + on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid)); + + /* OK, should be able to attach to the segment */ + memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS); + + if (memAddress == (void *) -1) + elog(FATAL, "shmat(id=%d, addr=%p, flags=0x%x) failed: %m", + shmid, requestedAddress, PG_SHMAT_FLAGS); + + /* Register on-exit routine to detach new segment before deleting */ + on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress)); + + /* + * Store shmem key and ID in data directory lockfile. Format to try to + * keep it the same length always (trailing junk in the lockfile won't + * hurt, but might confuse humans). + */ + { + char line[64]; + + sprintf(line, "%9lu %9lu", + (unsigned long) memKey, (unsigned long) shmid); + AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line); + } + + return memAddress; +} + +/****************************************************************************/ +/* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */ +/* from process' address space */ +/* (called as an on_shmem_exit callback, hence funny argument list) */ +/****************************************************************************/ +static void +IpcMemoryDetach(int status, Datum shmaddr) +{ + /* Detach System V shared memory block. */ + if (shmdt((void *) DatumGetPointer(shmaddr)) < 0) + elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr)); +} + +/****************************************************************************/ +/* IpcMemoryDelete(status, shmId) deletes a shared memory segment */ +/* (called as an on_shmem_exit callback, hence funny argument list) */ +/****************************************************************************/ +static void +IpcMemoryDelete(int status, Datum shmId) +{ + if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0) + elog(LOG, "shmctl(%d, %d, 0) failed: %m", + DatumGetInt32(shmId), IPC_RMID); +} + +/* + * PGSharedMemoryIsInUse + * + * Is a previously-existing shmem segment still existing and in use? + * + * The point of this exercise is to detect the case where a prior postmaster + * crashed, but it left child backends that are still running. Therefore + * we only care about shmem segments that are associated with the intended + * DataDir. This is an important consideration since accidental matches of + * shmem segment IDs are reasonably common. 
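+ *
+ * (The switch below maps ENOENT, FOREIGN, and UNATTACHED to "not in
+ * use"; ATTACHED, or any failure to analyze the segment, is treated
+ * conservatively as "in use".)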
+ */ +bool +PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2) +{ + PGShmemHeader *memAddress; + IpcMemoryState state; + + state = PGSharedMemoryAttach((IpcMemoryId) id2, NULL, &memAddress); + if (memAddress && shmdt((void *) memAddress) < 0) + elog(LOG, "shmdt(%p) failed: %m", memAddress); + switch (state) + { + case SHMSTATE_ENOENT: + case SHMSTATE_FOREIGN: + case SHMSTATE_UNATTACHED: + return false; + case SHMSTATE_ANALYSIS_FAILURE: + case SHMSTATE_ATTACHED: + return true; + } + return true; +} + +/* + * Test for a segment with id shmId; see comment at IpcMemoryState. + * + * If the segment exists, we'll attempt to attach to it, using attachAt + * if that's not NULL (but it's best to pass NULL if possible). + * + * *addr is set to the segment memory address if we attached to it, else NULL. + */ +static IpcMemoryState +PGSharedMemoryAttach(IpcMemoryId shmId, + void *attachAt, + PGShmemHeader **addr) +{ + struct shmid_ds shmStat; + struct stat statbuf; + PGShmemHeader *hdr; + + *addr = NULL; + + /* + * First, try to stat the shm segment ID, to see if it exists at all. + */ + if (shmctl(shmId, IPC_STAT, &shmStat) < 0) + { + /* + * EINVAL actually has multiple possible causes documented in the + * shmctl man page, but we assume it must mean the segment no longer + * exists. + */ + if (errno == EINVAL) + return SHMSTATE_ENOENT; + + /* + * EACCES implies we have no read permission, which means it is not a + * Postgres shmem segment (or at least, not one that is relevant to + * our data directory). + */ + if (errno == EACCES) + return SHMSTATE_FOREIGN; + + /* + * Some Linux kernel versions (in fact, all of them as of July 2007) + * sometimes return EIDRM when EINVAL is correct. The Linux kernel + * actually does not have any internal state that would justify + * returning EIDRM, so we can get away with assuming that EIDRM is + * equivalent to EINVAL on that platform. + */ +#ifdef HAVE_LINUX_EIDRM_BUG + if (errno == EIDRM) + return SHMSTATE_ENOENT; +#endif + + /* + * Otherwise, we had better assume that the segment is in use. The + * only likely case is (non-Linux, assumed spec-compliant) EIDRM, + * which implies that the segment has been IPC_RMID'd but there are + * still processes attached to it. + */ + return SHMSTATE_ANALYSIS_FAILURE; + } + + /* + * Try to attach to the segment and see if it matches our data directory. + * This avoids any risk of duplicate-shmem-key conflicts on machines that + * are running several postmasters under the same userid. + * + * (When we're called from PGSharedMemoryCreate, this stat call is + * duplicative; but since this isn't a high-traffic case it's not worth + * trying to optimize.) + */ + if (stat(DataDir, &statbuf) < 0) + return SHMSTATE_ANALYSIS_FAILURE; /* can't stat; be conservative */ + + hdr = (PGShmemHeader *) shmat(shmId, attachAt, PG_SHMAT_FLAGS); + if (hdr == (PGShmemHeader *) -1) + { + /* + * Attachment failed. The cases we're interested in are the same as + * for the shmctl() call above. In particular, note that the owning + * postmaster could have terminated and removed the segment between + * shmctl() and shmat(). + * + * If attachAt isn't NULL, it's possible that EINVAL reflects a + * problem with that address not a vanished segment, so it's best to + * pass NULL when probing for conflicting segments. 
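+ * (PGSharedMemoryIsInUse does pass attachAt = NULL for exactly this
+ * reason.)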
+ */ + if (errno == EINVAL) + return SHMSTATE_ENOENT; /* segment disappeared */ + if (errno == EACCES) + return SHMSTATE_FOREIGN; /* must be non-Postgres */ +#ifdef HAVE_LINUX_EIDRM_BUG + if (errno == EIDRM) + return SHMSTATE_ENOENT; /* segment disappeared */ +#endif + /* Otherwise, be conservative. */ + return SHMSTATE_ANALYSIS_FAILURE; + } + *addr = hdr; + + if (hdr->magic != PGShmemMagic || + hdr->device != statbuf.st_dev || + hdr->inode != statbuf.st_ino) + { + /* + * It's either not a Postgres segment, or not one for my data + * directory. + */ + return SHMSTATE_FOREIGN; + } + + /* + * It does match our data directory, so now test whether any processes are + * still attached to it. (We are, now, but the shm_nattch result is from + * before we attached to it.) + */ + return shmStat.shm_nattch == 0 ? SHMSTATE_UNATTACHED : SHMSTATE_ATTACHED; +} + +/* + * Identify the huge page size to use, and compute the related mmap flags. + * + * Some Linux kernel versions have a bug causing mmap() to fail on requests + * that are not a multiple of the hugepage size. Versions without that bug + * instead silently round the request up to the next hugepage multiple --- + * and then munmap() fails when we give it a size different from that. + * So we have to round our request up to a multiple of the actual hugepage + * size to avoid trouble. + * + * Doing the round-up ourselves also lets us make use of the extra memory, + * rather than just wasting it. Currently, we just increase the available + * space recorded in the shmem header, which will make the extra usable for + * purposes such as additional locktable entries. Someday, for very large + * hugepage sizes, we might want to think about more invasive strategies, + * such as increasing shared_buffers to absorb the extra space. + * + * Returns the (real, assumed or config provided) page size into + * *hugepagesize, and the hugepage-related mmap flags to use into + * *mmap_flags if requested by the caller. If huge pages are not supported, + * *hugepagesize and *mmap_flags are set to 0. + */ +void +GetHugePageSize(Size *hugepagesize, int *mmap_flags) +{ +#ifdef MAP_HUGETLB + + Size default_hugepagesize = 0; + Size hugepagesize_local = 0; + int mmap_flags_local = 0; + + /* + * System-dependent code to find out the default huge page size. + * + * On Linux, read /proc/meminfo looking for a line like "Hugepagesize: + * nnnn kB". Ignore any failures, falling back to the preset default. + */ +#ifdef __linux__ + + { + FILE *fp = AllocateFile("/proc/meminfo", "r"); + char buf[128]; + unsigned int sz; + char ch; + + if (fp) + { + while (fgets(buf, sizeof(buf), fp)) + { + if (sscanf(buf, "Hugepagesize: %u %c", &sz, &ch) == 2) + { + if (ch == 'k') + { + default_hugepagesize = sz * (Size) 1024; + break; + } + /* We could accept other units besides kB, if needed */ + } + } + FreeFile(fp); + } + } +#endif /* __linux__ */ + + if (huge_page_size != 0) + { + /* If huge page size is requested explicitly, use that. */ + hugepagesize_local = (Size) huge_page_size * 1024; + } + else if (default_hugepagesize != 0) + { + /* Otherwise use the system default, if we have it. */ + hugepagesize_local = default_hugepagesize; + } + else + { + /* + * If we fail to find out the system's default huge page size, or no + * huge page size is requested explicitly, assume it is 2MB. This will + * work fine when the actual size is less. 
If it's more, we might get + * mmap() or munmap() failures due to unaligned requests; but at this + * writing, there are no reports of any non-Linux systems being picky + * about that. + */ + hugepagesize_local = 2 * 1024 * 1024; + } + + mmap_flags_local = MAP_HUGETLB; + + /* + * On recent enough Linux, also include the explicit page size, if + * necessary. + */ +#if defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT) + if (hugepagesize_local != default_hugepagesize) + { + int shift = pg_ceil_log2_64(hugepagesize_local); + + mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + } +#endif + + /* assign the results found */ + if (mmap_flags) + *mmap_flags = mmap_flags_local; + if (hugepagesize) + *hugepagesize = hugepagesize_local; + +#else + + if (hugepagesize) + *hugepagesize = 0; + if (mmap_flags) + *mmap_flags = 0; + +#endif /* MAP_HUGETLB */ +} + +/* + * Creates an anonymous mmap()ed shared memory segment. + * + * Pass the requested size in *size. This function will modify *size to the + * actual size of the allocation, if it ends up allocating a segment that is + * larger than requested. + */ +static void * +CreateAnonymousSegment(Size *size) +{ + Size allocsize = *size; + void *ptr = MAP_FAILED; + int mmap_errno = 0; + +#ifndef MAP_HUGETLB + /* PGSharedMemoryCreate should have dealt with this case */ + Assert(huge_pages != HUGE_PAGES_ON); +#else + if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY) + { + /* + * Round up the request size to a suitable large value. + */ + Size hugepagesize; + int mmap_flags; + + GetHugePageSize(&hugepagesize, &mmap_flags); + + if (allocsize % hugepagesize != 0) + allocsize += hugepagesize - (allocsize % hugepagesize); + + ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, + PG_MMAP_FLAGS | mmap_flags, -1, 0); + mmap_errno = errno; + if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED) + elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m", + allocsize); + } +#endif + + if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON) + { + /* + * Use the original size, not the rounded-up value, when falling back + * to non-huge pages. + */ + allocsize = *size; + ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, + PG_MMAP_FLAGS, -1, 0); + mmap_errno = errno; + } + + if (ptr == MAP_FAILED) + { + errno = mmap_errno; + ereport(FATAL, + (errmsg("could not map anonymous shared memory: %m"), + (mmap_errno == ENOMEM) ? + errhint("This error usually means that PostgreSQL's request " + "for a shared memory segment exceeded available memory, " + "swap space, or huge pages. To reduce the request size " + "(currently %zu bytes), reduce PostgreSQL's shared " + "memory usage, perhaps by reducing shared_buffers or " + "max_connections.", + allocsize) : 0)); + } + + *size = allocsize; + return ptr; +} + +/* + * AnonymousShmemDetach --- detach from an anonymous mmap'd block + * (called as an on_shmem_exit callback, hence funny argument list) + */ +static void +AnonymousShmemDetach(int status, Datum arg) +{ + /* Release anonymous shared memory block, if any. */ + if (AnonymousShmem != NULL) + { + if (munmap(AnonymousShmem, AnonymousShmemSize) < 0) + elog(LOG, "munmap(%p, %zu) failed: %m", + AnonymousShmem, AnonymousShmemSize); + AnonymousShmem = NULL; + } +} + +/* + * PGSharedMemoryCreate + * + * Create a shared memory segment of the given size and initialize its + * standard header. Also, register an on_shmem_exit callback to release + * the storage. 
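+ *
+ * (Key-search illustration: with a data-directory inode of 123456, the
+ * loop below calls shmget() with keys 123456, 123457, ..., advancing
+ * past keys held by foreign segments and recycling any key whose
+ * segment header matches this DataDir's device and inode and has no
+ * processes attached.)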
+ * + * Dead Postgres segments pertinent to this DataDir are recycled if found, but + * we do not fail upon collision with foreign shmem segments. The idea here + * is to detect and re-use keys that may have been assigned by a crashed + * postmaster or backend. + */ +PGShmemHeader * +PGSharedMemoryCreate(Size size, + PGShmemHeader **shim) +{ + IpcMemoryKey NextShmemSegID; + void *memAddress; + PGShmemHeader *hdr; + struct stat statbuf; + Size sysvsize; + + /* + * We use the data directory's ID info (inode and device numbers) to + * positively identify shmem segments associated with this data dir, and + * also as seeds for searching for a free shmem key. + */ + if (stat(DataDir, &statbuf) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not stat data directory \"%s\": %m", + DataDir))); + + /* Complain if hugepages demanded but we can't possibly support them */ +#if !defined(MAP_HUGETLB) + if (huge_pages == HUGE_PAGES_ON) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("huge pages not supported on this platform"))); +#endif + + /* For now, we don't support huge pages in SysV memory */ + if (huge_pages == HUGE_PAGES_ON && shared_memory_type != SHMEM_TYPE_MMAP) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("huge pages not supported with the current shared_memory_type setting"))); + + /* Room for a header? */ + Assert(size > MAXALIGN(sizeof(PGShmemHeader))); + + if (shared_memory_type == SHMEM_TYPE_MMAP) + { + AnonymousShmem = CreateAnonymousSegment(&size); + AnonymousShmemSize = size; + + /* Register on-exit routine to unmap the anonymous segment */ + on_shmem_exit(AnonymousShmemDetach, (Datum) 0); + + /* Now we need only allocate a minimal-sized SysV shmem block. */ + sysvsize = sizeof(PGShmemHeader); + } + else + sysvsize = size; + + /* + * Loop till we find a free IPC key. Trust CreateDataDirLockFile() to + * ensure no more than one postmaster per data directory can enter this + * loop simultaneously. (CreateDataDirLockFile() does not entirely ensure + * that, but prefer fixing it over coping here.) + */ + NextShmemSegID = statbuf.st_ino; + + for (;;) + { + IpcMemoryId shmid; + PGShmemHeader *oldhdr; + IpcMemoryState state; + + /* Try to create new segment */ + memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize); + if (memAddress) + break; /* successful create and attach */ + + /* Check shared memory and possibly remove and recreate */ + + /* + * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN. + * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can + * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN. + */ + shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0); + if (shmid < 0) + { + oldhdr = NULL; + state = SHMSTATE_FOREIGN; + } + else + state = PGSharedMemoryAttach(shmid, NULL, &oldhdr); + + switch (state) + { + case SHMSTATE_ANALYSIS_FAILURE: + case SHMSTATE_ATTACHED: + ereport(FATAL, + (errcode(ERRCODE_LOCK_FILE_EXISTS), + errmsg("pre-existing shared memory block (key %lu, ID %lu) is still in use", + (unsigned long) NextShmemSegID, + (unsigned long) shmid), + errhint("Terminate any old server processes associated with data directory \"%s\".", + DataDir))); + break; + case SHMSTATE_ENOENT: + + /* + * To our surprise, some other process deleted since our last + * InternalIpcMemoryCreate(). Moments earlier, we would have + * seen SHMSTATE_FOREIGN. Try that same ID again. 
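+				 *
+				 * [Editor's aside, not part of the original file] In effect
+				 * the outer loop probes candidate keys starting at the data
+				 * directory's inode:
+				 *
+				 *		for (key = statbuf.st_ino;; key++)
+				 *			if (InternalIpcMemoryCreate(key, sysvsize))
+				 *				break;		// else classify failure, retry
+				 *
+				 * with this switch deciding whether to recycle the segment,
+				 * skip to the next key, or abort.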
+ */ + elog(LOG, + "shared memory block (key %lu, ID %lu) deleted during startup", + (unsigned long) NextShmemSegID, + (unsigned long) shmid); + break; + case SHMSTATE_FOREIGN: + NextShmemSegID++; + break; + case SHMSTATE_UNATTACHED: + + /* + * The segment pertains to DataDir, and every process that had + * used it has died or detached. Zap it, if possible, and any + * associated dynamic shared memory segments, as well. This + * shouldn't fail, but if it does, assume the segment belongs + * to someone else after all, and try the next candidate. + * Otherwise, try again to create the segment. That may fail + * if some other process creates the same shmem key before we + * do, in which case we'll try the next key. + */ + if (oldhdr->dsm_control != 0) + dsm_cleanup_using_control_segment(oldhdr->dsm_control); + if (shmctl(shmid, IPC_RMID, NULL) < 0) + NextShmemSegID++; + break; + } + + if (oldhdr && shmdt((void *) oldhdr) < 0) + elog(LOG, "shmdt(%p) failed: %m", oldhdr); + } + + /* Initialize new segment. */ + hdr = (PGShmemHeader *) memAddress; + hdr->creatorPID = getpid(); + hdr->magic = PGShmemMagic; + hdr->dsm_control = 0; + + /* Fill in the data directory ID info, too */ + hdr->device = statbuf.st_dev; + hdr->inode = statbuf.st_ino; + + /* + * Initialize space allocation status for segment. + */ + hdr->totalsize = size; + hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); + *shim = hdr; + + /* Save info for possible future use */ + UsedShmemSegAddr = memAddress; + UsedShmemSegID = (unsigned long) NextShmemSegID; + + /* + * If AnonymousShmem is NULL here, then we're not using anonymous shared + * memory, and should return a pointer to the System V shared memory + * block. Otherwise, the System V shared memory block is only a shim, and + * we must return a pointer to the real block. + */ + if (AnonymousShmem == NULL) + return hdr; + memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader)); + return (PGShmemHeader *) AnonymousShmem; +} + +#ifdef EXEC_BACKEND + +/* + * PGSharedMemoryReAttach + * + * This is called during startup of a postmaster child process to re-attach to + * an already existing shared memory segment. This is needed only in the + * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory + * segment attachment via fork(). + * + * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this + * routine. The caller must have already restored them to the postmaster's + * values. + */ +void +PGSharedMemoryReAttach(void) +{ + IpcMemoryId shmid; + PGShmemHeader *hdr; + IpcMemoryState state; + void *origUsedShmemSegAddr = UsedShmemSegAddr; + + Assert(UsedShmemSegAddr != NULL); + Assert(IsUnderPostmaster); + +#ifdef __CYGWIN__ + /* cygipc (currently) appears to not detach on exec. 
 */
+	PGSharedMemoryDetach();
+	UsedShmemSegAddr = origUsedShmemSegAddr;
+#endif
+
+	elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
+	shmid = shmget(UsedShmemSegID, sizeof(PGShmemHeader), 0);
+	if (shmid < 0)
+		state = SHMSTATE_FOREIGN;
+	else
+		state = PGSharedMemoryAttach(shmid, UsedShmemSegAddr, &hdr);
+	if (state != SHMSTATE_ATTACHED)
+		elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m",
+			 (int) UsedShmemSegID, UsedShmemSegAddr);
+	if (hdr != origUsedShmemSegAddr)
+		elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
+			 hdr, origUsedShmemSegAddr);
+	dsm_set_control_handle(hdr->dsm_control);
+
+	UsedShmemSegAddr = hdr;		/* probably redundant */
+}
+
+/*
+ * PGSharedMemoryNoReAttach
+ *
+ * This is called during startup of a postmaster child process when we choose
+ * *not* to re-attach to the existing shared memory segment.  We must clean up
+ * to leave things in the appropriate state.  Like PGSharedMemoryReAttach,
+ * this is used only in the EXEC_BACKEND case.
+ *
+ * The child process startup logic might or might not call PGSharedMemoryDetach
+ * after this; make sure that it will be a no-op if called.
+ *
+ * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
+ * routine.  The caller must have already restored them to the postmaster's
+ * values.
+ */
+void
+PGSharedMemoryNoReAttach(void)
+{
+	Assert(UsedShmemSegAddr != NULL);
+	Assert(IsUnderPostmaster);
+
+#ifdef __CYGWIN__
+	/* cygipc (currently) appears to not detach on exec. */
+	PGSharedMemoryDetach();
+#endif
+
+	/* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */
+	UsedShmemSegAddr = NULL;
+	/* And the same for UsedShmemSegID. */
+	UsedShmemSegID = 0;
+}
+
+#endif							/* EXEC_BACKEND */
+
+/*
+ * PGSharedMemoryDetach
+ *
+ * Detach from the shared memory segment, if still attached.  This is not
+ * intended to be called explicitly by the process that originally created the
+ * segment (it will have on_shmem_exit callback(s) registered to do that).
+ * Rather, this is for subprocesses that have inherited an attachment and want
+ * to get rid of it.
+ *
+ * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
+ * routine, also AnonymousShmem and AnonymousShmemSize.
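+ *
+ * [Editor's aside, not part of the original file] Note the function is
+ * idempotent: both branches below reset their bookkeeping pointer to NULL,
+ * so calling it twice is harmless:
+ *
+ *		PGSharedMemoryDetach();
+ *		PGSharedMemoryDetach();		// second call finds nothing to do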
+ */ +void +PGSharedMemoryDetach(void) +{ + if (UsedShmemSegAddr != NULL) + { + if ((shmdt(UsedShmemSegAddr) < 0) +#if defined(EXEC_BACKEND) && defined(__CYGWIN__) + /* Work-around for cygipc exec bug */ + && shmdt(NULL) < 0 +#endif + ) + elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr); + UsedShmemSegAddr = NULL; + } + + if (AnonymousShmem != NULL) + { + if (munmap(AnonymousShmem, AnonymousShmemSize) < 0) + elog(LOG, "munmap(%p, %zu) failed: %m", + AnonymousShmem, AnonymousShmemSize); + AnonymousShmem = NULL; + } +} diff --git a/src/backend/port/tas/dummy.s b/src/backend/port/tas/dummy.s new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/src/backend/port/tas/dummy.s diff --git a/src/backend/port/tas/hpux_hppa.s b/src/backend/port/tas/hpux_hppa.s new file mode 100644 index 0000000..d978a7c --- /dev/null +++ b/src/backend/port/tas/hpux_hppa.s @@ -0,0 +1,28 @@ + + .SPACE $TEXT$,SORT=8 + .SUBSPA $CODE$,QUAD=0,ALIGN=4,ACCESS=44,CODE_ONLY,SORT=24 +tas + .PROC + .CALLINFO CALLER,FRAME=0,ENTRY_SR=3 + .ENTRY + LDO 15(%r26),%r31 ;offset 0x0 + DEPI 0,31,4,%r31 ;offset 0x4 + LDCWX 0(0,%r31),%r23 ;offset 0x8 + COMICLR,= 0,%r23,%r0 ;offset 0xc + DEP,TR %r0,31,32,%r28 ;offset 0x10 +$00000001 + LDI 1,%r28 ;offset 0x14 +$L0 + .EXIT + BV,N %r0(%r2) ;offset 0x18 + .PROCEND ;in=26;out=28; + + + .SPACE $TEXT$ + .SUBSPA $CODE$ + .SPACE $PRIVATE$,SORT=16 + .SUBSPA $DATA$,QUAD=1,ALIGN=8,ACCESS=31,SORT=16 + .SPACE $TEXT$ + .SUBSPA $CODE$ + .EXPORT tas,ENTRY,PRIV_LEV=3,ARGW0=GR,RTNVAL=GR + .END diff --git a/src/backend/port/tas/sunstudio_sparc.s b/src/backend/port/tas/sunstudio_sparc.s new file mode 100644 index 0000000..da9ed35 --- /dev/null +++ b/src/backend/port/tas/sunstudio_sparc.s @@ -0,0 +1,53 @@ +!------------------------------------------------------------------------- +! +! sunstudio_sparc.s +! compare and swap for Sun Studio on Sparc +! +! Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group +! Portions Copyright (c) 1994, Regents of the University of California +! +! IDENTIFICATION +! src/backend/port/tas/sunstudio_sparc.s +! +!------------------------------------------------------------------------- + +! Fortunately the Sun compiler can process cpp conditionals with -P + +! '/' is the comment for x86, while '!' is the comment for Sparc + +#if defined(__sparcv9) || defined(__sparc) + + .section ".text" + .align 8 + .skip 24 + .align 4 + + .global pg_atomic_cas +pg_atomic_cas: + + ! "cas" only works on sparcv9 and sparcv8plus chips, and + ! requires a compiler targeting these CPUs. It will fail + ! on a compiler targeting sparcv8, and of course will not + ! be understood by a sparcv8 CPU. gcc continues to use + ! "ldstub" because it targets sparcv7. + ! + ! There is actually a trick for embedding "cas" in a + ! sparcv8-targeted compiler, but it can only be run + ! on a sparcv8plus/v9 cpus: + ! + ! http://cvs.opensolaris.org/source/xref/on/usr/src/lib/libc/sparc/threads/sparc.il + ! + ! NB: We're assuming we're running on a TSO system here - solaris + ! userland luckily always has done so. 
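+	!
+	! [Editor's note, not part of the original file] At the C level this
+	! entry point is declared roughly as (see the Sun Studio spinlock
+	! support in s_lock.h for the authoritative prototype):
+	!
+	!     slock_t pg_atomic_cas(volatile slock_t *lock, slock_t with, slock_t cmp);
+	!
+	! returning the value previously stored at *lock.  Note the ldstub
+	! fallback is only a test-and-set, not a full compare-and-swap.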
+
+#if defined(__sparcv9) || defined(__sparcv8plus)
+	cas	[%o0],%o2,%o1
+#else
+	ldstub	[%o0],%o1
+#endif
+	mov	%o1,%o0
+	retl
+	nop
+	.type	pg_atomic_cas,2
+	.size	pg_atomic_cas,(.-pg_atomic_cas)
+#endif
diff --git a/src/backend/port/tas/sunstudio_x86.s b/src/backend/port/tas/sunstudio_x86.s
new file mode 100644
index 0000000..808b207
--- /dev/null
+++ b/src/backend/port/tas/sunstudio_x86.s
@@ -0,0 +1,43 @@
+/-------------------------------------------------------------------------
+/
+/ sunstudio_x86.s
+/		compare and swap for Sun Studio on x86
+/
+/ Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+/ Portions Copyright (c) 1994, Regents of the University of California
+/
+/ IDENTIFICATION
+/		src/backend/port/tas/sunstudio_x86.s
+/
+/-------------------------------------------------------------------------
+
+/ Fortunately the Sun compiler can process cpp conditionals with -P
+
+/ '/' is the comment for x86, while '!' is the comment for Sparc
+
+	.file	"tas.s"
+
+#if defined(__amd64)
+	.code64
+#endif
+
+	.globl	pg_atomic_cas
+	.type	pg_atomic_cas, @function
+
+	.section	.text, "ax"
+	.align	16
+
+pg_atomic_cas:
+#if defined(__amd64)
+	movl	%edx,%eax
+	lock
+	cmpxchgl	%esi,(%rdi)
+#else
+	movl	4(%esp), %edx
+	movl	8(%esp), %ecx
+	movl	12(%esp), %eax
+	lock
+	cmpxchgl	%ecx, (%edx)
+#endif
+	ret
+	.size	pg_atomic_cas, . - pg_atomic_cas
diff --git a/src/backend/port/win32/Makefile b/src/backend/port/win32/Makefile
new file mode 100644
index 0000000..90126f6
--- /dev/null
+++ b/src/backend/port/win32/Makefile
@@ -0,0 +1,23 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for backend/port/win32
+#
+# IDENTIFICATION
+#    src/backend/port/win32/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/port/win32
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+	signal.o \
+	socket.o \
+	timer.o
+ifeq ($(have_win32_dbghelp), yes)
+OBJS += crashdump.o
+endif
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/port/win32/crashdump.c b/src/backend/port/win32/crashdump.c
new file mode 100644
index 0000000..cea7be2
--- /dev/null
+++ b/src/backend/port/win32/crashdump.c
@@ -0,0 +1,181 @@
+/*-------------------------------------------------------------------------
+ *
+ * crashdump.c
+ *		 Automatic crash dump creation for PostgreSQL on Windows
+ *
+ * The crashdump feature traps unhandled win32 exceptions produced by the
+ * backend, and tries to produce a Windows MiniDump crash
+ * dump for later debugging and analysis.  The machine performing the dump
+ * doesn't need any special debugging tools; the user only needs to send
+ * the dump to somebody who has the same version of PostgreSQL and has debugging
+ * tools.
+ *
+ * The crashdump module was originally contributed by Craig Ringer <ringerc@ringerc.id.au>
+ *
+ * LIMITATIONS
+ * ===========
+ * This *won't* work in hard OOM situations or stack overflows.
+ *
+ * For those, it'd be necessary to take a much more complicated approach where
+ * the handler switches to a new stack (if it can) and forks a helper process
+ * to debug itself.
+ *
+ * POSSIBLE FUTURE WORK
+ * ====================
+ * For bonus points, the crash dump format permits embedding of user-supplied
+ * data.  If there's anything else that should always be supplied with a crash
+ * dump (postgresql.conf?
Last few lines of a log file?), it could potentially + * be added, though at the cost of a greater chance of the crash dump failing. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/port/win32/crashdump.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +/* + * Some versions of the MS SDK contain "typedef enum { ... } ;" which the MS + * compiler quite sanely complains about. Well done, Microsoft. + * This pragma disables the warning just while we include the header. + * The pragma is known to work with all (as at the time of writing) supported + * versions of MSVC. + */ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4091) +#endif +#include <dbghelp.h> +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +/* + * Much of the following code is based on CodeProject and MSDN examples, + * particularly + * http://www.codeproject.com/KB/debug/postmortemdebug_standalone1.aspx + * + * Useful MSDN articles: + * + * http://msdn.microsoft.com/en-us/library/ff805116(v=VS.85).aspx + * http://msdn.microsoft.com/en-us/library/ms679294(VS.85).aspx + * + * Other useful articles on working with minidumps: + * http://www.debuginfo.com/articles/effminidumps.html + */ + +typedef BOOL (WINAPI * MINIDUMPWRITEDUMP) (HANDLE hProcess, DWORD dwPid, HANDLE hFile, MINIDUMP_TYPE DumpType, + CONST PMINIDUMP_EXCEPTION_INFORMATION ExceptionParam, + CONST PMINIDUMP_USER_STREAM_INFORMATION UserStreamParam, + CONST PMINIDUMP_CALLBACK_INFORMATION CallbackParam +); + + +/* + * This function is the exception handler passed to SetUnhandledExceptionFilter. + * It's invoked only if there's an unhandled exception. The handler will use + * dbghelp.dll to generate a crash dump, then resume the normal unhandled + * exception process, which will generally exit with an error message from + * the runtime. + * + * This function is run under the unhandled exception handler, effectively + * in a crash context, so it should be careful with memory and avoid using + * any PostgreSQL functions. + */ +static LONG WINAPI +crashDumpHandler(struct _EXCEPTION_POINTERS *pExceptionInfo) +{ + /* + * We only write crash dumps if the "crashdumps" directory within the + * postgres data directory exists. + */ + DWORD attribs = GetFileAttributesA("crashdumps"); + + if (attribs != INVALID_FILE_ATTRIBUTES && (attribs & FILE_ATTRIBUTE_DIRECTORY)) + { + /* 'crashdumps' exists and is a directory. Try to write a dump' */ + HMODULE hDll = NULL; + MINIDUMPWRITEDUMP pDump = NULL; + MINIDUMP_TYPE dumpType; + char dumpPath[_MAX_PATH]; + HANDLE selfProcHandle = GetCurrentProcess(); + DWORD selfPid = GetProcessId(selfProcHandle); + HANDLE dumpFile; + DWORD systemTicks; + struct _MINIDUMP_EXCEPTION_INFORMATION ExInfo; + + ExInfo.ThreadId = GetCurrentThreadId(); + ExInfo.ExceptionPointers = pExceptionInfo; + ExInfo.ClientPointers = FALSE; + + /* Load the dbghelp.dll library and functions */ + hDll = LoadLibrary("dbghelp.dll"); + if (hDll == NULL) + { + write_stderr("could not load dbghelp.dll, cannot write crash dump\n"); + return EXCEPTION_CONTINUE_SEARCH; + } + + pDump = (MINIDUMPWRITEDUMP) (pg_funcptr_t) GetProcAddress(hDll, "MiniDumpWriteDump"); + + if (pDump == NULL) + { + write_stderr("could not load required functions in dbghelp.dll, cannot write crash dump\n"); + return EXCEPTION_CONTINUE_SEARCH; + } + + /* + * Dump as much as we can, except shared memory, code segments, and + * memory mapped files. 
Exactly what we can dump depends on the + * version of dbghelp.dll, see: + * http://msdn.microsoft.com/en-us/library/ms680519(v=VS.85).aspx + */ + dumpType = MiniDumpNormal | MiniDumpWithHandleData | + MiniDumpWithDataSegs; + + if (GetProcAddress(hDll, "EnumDirTree") != NULL) + { + /* If this function exists, we have version 5.2 or newer */ + dumpType |= MiniDumpWithIndirectlyReferencedMemory | + MiniDumpWithPrivateReadWriteMemory; + } + + systemTicks = GetTickCount(); + snprintf(dumpPath, _MAX_PATH, + "crashdumps\\postgres-pid%0i-%0i.mdmp", + (int) selfPid, (int) systemTicks); + dumpPath[_MAX_PATH - 1] = '\0'; + + dumpFile = CreateFile(dumpPath, GENERIC_WRITE, FILE_SHARE_WRITE, + NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, + NULL); + if (dumpFile == INVALID_HANDLE_VALUE) + { + write_stderr("could not open crash dump file \"%s\" for writing: error code %lu\n", + dumpPath, GetLastError()); + return EXCEPTION_CONTINUE_SEARCH; + } + + if ((*pDump) (selfProcHandle, selfPid, dumpFile, dumpType, &ExInfo, + NULL, NULL)) + write_stderr("wrote crash dump to file \"%s\"\n", dumpPath); + else + write_stderr("could not write crash dump to file \"%s\": error code %lu\n", + dumpPath, GetLastError()); + + CloseHandle(dumpFile); + } + + return EXCEPTION_CONTINUE_SEARCH; +} + + +void +pgwin32_install_crashdump_handler(void) +{ + SetUnhandledExceptionFilter(crashDumpHandler); +} diff --git a/src/backend/port/win32/signal.c b/src/backend/port/win32/signal.c new file mode 100644 index 0000000..b71164d --- /dev/null +++ b/src/backend/port/win32/signal.c @@ -0,0 +1,354 @@ +/*------------------------------------------------------------------------- + * + * signal.c + * Microsoft Windows Win32 Signal Emulation Functions + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/port/win32/signal.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "libpq/pqsignal.h" + +/* + * These are exported for use by the UNBLOCKED_SIGNAL_QUEUE() macro. + * pg_signal_queue must be volatile since it is changed by the signal + * handling thread and inspected without any lock by the main thread. + * pg_signal_mask is only changed by main thread so shouldn't need it. + */ +volatile int pg_signal_queue; +int pg_signal_mask; + +HANDLE pgwin32_signal_event; +HANDLE pgwin32_initial_signal_pipe = INVALID_HANDLE_VALUE; + +/* + * pg_signal_crit_sec is used to protect only pg_signal_queue. That is the only + * variable that can be accessed from the signal sending threads! + */ +static CRITICAL_SECTION pg_signal_crit_sec; + +/* Note that array elements 0 are unused since they correspond to signal 0 */ +static pqsigfunc pg_signal_array[PG_SIGNAL_COUNT]; +static pqsigfunc pg_signal_defaults[PG_SIGNAL_COUNT]; + + +/* Signal handling thread functions */ +static DWORD WINAPI pg_signal_thread(LPVOID param); +static BOOL WINAPI pg_console_handler(DWORD dwCtrlType); + + +/* + * pg_usleep --- delay the specified number of microseconds, but + * stop waiting if a signal arrives. + * + * This replaces the non-signal-aware version provided by src/port/pgsleep.c. + */ +void +pg_usleep(long microsec) +{ + if (unlikely(pgwin32_signal_event == NULL)) + { + /* + * If we're reached by pgwin32_open_handle() early in startup before + * the signal event is set up, just fall back to a regular + * non-interruptible sleep. + */ + SleepEx((microsec < 500 ? 
1 : (microsec + 500) / 1000), FALSE); + return; + } + + if (WaitForSingleObject(pgwin32_signal_event, + (microsec < 500 ? 1 : (microsec + 500) / 1000)) + == WAIT_OBJECT_0) + { + pgwin32_dispatch_queued_signals(); + errno = EINTR; + return; + } +} + + +/* Initialization */ +void +pgwin32_signal_initialize(void) +{ + int i; + HANDLE signal_thread_handle; + + InitializeCriticalSection(&pg_signal_crit_sec); + + for (i = 0; i < PG_SIGNAL_COUNT; i++) + { + pg_signal_array[i] = SIG_DFL; + pg_signal_defaults[i] = SIG_IGN; + } + pg_signal_mask = 0; + pg_signal_queue = 0; + + /* Create the global event handle used to flag signals */ + pgwin32_signal_event = CreateEvent(NULL, TRUE, FALSE, NULL); + if (pgwin32_signal_event == NULL) + ereport(FATAL, + (errmsg_internal("could not create signal event: error code %lu", GetLastError()))); + + /* Create thread for handling signals */ + signal_thread_handle = CreateThread(NULL, 0, pg_signal_thread, NULL, 0, NULL); + if (signal_thread_handle == NULL) + ereport(FATAL, + (errmsg_internal("could not create signal handler thread"))); + + /* Create console control handle to pick up Ctrl-C etc */ + if (!SetConsoleCtrlHandler(pg_console_handler, TRUE)) + ereport(FATAL, + (errmsg_internal("could not set console control handler"))); +} + +/* + * Dispatch all signals currently queued and not blocked + * Blocked signals are ignored, and will be fired at the time of + * the pqsigsetmask() call. + */ +void +pgwin32_dispatch_queued_signals(void) +{ + int exec_mask; + + Assert(pgwin32_signal_event != NULL); + EnterCriticalSection(&pg_signal_crit_sec); + while ((exec_mask = UNBLOCKED_SIGNAL_QUEUE()) != 0) + { + /* One or more unblocked signals queued for execution */ + int i; + + for (i = 1; i < PG_SIGNAL_COUNT; i++) + { + if (exec_mask & sigmask(i)) + { + /* Execute this signal */ + pqsigfunc sig = pg_signal_array[i]; + + if (sig == SIG_DFL) + sig = pg_signal_defaults[i]; + pg_signal_queue &= ~sigmask(i); + if (sig != SIG_ERR && sig != SIG_IGN && sig != SIG_DFL) + { + LeaveCriticalSection(&pg_signal_crit_sec); + sig(i); + EnterCriticalSection(&pg_signal_crit_sec); + break; /* Restart outer loop, in case signal mask or + * queue has been modified inside signal + * handler */ + } + } + } + } + ResetEvent(pgwin32_signal_event); + LeaveCriticalSection(&pg_signal_crit_sec); +} + +/* signal masking. 
Only called on main thread, no sync required */ +int +pqsigsetmask(int mask) +{ + int prevmask; + + prevmask = pg_signal_mask; + pg_signal_mask = mask; + + /* + * Dispatch any signals queued up right away, in case we have unblocked + * one or more signals previously queued + */ + pgwin32_dispatch_queued_signals(); + + return prevmask; +} + + +/* + * Unix-like signal handler installation + * + * Only called on main thread, no sync required + */ +pqsigfunc +pqsignal(int signum, pqsigfunc handler) +{ + pqsigfunc prevfunc; + + if (signum >= PG_SIGNAL_COUNT || signum < 0) + return SIG_ERR; + prevfunc = pg_signal_array[signum]; + pg_signal_array[signum] = handler; + return prevfunc; +} + +/* Create the signal listener pipe for specified PID */ +HANDLE +pgwin32_create_signal_listener(pid_t pid) +{ + char pipename[128]; + HANDLE pipe; + + snprintf(pipename, sizeof(pipename), "\\\\.\\pipe\\pgsignal_%u", (int) pid); + + pipe = CreateNamedPipe(pipename, PIPE_ACCESS_DUPLEX, + PIPE_TYPE_MESSAGE | PIPE_READMODE_MESSAGE | PIPE_WAIT, + PIPE_UNLIMITED_INSTANCES, 16, 16, 1000, NULL); + + if (pipe == INVALID_HANDLE_VALUE) + ereport(ERROR, + (errmsg("could not create signal listener pipe for PID %d: error code %lu", + (int) pid, GetLastError()))); + + return pipe; +} + + +/* + * All functions below execute on the signal handler thread + * and must be synchronized as such! + * NOTE! The only global variable that can be used is + * pg_signal_queue! + */ + + +/* + * Queue a signal for the main thread, by setting the flag bit and event. + */ +void +pg_queue_signal(int signum) +{ + Assert(pgwin32_signal_event != NULL); + if (signum >= PG_SIGNAL_COUNT || signum <= 0) + return; /* ignore any bad signal number */ + + EnterCriticalSection(&pg_signal_crit_sec); + pg_signal_queue |= sigmask(signum); + LeaveCriticalSection(&pg_signal_crit_sec); + + SetEvent(pgwin32_signal_event); +} + +/* Signal handling thread */ +static DWORD WINAPI +pg_signal_thread(LPVOID param) +{ + char pipename[128]; + HANDLE pipe = pgwin32_initial_signal_pipe; + + /* Set up pipe name, in case we have to re-create the pipe. */ + snprintf(pipename, sizeof(pipename), "\\\\.\\pipe\\pgsignal_%lu", GetCurrentProcessId()); + + for (;;) + { + BOOL fConnected; + + /* Create a new pipe instance if we don't have one. */ + if (pipe == INVALID_HANDLE_VALUE) + { + pipe = CreateNamedPipe(pipename, PIPE_ACCESS_DUPLEX, + PIPE_TYPE_MESSAGE | PIPE_READMODE_MESSAGE | PIPE_WAIT, + PIPE_UNLIMITED_INSTANCES, 16, 16, 1000, NULL); + + if (pipe == INVALID_HANDLE_VALUE) + { + write_stderr("could not create signal listener pipe: error code %lu; retrying\n", GetLastError()); + SleepEx(500, FALSE); + continue; + } + } + + /* + * Wait for a client to connect. If something connects before we + * reach here, we'll get back a "failure" with ERROR_PIPE_CONNECTED, + * which is actually a success (way to go, Microsoft). + */ + fConnected = ConnectNamedPipe(pipe, NULL) ? TRUE : (GetLastError() == ERROR_PIPE_CONNECTED); + if (fConnected) + { + /* + * We have a connection from a would-be signal sender. Process it. + */ + BYTE sigNum; + DWORD bytes; + + if (ReadFile(pipe, &sigNum, 1, &bytes, NULL) && + bytes == 1) + { + /* + * Queue the signal before responding to the client. In this + * way, it's guaranteed that once kill() has returned in the + * signal sender, the next CHECK_FOR_INTERRUPTS() in the + * signal recipient will see the signal. (This is a stronger + * guarantee than POSIX makes; maybe we don't need it? 
But + * without it, we've seen timing bugs on Windows that do not + * manifest on any known Unix.) + */ + pg_queue_signal(sigNum); + + /* + * Write something back to the client, allowing its + * CallNamedPipe() call to terminate. + */ + WriteFile(pipe, &sigNum, 1, &bytes, NULL); /* Don't care if it + * works or not */ + + /* + * We must wait for the client to read the data before we can + * disconnect, else the data will be lost. (If the WriteFile + * call failed, there'll be nothing in the buffer, so this + * shouldn't block.) + */ + FlushFileBuffers(pipe); + } + else + { + /* + * If we fail to read a byte from the client, assume it's the + * client's problem and do nothing. Perhaps it'd be better to + * force a pipe close and reopen? + */ + } + + /* Disconnect from client so that we can re-use the pipe. */ + DisconnectNamedPipe(pipe); + } + else + { + /* + * Connection failed. Cleanup and try again. + * + * This should never happen. If it does, there's a window where + * we'll miss signals until we manage to re-create the pipe. + * However, just trying to use the same pipe again is probably not + * going to work, so we have little choice. + */ + CloseHandle(pipe); + pipe = INVALID_HANDLE_VALUE; + } + } + return 0; +} + + +/* Console control handler will execute on a thread created + by the OS at the time of invocation */ +static BOOL WINAPI +pg_console_handler(DWORD dwCtrlType) +{ + if (dwCtrlType == CTRL_C_EVENT || + dwCtrlType == CTRL_BREAK_EVENT || + dwCtrlType == CTRL_CLOSE_EVENT || + dwCtrlType == CTRL_SHUTDOWN_EVENT) + { + pg_queue_signal(SIGINT); + return TRUE; + } + return FALSE; +} diff --git a/src/backend/port/win32/socket.c b/src/backend/port/win32/socket.c new file mode 100644 index 0000000..52944a0 --- /dev/null +++ b/src/backend/port/win32/socket.c @@ -0,0 +1,705 @@ +/*------------------------------------------------------------------------- + * + * socket.c + * Microsoft Windows Win32 Socket Functions + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/port/win32/socket.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +/* + * Indicate if pgwin32_recv() and pgwin32_send() should operate + * in non-blocking mode. + * + * Since the socket emulation layer always sets the actual socket to + * non-blocking mode in order to be able to deliver signals, we must + * specify this in a separate flag if we actually need non-blocking + * operation. + * + * This flag changes the behaviour *globally* for all socket operations, + * so it should only be set for very short periods of time. + */ +int pgwin32_noblock = 0; + +/* Undef the macros defined in win32.h, so we can access system functions */ +#undef socket +#undef bind +#undef listen +#undef accept +#undef connect +#undef select +#undef recv +#undef send + +/* + * Blocking socket functions implemented so they listen on both + * the socket and the signal event, required for signal handling. + */ + +/* + * Convert the last socket error code into errno + * + * Note: where there is a direct correspondence between a WSAxxx error code + * and a Berkeley error symbol, this mapping is actually a no-op, because + * in win32_port.h we redefine the network-related Berkeley error symbols to + * have the values of their WSAxxx counterparts. The point of the switch is + * mostly to translate near-miss error codes into something that's sensible + * in the Berkeley universe. 
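+ *
+ * [Editor's aside, not part of the original file] The typical call pattern
+ * in this file is:
+ *
+ *		if (WSASend(...) == SOCKET_ERROR)
+ *		{
+ *			TranslateSocketError();		// sets errno from WSAGetLastError()
+ *			return -1;
+ *		}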
+ */ +static void +TranslateSocketError(void) +{ + switch (WSAGetLastError()) + { + case WSAEINVAL: + case WSANOTINITIALISED: + case WSAEINVALIDPROVIDER: + case WSAEINVALIDPROCTABLE: + case WSAEDESTADDRREQ: + errno = EINVAL; + break; + case WSAEINPROGRESS: + errno = EINPROGRESS; + break; + case WSAEFAULT: + errno = EFAULT; + break; + case WSAEISCONN: + errno = EISCONN; + break; + case WSAEMSGSIZE: + errno = EMSGSIZE; + break; + case WSAEAFNOSUPPORT: + errno = EAFNOSUPPORT; + break; + case WSAEMFILE: + errno = EMFILE; + break; + case WSAENOBUFS: + errno = ENOBUFS; + break; + case WSAEPROTONOSUPPORT: + case WSAEPROTOTYPE: + case WSAESOCKTNOSUPPORT: + errno = EPROTONOSUPPORT; + break; + case WSAECONNABORTED: + errno = ECONNABORTED; + break; + case WSAECONNREFUSED: + errno = ECONNREFUSED; + break; + case WSAECONNRESET: + errno = ECONNRESET; + break; + case WSAEINTR: + errno = EINTR; + break; + case WSAENOTSOCK: + errno = ENOTSOCK; + break; + case WSAEOPNOTSUPP: + errno = EOPNOTSUPP; + break; + case WSAEWOULDBLOCK: + errno = EWOULDBLOCK; + break; + case WSAEACCES: + errno = EACCES; + break; + case WSAEADDRINUSE: + errno = EADDRINUSE; + break; + case WSAEADDRNOTAVAIL: + errno = EADDRNOTAVAIL; + break; + case WSAEHOSTDOWN: + errno = EHOSTDOWN; + break; + case WSAEHOSTUNREACH: + case WSAHOST_NOT_FOUND: + errno = EHOSTUNREACH; + break; + case WSAENETDOWN: + errno = ENETDOWN; + break; + case WSAENETUNREACH: + errno = ENETUNREACH; + break; + case WSAENETRESET: + errno = ENETRESET; + break; + case WSAENOTCONN: + case WSAESHUTDOWN: + case WSAEDISCON: + errno = ENOTCONN; + break; + case WSAETIMEDOUT: + errno = ETIMEDOUT; + break; + default: + ereport(NOTICE, + (errmsg_internal("unrecognized win32 socket error code: %d", + WSAGetLastError()))); + errno = EINVAL; + break; + } +} + +static int +pgwin32_poll_signals(void) +{ + if (UNBLOCKED_SIGNAL_QUEUE()) + { + pgwin32_dispatch_queued_signals(); + errno = EINTR; + return 1; + } + return 0; +} + +static int +isDataGram(SOCKET s) +{ + int type; + int typelen = sizeof(type); + + if (getsockopt(s, SOL_SOCKET, SO_TYPE, (char *) &type, &typelen)) + return 1; + + return (type == SOCK_DGRAM) ? 1 : 0; +} + +int +pgwin32_waitforsinglesocket(SOCKET s, int what, int timeout) +{ + static HANDLE waitevent = INVALID_HANDLE_VALUE; + static SOCKET current_socket = INVALID_SOCKET; + static int isUDP = 0; + HANDLE events[2]; + int r; + + /* Create an event object just once and use it on all future calls */ + if (waitevent == INVALID_HANDLE_VALUE) + { + waitevent = CreateEvent(NULL, TRUE, FALSE, NULL); + + if (waitevent == INVALID_HANDLE_VALUE) + ereport(ERROR, + (errmsg_internal("could not create socket waiting event: error code %lu", GetLastError()))); + } + else if (!ResetEvent(waitevent)) + ereport(ERROR, + (errmsg_internal("could not reset socket waiting event: error code %lu", GetLastError()))); + + /* + * Track whether socket is UDP or not. (NB: most likely, this is both + * useless and wrong; there is no reason to think that the behavior of + * WSAEventSelect is different for TCP and UDP.) + */ + if (current_socket != s) + isUDP = isDataGram(s); + current_socket = s; + + /* + * Attach event to socket. NOTE: we must detach it again before + * returning, since other bits of code may try to attach other events to + * the socket. 
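+	 *
+	 * [Editor's aside, not part of the original file] The pairing used
+	 * below is:
+	 *
+	 *		WSAEventSelect(s, waitevent, what);		// attach
+	 *		... WaitForMultipleObjectsEx(...) ...
+	 *		WSAEventSelect(s, NULL, 0);				// detach before return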
+ */ + if (WSAEventSelect(s, waitevent, what) != 0) + { + TranslateSocketError(); + return 0; + } + + events[0] = pgwin32_signal_event; + events[1] = waitevent; + + /* + * Just a workaround of unknown locking problem with writing in UDP socket + * under high load: Client's pgsql backend sleeps infinitely in + * WaitForMultipleObjectsEx, pgstat process sleeps in pgwin32_select(). + * So, we will wait with small timeout(0.1 sec) and if socket is still + * blocked, try WSASend (see comments in pgwin32_select) and wait again. + */ + if ((what & FD_WRITE) && isUDP) + { + for (;;) + { + r = WaitForMultipleObjectsEx(2, events, FALSE, 100, TRUE); + + if (r == WAIT_TIMEOUT) + { + char c; + WSABUF buf; + DWORD sent; + + buf.buf = &c; + buf.len = 0; + + r = WSASend(s, &buf, 1, &sent, 0, NULL, NULL); + if (r == 0) /* Completed - means things are fine! */ + { + WSAEventSelect(s, NULL, 0); + return 1; + } + else if (WSAGetLastError() != WSAEWOULDBLOCK) + { + TranslateSocketError(); + WSAEventSelect(s, NULL, 0); + return 0; + } + } + else + break; + } + } + else + r = WaitForMultipleObjectsEx(2, events, FALSE, timeout, TRUE); + + WSAEventSelect(s, NULL, 0); + + if (r == WAIT_OBJECT_0 || r == WAIT_IO_COMPLETION) + { + pgwin32_dispatch_queued_signals(); + errno = EINTR; + return 0; + } + if (r == WAIT_OBJECT_0 + 1) + return 1; + if (r == WAIT_TIMEOUT) + { + errno = EWOULDBLOCK; + return 0; + } + ereport(ERROR, + (errmsg_internal("unrecognized return value from WaitForMultipleObjects: %d (error code %lu)", r, GetLastError()))); + return 0; +} + +/* + * Create a socket, setting it to overlapped and non-blocking + */ +SOCKET +pgwin32_socket(int af, int type, int protocol) +{ + SOCKET s; + unsigned long on = 1; + + s = WSASocket(af, type, protocol, NULL, 0, WSA_FLAG_OVERLAPPED); + if (s == INVALID_SOCKET) + { + TranslateSocketError(); + return INVALID_SOCKET; + } + + if (ioctlsocket(s, FIONBIO, &on)) + { + TranslateSocketError(); + return INVALID_SOCKET; + } + errno = 0; + + return s; +} + +int +pgwin32_bind(SOCKET s, struct sockaddr *addr, int addrlen) +{ + int res; + + res = bind(s, addr, addrlen); + if (res < 0) + TranslateSocketError(); + return res; +} + +int +pgwin32_listen(SOCKET s, int backlog) +{ + int res; + + res = listen(s, backlog); + if (res < 0) + TranslateSocketError(); + return res; +} + +SOCKET +pgwin32_accept(SOCKET s, struct sockaddr *addr, int *addrlen) +{ + SOCKET rs; + + /* + * Poll for signals, but don't return with EINTR, since we don't handle + * that in pqcomm.c + */ + pgwin32_poll_signals(); + + rs = WSAAccept(s, addr, addrlen, NULL, 0); + if (rs == INVALID_SOCKET) + { + TranslateSocketError(); + return INVALID_SOCKET; + } + return rs; +} + + +/* No signal delivery during connect. 
*/ +int +pgwin32_connect(SOCKET s, const struct sockaddr *addr, int addrlen) +{ + int r; + + r = WSAConnect(s, addr, addrlen, NULL, NULL, NULL, NULL); + if (r == 0) + return 0; + + if (WSAGetLastError() != WSAEWOULDBLOCK) + { + TranslateSocketError(); + return -1; + } + + while (pgwin32_waitforsinglesocket(s, FD_CONNECT, INFINITE) == 0) + { + /* Loop endlessly as long as we are just delivering signals */ + } + + return 0; +} + +int +pgwin32_recv(SOCKET s, char *buf, int len, int f) +{ + WSABUF wbuf; + int r; + DWORD b; + DWORD flags = f; + int n; + + if (pgwin32_poll_signals()) + return -1; + + wbuf.len = len; + wbuf.buf = buf; + + r = WSARecv(s, &wbuf, 1, &b, &flags, NULL, NULL); + if (r != SOCKET_ERROR) + return b; /* success */ + + if (WSAGetLastError() != WSAEWOULDBLOCK) + { + TranslateSocketError(); + return -1; + } + + if (pgwin32_noblock) + { + /* + * No data received, and we are in "emulated non-blocking mode", so + * return indicating that we'd block if we were to continue. + */ + errno = EWOULDBLOCK; + return -1; + } + + /* We're in blocking mode, so wait for data */ + + for (n = 0; n < 5; n++) + { + if (pgwin32_waitforsinglesocket(s, FD_READ | FD_CLOSE | FD_ACCEPT, + INFINITE) == 0) + return -1; /* errno already set */ + + r = WSARecv(s, &wbuf, 1, &b, &flags, NULL, NULL); + if (r != SOCKET_ERROR) + return b; /* success */ + if (WSAGetLastError() != WSAEWOULDBLOCK) + { + TranslateSocketError(); + return -1; + } + + /* + * There seem to be cases on win2k (at least) where WSARecv can return + * WSAEWOULDBLOCK even when pgwin32_waitforsinglesocket claims the + * socket is readable. In this case, just sleep for a moment and try + * again. We try up to 5 times - if it fails more than that it's not + * likely to ever come back. + */ + pg_usleep(10000); + } + ereport(NOTICE, + (errmsg_internal("could not read from ready socket (after retries)"))); + errno = EWOULDBLOCK; + return -1; +} + +/* + * The second argument to send() is defined by SUS to be a "const void *" + * and so we use the same signature here to keep compilers happy when + * handling callers. + * + * But the buf member of a WSABUF struct is defined as "char *", so we cast + * the second argument to that here when assigning it, also to keep compilers + * happy. + */ + +int +pgwin32_send(SOCKET s, const void *buf, int len, int flags) +{ + WSABUF wbuf; + int r; + DWORD b; + + if (pgwin32_poll_signals()) + return -1; + + wbuf.len = len; + wbuf.buf = (char *) buf; + + /* + * Readiness of socket to send data to UDP socket may be not true: socket + * can become busy again! So loop until send or error occurs. + */ + for (;;) + { + r = WSASend(s, &wbuf, 1, &b, flags, NULL, NULL); + if (r != SOCKET_ERROR && b > 0) + /* Write succeeded right away */ + return b; + + if (r == SOCKET_ERROR && + WSAGetLastError() != WSAEWOULDBLOCK) + { + TranslateSocketError(); + return -1; + } + + if (pgwin32_noblock) + { + /* + * No data sent, and we are in "emulated non-blocking mode", so + * return indicating that we'd block if we were to continue. + */ + errno = EWOULDBLOCK; + return -1; + } + + /* No error, zero bytes (win2000+) or error+WSAEWOULDBLOCK (<=nt4) */ + + if (pgwin32_waitforsinglesocket(s, FD_WRITE | FD_CLOSE, INFINITE) == 0) + return -1; + } + + return -1; +} + + +/* + * Wait for activity on one or more sockets. + * While waiting, allow signals to run + * + * NOTE! Currently does not implement exceptfds check, + * since it is not used in postgresql! 
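+ *
+ * [Editor's aside, not part of the original file] Callers use this like
+ * plain select(2); note that nfds is effectively ignored below, since
+ * Windows fd_sets carry their own element counts:
+ *
+ *		FD_ZERO(&rfds);
+ *		FD_SET(sock, &rfds);
+ *		r = pgwin32_select(0, &rfds, NULL, NULL, &tv);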
+ */ +int +pgwin32_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timeval *timeout) +{ + WSAEVENT events[FD_SETSIZE * 2]; /* worst case is readfds totally + * different from writefds, so + * 2*FD_SETSIZE sockets */ + SOCKET sockets[FD_SETSIZE * 2]; + int numevents = 0; + int i; + int r; + DWORD timeoutval = WSA_INFINITE; + FD_SET outreadfds; + FD_SET outwritefds; + int nummatches = 0; + + Assert(exceptfds == NULL); + + if (pgwin32_poll_signals()) + return -1; + + FD_ZERO(&outreadfds); + FD_ZERO(&outwritefds); + + /* + * Windows does not guarantee to log an FD_WRITE network event indicating + * that more data can be sent unless the previous send() failed with + * WSAEWOULDBLOCK. While our caller might well have made such a call, we + * cannot assume that here. Therefore, if waiting for write-ready, force + * the issue by doing a dummy send(). If the dummy send() succeeds, + * assume that the socket is in fact write-ready, and return immediately. + * Also, if it fails with something other than WSAEWOULDBLOCK, return a + * write-ready indication to let our caller deal with the error condition. + */ + if (writefds != NULL) + { + for (i = 0; i < writefds->fd_count; i++) + { + char c; + WSABUF buf; + DWORD sent; + + buf.buf = &c; + buf.len = 0; + + r = WSASend(writefds->fd_array[i], &buf, 1, &sent, 0, NULL, NULL); + if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK) + FD_SET(writefds->fd_array[i], &outwritefds); + } + + /* If we found any write-ready sockets, just return them immediately */ + if (outwritefds.fd_count > 0) + { + memcpy(writefds, &outwritefds, sizeof(fd_set)); + if (readfds) + FD_ZERO(readfds); + return outwritefds.fd_count; + } + } + + + /* Now set up for an actual select */ + + if (timeout != NULL) + { + /* timeoutval is in milliseconds */ + timeoutval = timeout->tv_sec * 1000 + timeout->tv_usec / 1000; + } + + if (readfds != NULL) + { + for (i = 0; i < readfds->fd_count; i++) + { + events[numevents] = WSACreateEvent(); + sockets[numevents] = readfds->fd_array[i]; + numevents++; + } + } + if (writefds != NULL) + { + for (i = 0; i < writefds->fd_count; i++) + { + if (!readfds || + !FD_ISSET(writefds->fd_array[i], readfds)) + { + /* If the socket is not in the read list */ + events[numevents] = WSACreateEvent(); + sockets[numevents] = writefds->fd_array[i]; + numevents++; + } + } + } + + for (i = 0; i < numevents; i++) + { + int flags = 0; + + if (readfds && FD_ISSET(sockets[i], readfds)) + flags |= FD_READ | FD_ACCEPT | FD_CLOSE; + + if (writefds && FD_ISSET(sockets[i], writefds)) + flags |= FD_WRITE | FD_CLOSE; + + if (WSAEventSelect(sockets[i], events[i], flags) != 0) + { + TranslateSocketError(); + /* release already-assigned event objects */ + while (--i >= 0) + WSAEventSelect(sockets[i], NULL, 0); + for (i = 0; i < numevents; i++) + WSACloseEvent(events[i]); + return -1; + } + } + + events[numevents] = pgwin32_signal_event; + r = WaitForMultipleObjectsEx(numevents + 1, events, FALSE, timeoutval, TRUE); + if (r != WAIT_TIMEOUT && r != WAIT_IO_COMPLETION && r != (WAIT_OBJECT_0 + numevents)) + { + /* + * We scan all events, even those not signaled, in case more than one + * event has been tagged but Wait.. can only return one. + */ + WSANETWORKEVENTS resEvents; + + for (i = 0; i < numevents; i++) + { + ZeroMemory(&resEvents, sizeof(resEvents)); + if (WSAEnumNetworkEvents(sockets[i], events[i], &resEvents) != 0) + elog(ERROR, "failed to enumerate network events: error code %d", + WSAGetLastError()); + /* Read activity? 
*/ + if (readfds && FD_ISSET(sockets[i], readfds)) + { + if ((resEvents.lNetworkEvents & FD_READ) || + (resEvents.lNetworkEvents & FD_ACCEPT) || + (resEvents.lNetworkEvents & FD_CLOSE)) + { + FD_SET(sockets[i], &outreadfds); + + nummatches++; + } + } + /* Write activity? */ + if (writefds && FD_ISSET(sockets[i], writefds)) + { + if ((resEvents.lNetworkEvents & FD_WRITE) || + (resEvents.lNetworkEvents & FD_CLOSE)) + { + FD_SET(sockets[i], &outwritefds); + + nummatches++; + } + } + } + } + + /* Clean up all the event objects */ + for (i = 0; i < numevents; i++) + { + WSAEventSelect(sockets[i], NULL, 0); + WSACloseEvent(events[i]); + } + + if (r == WSA_WAIT_TIMEOUT) + { + if (readfds) + FD_ZERO(readfds); + if (writefds) + FD_ZERO(writefds); + return 0; + } + + /* Signal-like events. */ + if (r == WAIT_OBJECT_0 + numevents || r == WAIT_IO_COMPLETION) + { + pgwin32_dispatch_queued_signals(); + errno = EINTR; + if (readfds) + FD_ZERO(readfds); + if (writefds) + FD_ZERO(writefds); + return -1; + } + + /* Overwrite socket sets with our resulting values */ + if (readfds) + memcpy(readfds, &outreadfds, sizeof(fd_set)); + if (writefds) + memcpy(writefds, &outwritefds, sizeof(fd_set)); + return nummatches; +} diff --git a/src/backend/port/win32/timer.c b/src/backend/port/win32/timer.c new file mode 100644 index 0000000..3405253 --- /dev/null +++ b/src/backend/port/win32/timer.c @@ -0,0 +1,121 @@ +/*------------------------------------------------------------------------- + * + * timer.c + * Microsoft Windows Win32 Timer Implementation + * + * Limitations of this implementation: + * + * - Does not support interval timer (value->it_interval) + * - Only supports ITIMER_REAL + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/port/win32/timer.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + + +/* Communication area for inter-thread communication */ +typedef struct timerCA +{ + struct itimerval value; + HANDLE event; + CRITICAL_SECTION crit_sec; +} timerCA; + +static timerCA timerCommArea; +static HANDLE timerThreadHandle = INVALID_HANDLE_VALUE; + + +/* Timer management thread */ +static DWORD WINAPI +pg_timer_thread(LPVOID param) +{ + DWORD waittime; + + Assert(param == NULL); + + waittime = INFINITE; + + for (;;) + { + int r; + + r = WaitForSingleObjectEx(timerCommArea.event, waittime, FALSE); + if (r == WAIT_OBJECT_0) + { + /* Event signaled from main thread, change the timer */ + EnterCriticalSection(&timerCommArea.crit_sec); + if (timerCommArea.value.it_value.tv_sec == 0 && + timerCommArea.value.it_value.tv_usec == 0) + waittime = INFINITE; /* Cancel the interrupt */ + else + { + /* WaitForSingleObjectEx() uses milliseconds, round up */ + waittime = (timerCommArea.value.it_value.tv_usec + 999) / 1000 + + timerCommArea.value.it_value.tv_sec * 1000; + } + ResetEvent(timerCommArea.event); + LeaveCriticalSection(&timerCommArea.crit_sec); + } + else if (r == WAIT_TIMEOUT) + { + /* Timeout expired, signal SIGALRM and turn it off */ + pg_queue_signal(SIGALRM); + waittime = INFINITE; + } + else + { + /* Should never happen */ + Assert(false); + } + } + + return 0; +} + +/* + * Win32 setitimer emulation by creating a persistent thread + * to handle the timer setting and notification upon timeout. 
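+ *
+ * [Editor's aside, not part of the original file] Given the limitations in
+ * the file header (ITIMER_REAL only, no interval), a caller can only arm a
+ * one-shot alarm, e.g.:
+ *
+ *		struct itimerval t = {{0, 0}, {30, 0}};		// 30s, no interval
+ *		setitimer(ITIMER_REAL, &t, NULL);			// SIGALRM when it fires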
+ */ +int +setitimer(int which, const struct itimerval *value, struct itimerval *ovalue) +{ + Assert(value != NULL); + Assert(value->it_interval.tv_sec == 0 && value->it_interval.tv_usec == 0); + Assert(which == ITIMER_REAL); + + if (timerThreadHandle == INVALID_HANDLE_VALUE) + { + /* First call in this backend, create event and the timer thread */ + timerCommArea.event = CreateEvent(NULL, TRUE, FALSE, NULL); + if (timerCommArea.event == NULL) + ereport(FATAL, + (errmsg_internal("could not create timer event: error code %lu", + GetLastError()))); + + MemSet(&timerCommArea.value, 0, sizeof(struct itimerval)); + + InitializeCriticalSection(&timerCommArea.crit_sec); + + timerThreadHandle = CreateThread(NULL, 0, pg_timer_thread, NULL, 0, NULL); + if (timerThreadHandle == INVALID_HANDLE_VALUE) + ereport(FATAL, + (errmsg_internal("could not create timer thread: error code %lu", + GetLastError()))); + } + + /* Request the timer thread to change settings */ + EnterCriticalSection(&timerCommArea.crit_sec); + if (ovalue) + *ovalue = timerCommArea.value; + timerCommArea.value = *value; + LeaveCriticalSection(&timerCommArea.crit_sec); + SetEvent(timerCommArea.event); + + return 0; +} diff --git a/src/backend/port/win32_sema.c b/src/backend/port/win32_sema.c new file mode 100644 index 0000000..8e9c0f9 --- /dev/null +++ b/src/backend/port/win32_sema.c @@ -0,0 +1,235 @@ +/*------------------------------------------------------------------------- + * + * win32_sema.c + * Microsoft Windows Win32 Semaphores Emulation + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/port/win32_sema.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" +#include "storage/ipc.h" +#include "storage/pg_sema.h" + +static HANDLE *mySemSet; /* IDs of sema sets acquired so far */ +static int numSems; /* number of sema sets acquired so far */ +static int maxSems; /* allocated size of mySemaSet array */ + +static void ReleaseSemaphores(int code, Datum arg); + + +/* + * Report amount of shared memory needed for semaphores + */ +Size +PGSemaphoreShmemSize(int maxSemas) +{ + /* No shared memory needed on Windows */ + return 0; +} + +/* + * PGReserveSemaphores --- initialize semaphore support + * + * In the Win32 implementation, we acquire semaphores on-demand; the + * maxSemas parameter is just used to size the array that keeps track of + * acquired semas for subsequent releasing. We use anonymous semaphores + * so the semaphores are automatically freed when the last referencing + * process exits. 
+ */ +void +PGReserveSemaphores(int maxSemas) +{ + mySemSet = (HANDLE *) malloc(maxSemas * sizeof(HANDLE)); + if (mySemSet == NULL) + elog(PANIC, "out of memory"); + numSems = 0; + maxSems = maxSemas; + + on_shmem_exit(ReleaseSemaphores, 0); +} + +/* + * Release semaphores at shutdown or shmem reinitialization + * + * (called as an on_shmem_exit callback, hence funny argument list) + */ +static void +ReleaseSemaphores(int code, Datum arg) +{ + int i; + + for (i = 0; i < numSems; i++) + CloseHandle(mySemSet[i]); + free(mySemSet); +} + +/* + * PGSemaphoreCreate + * + * Allocate a PGSemaphore structure with initial count 1 + */ +PGSemaphore +PGSemaphoreCreate(void) +{ + HANDLE cur_handle; + SECURITY_ATTRIBUTES sec_attrs; + + /* Can't do this in a backend, because static state is postmaster's */ + Assert(!IsUnderPostmaster); + + if (numSems >= maxSems) + elog(PANIC, "too many semaphores created"); + + ZeroMemory(&sec_attrs, sizeof(sec_attrs)); + sec_attrs.nLength = sizeof(sec_attrs); + sec_attrs.lpSecurityDescriptor = NULL; + sec_attrs.bInheritHandle = TRUE; + + /* We don't need a named semaphore */ + cur_handle = CreateSemaphore(&sec_attrs, 1, 32767, NULL); + if (cur_handle) + { + /* Successfully done */ + mySemSet[numSems++] = cur_handle; + } + else + ereport(PANIC, + (errmsg("could not create semaphore: error code %lu", + GetLastError()))); + + return (PGSemaphore) cur_handle; +} + +/* + * PGSemaphoreReset + * + * Reset a previously-initialized PGSemaphore to have count 0 + */ +void +PGSemaphoreReset(PGSemaphore sema) +{ + /* + * There's no direct API for this in Win32, so we have to ratchet the + * semaphore down to 0 with repeated trylock's. + */ + while (PGSemaphoreTryLock(sema)) + /* loop */ ; +} + +/* + * PGSemaphoreLock + * + * Lock a semaphore (decrement count), blocking if count would be < 0. + */ +void +PGSemaphoreLock(PGSemaphore sema) +{ + HANDLE wh[2]; + bool done = false; + + /* + * Note: pgwin32_signal_event should be first to ensure that it will be + * reported when multiple events are set. We want to guarantee that + * pending signals are serviced. + */ + wh[0] = pgwin32_signal_event; + wh[1] = sema; + + /* + * As in other implementations of PGSemaphoreLock, we need to check for + * cancel/die interrupts each time through the loop. But here, there is + * no hidden magic about whether the syscall will internally service a + * signal --- we do that ourselves. + */ + while (!done) + { + DWORD rc; + + CHECK_FOR_INTERRUPTS(); + + rc = WaitForMultipleObjectsEx(2, wh, FALSE, INFINITE, TRUE); + switch (rc) + { + case WAIT_OBJECT_0: + /* Signal event is set - we have a signal to deliver */ + pgwin32_dispatch_queued_signals(); + break; + case WAIT_OBJECT_0 + 1: + /* We got it! */ + done = true; + break; + case WAIT_IO_COMPLETION: + + /* + * The system interrupted the wait to execute an I/O + * completion routine or asynchronous procedure call in this + * thread. PostgreSQL does not provoke either of these, but + * atypical loaded DLLs or even other processes might do so. + * Now, resume waiting. 
+ */ + break; + case WAIT_FAILED: + ereport(FATAL, + (errmsg("could not lock semaphore: error code %lu", + GetLastError()))); + break; + default: + elog(FATAL, "unexpected return code from WaitForMultipleObjectsEx(): %lu", rc); + break; + } + } +} + +/* + * PGSemaphoreUnlock + * + * Unlock a semaphore (increment count) + */ +void +PGSemaphoreUnlock(PGSemaphore sema) +{ + if (!ReleaseSemaphore(sema, 1, NULL)) + ereport(FATAL, + (errmsg("could not unlock semaphore: error code %lu", + GetLastError()))); +} + +/* + * PGSemaphoreTryLock + * + * Lock a semaphore only if able to do so without blocking + */ +bool +PGSemaphoreTryLock(PGSemaphore sema) +{ + DWORD ret; + + ret = WaitForSingleObject(sema, 0); + + if (ret == WAIT_OBJECT_0) + { + /* We got it! */ + return true; + } + else if (ret == WAIT_TIMEOUT) + { + /* Can't get it */ + errno = EAGAIN; + return false; + } + + /* Otherwise we are in trouble */ + ereport(FATAL, + (errmsg("could not try-lock semaphore: error code %lu", + GetLastError()))); + + /* keep compiler quiet */ + return false; +} diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c new file mode 100644 index 0000000..6cf6941 --- /dev/null +++ b/src/backend/port/win32_shmem.c @@ -0,0 +1,621 @@ +/*------------------------------------------------------------------------- + * + * win32_shmem.c + * Implement shared memory using win32 facilities + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/port/win32_shmem.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "miscadmin.h" +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "storage/pg_shmem.h" + +/* + * Early in a process's life, Windows asynchronously creates threads for the + * process's "default thread pool" + * (https://docs.microsoft.com/en-us/windows/desktop/ProcThread/thread-pools). + * Occasionally, thread creation allocates a stack after + * PGSharedMemoryReAttach() has released UsedShmemSegAddr and before it has + * mapped shared memory at UsedShmemSegAddr. This would cause mapping to fail + * if the allocator preferred the just-released region for allocating the new + * thread stack. We observed such failures in some Windows Server 2016 + * configurations. To give the system another region to prefer, reserve and + * release an additional, protective region immediately before reserving or + * releasing shared memory. The idea is that, if the allocator handed out + * REGION1 pages before REGION2 pages at one occasion, it will do so whenever + * both regions are free. Windows Server 2016 exhibits that behavior, and a + * system behaving differently would have less need to protect + * UsedShmemSegAddr. The protective region must be at least large enough for + * one thread stack. However, ten times as much is less than 2% of the 32-bit + * address space and is negligible relative to the 64-bit address space. + */ +#define PROTECTIVE_REGION_SIZE (10 * WIN32_STACK_RLIMIT) +void *ShmemProtectiveRegion = NULL; + +HANDLE UsedShmemSegID = INVALID_HANDLE_VALUE; +void *UsedShmemSegAddr = NULL; +static Size UsedShmemSegSize = 0; + +static bool EnableLockPagesPrivilege(int elevel); +static void pgwin32_SharedMemoryDelete(int status, Datum shmId); + +/* + * Generate shared memory segment name. Expand the data directory, to generate + * an identifier unique for this data directory. 
Then replace all backslashes + * with forward slashes, since backslashes aren't permitted in global object names. + * + * Store the shared memory segment in the Global\ namespace (requires NT2 TSE or + * 2000, but that's all we support for other reasons as well), to make sure you can't + * open two postmasters in different sessions against the same data directory. + * + * XXX: What happens with junctions? It's only someone breaking things on purpose, + * and this is still better than before, but we might want to do something about + * that sometime in the future. + */ +static char * +GetSharedMemName(void) +{ + char *retptr; + DWORD bufsize; + DWORD r; + char *cp; + + bufsize = GetFullPathName(DataDir, 0, NULL, NULL); + if (bufsize == 0) + elog(FATAL, "could not get size for full pathname of datadir %s: error code %lu", + DataDir, GetLastError()); + + retptr = malloc(bufsize + 18); /* 18 for Global\PostgreSQL: */ + if (retptr == NULL) + elog(FATAL, "could not allocate memory for shared memory name"); + + strcpy(retptr, "Global\\PostgreSQL:"); + r = GetFullPathName(DataDir, bufsize, retptr + 18, NULL); + if (r == 0 || r > bufsize) + elog(FATAL, "could not generate full pathname for datadir %s: error code %lu", + DataDir, GetLastError()); + + /* + * XXX: Intentionally overwriting the Global\ part here. This was not the + * original approach, but putting it in the actual Global\ namespace + * causes permission errors in a lot of cases, so we leave it in the + * default namespace for now. + */ + for (cp = retptr; *cp; cp++) + if (*cp == '\\') + *cp = '/'; + + return retptr; +} + + +/* + * PGSharedMemoryIsInUse + * + * Is a previously-existing shmem segment still existing and in use? + * + * The point of this exercise is to detect the case where a prior postmaster + * crashed, but it left child backends that are still running. Therefore + * we only care about shmem segments that are associated with the intended + * DataDir. This is an important consideration since accidental matches of + * shmem segment IDs are reasonably common. + */ +bool +PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2) +{ + char *szShareMem; + HANDLE hmap; + + szShareMem = GetSharedMemName(); + + hmap = OpenFileMapping(FILE_MAP_READ, FALSE, szShareMem); + + free(szShareMem); + + if (hmap == NULL) + return false; + + CloseHandle(hmap); + return true; +} + +/* + * EnableLockPagesPrivilege + * + * Try to acquire SeLockMemoryPrivilege so we can use large pages. + */ +static bool +EnableLockPagesPrivilege(int elevel) +{ + HANDLE hToken; + TOKEN_PRIVILEGES tp; + LUID luid; + + if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken)) + { + ereport(elevel, + (errmsg("could not enable user right \"%s\": error code %lu", + + /* + * translator: This is a term from Windows and should be translated to + * match the Windows localization. 
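+ * The same English string appears in the Windows security policy
+ * editor (secpol.msc) under "User Rights Assignment".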
+ */ + _("Lock pages in memory"), + GetLastError()), + errdetail("Failed system call was %s.", "OpenProcessToken"))); + return FALSE; + } + + if (!LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &luid)) + { + ereport(elevel, + (errmsg("could not enable user right \"%s\": error code %lu", _("Lock pages in memory"), GetLastError()), + errdetail("Failed system call was %s.", "LookupPrivilegeValue"))); + CloseHandle(hToken); + return FALSE; + } + tp.PrivilegeCount = 1; + tp.Privileges[0].Luid = luid; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + + if (!AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL)) + { + ereport(elevel, + (errmsg("could not enable user right \"%s\": error code %lu", _("Lock pages in memory"), GetLastError()), + errdetail("Failed system call was %s.", "AdjustTokenPrivileges"))); + CloseHandle(hToken); + return FALSE; + } + + if (GetLastError() != ERROR_SUCCESS) + { + if (GetLastError() == ERROR_NOT_ALL_ASSIGNED) + ereport(elevel, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("could not enable user right \"%s\"", _("Lock pages in memory")), + errhint("Assign user right \"%s\" to the Windows user account which runs PostgreSQL.", + _("Lock pages in memory")))); + else + ereport(elevel, + (errmsg("could not enable user right \"%s\": error code %lu", _("Lock pages in memory"), GetLastError()), + errdetail("Failed system call was %s.", "AdjustTokenPrivileges"))); + CloseHandle(hToken); + return FALSE; + } + + CloseHandle(hToken); + + return TRUE; +} + +/* + * PGSharedMemoryCreate + * + * Create a shared memory segment of the given size and initialize its + * standard header. + */ +PGShmemHeader * +PGSharedMemoryCreate(Size size, + PGShmemHeader **shim) +{ + void *memAddress; + PGShmemHeader *hdr; + HANDLE hmap, + hmap2; + char *szShareMem; + int i; + DWORD size_high; + DWORD size_low; + SIZE_T largePageSize = 0; + Size orig_size = size; + DWORD flProtect = PAGE_READWRITE; + + ShmemProtectiveRegion = VirtualAlloc(NULL, PROTECTIVE_REGION_SIZE, + MEM_RESERVE, PAGE_NOACCESS); + if (ShmemProtectiveRegion == NULL) + elog(FATAL, "could not reserve memory region: error code %lu", + GetLastError()); + + /* Room for a header? */ + Assert(size > MAXALIGN(sizeof(PGShmemHeader))); + + szShareMem = GetSharedMemName(); + + UsedShmemSegAddr = NULL; + + if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY) + { + /* Does the processor support large pages? */ + largePageSize = GetLargePageMinimum(); + if (largePageSize == 0) + { + ereport(huge_pages == HUGE_PAGES_ON ? FATAL : DEBUG1, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("the processor does not support large pages"))); + ereport(DEBUG1, + (errmsg_internal("disabling huge pages"))); + } + else if (!EnableLockPagesPrivilege(huge_pages == HUGE_PAGES_ON ? FATAL : DEBUG1)) + { + ereport(DEBUG1, + (errmsg_internal("disabling huge pages"))); + } + else + { + /* Huge pages available and privilege enabled, so turn on */ + flProtect = PAGE_READWRITE | SEC_COMMIT | SEC_LARGE_PAGES; + + /* Round size up as appropriate. */ + if (size % largePageSize != 0) + size += largePageSize - (size % largePageSize); + } + } + +retry: +#ifdef _WIN64 + size_high = size >> 32; +#else + size_high = 0; +#endif + size_low = (DWORD) size; + + /* + * When recycling a shared memory segment, it may take a short while + * before it gets dropped from the global namespace. So re-try after + * sleeping for a second, and continue retrying 10 times. 
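+ * The name stays claimed until every handle to the old segment is
+ * closed; CreateFileMapping() reporting ERROR_ALREADY_EXISTS below is
+ * how we notice that it is still around.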
(both the 1 + * second time and the 10 retries are completely arbitrary) + */ + for (i = 0; i < 10; i++) + { + /* + * In case CreateFileMapping() doesn't set the error code to 0 on + * success + */ + SetLastError(0); + + hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */ + NULL, /* Default security attrs */ + flProtect, + size_high, /* Size Upper 32 Bits */ + size_low, /* Size Lower 32 bits */ + szShareMem); + + if (!hmap) + { + if (GetLastError() == ERROR_NO_SYSTEM_RESOURCES && + huge_pages == HUGE_PAGES_TRY && + (flProtect & SEC_LARGE_PAGES) != 0) + { + elog(DEBUG1, "CreateFileMapping(%zu) with SEC_LARGE_PAGES failed, " + "huge pages disabled", + size); + + /* + * Use the original size, not the rounded-up value, when + * falling back to non-huge pages. + */ + size = orig_size; + flProtect = PAGE_READWRITE; + goto retry; + } + else + ereport(FATAL, + (errmsg("could not create shared memory segment: error code %lu", GetLastError()), + errdetail("Failed system call was CreateFileMapping(size=%zu, name=%s).", + size, szShareMem))); + } + + /* + * If the segment already existed, CreateFileMapping() will return a + * handle to the existing one and set ERROR_ALREADY_EXISTS. + */ + if (GetLastError() == ERROR_ALREADY_EXISTS) + { + CloseHandle(hmap); /* Close the handle, since we got a valid one + * to the previous segment. */ + hmap = NULL; + Sleep(1000); + continue; + } + break; + } + + /* + * If the last call in the loop still returned ERROR_ALREADY_EXISTS, this + * shared memory segment exists and we assume it belongs to somebody else. + */ + if (!hmap) + ereport(FATAL, + (errmsg("pre-existing shared memory block is still in use"), + errhint("Check if there are any old server processes still running, and terminate them."))); + + free(szShareMem); + + /* + * Make the handle inheritable + */ + if (!DuplicateHandle(GetCurrentProcess(), hmap, GetCurrentProcess(), &hmap2, 0, TRUE, DUPLICATE_SAME_ACCESS)) + ereport(FATAL, + (errmsg("could not create shared memory segment: error code %lu", GetLastError()), + errdetail("Failed system call was DuplicateHandle."))); + + /* + * Close the old, non-inheritable handle. If this fails we don't really + * care. + */ + if (!CloseHandle(hmap)) + elog(LOG, "could not close handle to shared memory: error code %lu", GetLastError()); + + + /* + * Get a pointer to the new shared memory segment. Map the whole segment + * at once, and let the system decide on the initial address. + */ + memAddress = MapViewOfFileEx(hmap2, FILE_MAP_WRITE | FILE_MAP_READ, 0, 0, 0, NULL); + if (!memAddress) + ereport(FATAL, + (errmsg("could not create shared memory segment: error code %lu", GetLastError()), + errdetail("Failed system call was MapViewOfFileEx."))); + + + + /* + * OK, we created a new segment. Mark it as created by this process. The + * order of assignments here is critical so that another Postgres process + * can't see the header as valid but belonging to an invalid PID! + */ + hdr = (PGShmemHeader *) memAddress; + hdr->creatorPID = getpid(); + hdr->magic = PGShmemMagic; + + /* + * Initialize space allocation status for segment. 
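+ * freeoffset marks where ShmemAlloc'd allocations will begin; only the
+ * MAXALIGN'd header itself is reserved ahead of it.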
+ */ + hdr->totalsize = size; + hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); + hdr->dsm_control = 0; + + /* Save info for possible future use */ + UsedShmemSegAddr = memAddress; + UsedShmemSegSize = size; + UsedShmemSegID = hmap2; + + /* Register on-exit routine to delete the new segment */ + on_shmem_exit(pgwin32_SharedMemoryDelete, PointerGetDatum(hmap2)); + + *shim = hdr; + return hdr; +} + +/* + * PGSharedMemoryReAttach + * + * This is called during startup of a postmaster child process to re-attach to + * an already existing shared memory segment, using the handle inherited from + * the postmaster. + * + * ShmemProtectiveRegion, UsedShmemSegID and UsedShmemSegAddr are implicit + * parameters to this routine. The caller must have already restored them to + * the postmaster's values. + */ +void +PGSharedMemoryReAttach(void) +{ + PGShmemHeader *hdr; + void *origUsedShmemSegAddr = UsedShmemSegAddr; + + Assert(ShmemProtectiveRegion != NULL); + Assert(UsedShmemSegAddr != NULL); + Assert(IsUnderPostmaster); + + /* + * Release memory region reservations made by the postmaster + */ + if (VirtualFree(ShmemProtectiveRegion, 0, MEM_RELEASE) == 0) + elog(FATAL, "failed to release reserved memory region (addr=%p): error code %lu", + ShmemProtectiveRegion, GetLastError()); + if (VirtualFree(UsedShmemSegAddr, 0, MEM_RELEASE) == 0) + elog(FATAL, "failed to release reserved memory region (addr=%p): error code %lu", + UsedShmemSegAddr, GetLastError()); + + hdr = (PGShmemHeader *) MapViewOfFileEx(UsedShmemSegID, FILE_MAP_READ | FILE_MAP_WRITE, 0, 0, 0, UsedShmemSegAddr); + if (!hdr) + elog(FATAL, "could not reattach to shared memory (key=%p, addr=%p): error code %lu", + UsedShmemSegID, UsedShmemSegAddr, GetLastError()); + if (hdr != origUsedShmemSegAddr) + elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)", + hdr, origUsedShmemSegAddr); + if (hdr->magic != PGShmemMagic) + elog(FATAL, "reattaching to shared memory returned non-PostgreSQL memory"); + dsm_set_control_handle(hdr->dsm_control); + + UsedShmemSegAddr = hdr; /* probably redundant */ +} + +/* + * PGSharedMemoryNoReAttach + * + * This is called during startup of a postmaster child process when we choose + * *not* to re-attach to the existing shared memory segment. We must clean up + * to leave things in the appropriate state. + * + * The child process startup logic might or might not call PGSharedMemoryDetach + * after this; make sure that it will be a no-op if called. + * + * ShmemProtectiveRegion, UsedShmemSegID and UsedShmemSegAddr are implicit + * parameters to this routine. The caller must have already restored them to + * the postmaster's values. + */ +void +PGSharedMemoryNoReAttach(void) +{ + Assert(ShmemProtectiveRegion != NULL); + Assert(UsedShmemSegAddr != NULL); + Assert(IsUnderPostmaster); + + /* + * Under Windows we will not have mapped the segment, so we don't need to + * un-map it. Just reset UsedShmemSegAddr to show we're not attached. + */ + UsedShmemSegAddr = NULL; + + /* + * We *must* close the inherited shmem segment handle, else Windows will + * consider the existence of this process to mean it can't release the + * shmem segment yet. We can now use PGSharedMemoryDetach to do that. + */ + PGSharedMemoryDetach(); +} + +/* + * PGSharedMemoryDetach + * + * Detach from the shared memory segment, if still attached. This is not + * intended to be called explicitly by the process that originally created the + * segment (it will have an on_shmem_exit callback registered to do that). 
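+ * (That callback is pgwin32_SharedMemoryDelete, below, which simply
+ * ends up calling this function.)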
+ * Rather, this is for subprocesses that have inherited an attachment and want
+ * to get rid of it.
+ *
+ * ShmemProtectiveRegion, UsedShmemSegID and UsedShmemSegAddr are implicit
+ * parameters to this routine.
+ */
+void
+PGSharedMemoryDetach(void)
+{
+ /*
+ * Releasing the protective region liberates an unimportant quantity of
+ * address space, but be tidy.
+ */
+ if (ShmemProtectiveRegion != NULL)
+ {
+ if (VirtualFree(ShmemProtectiveRegion, 0, MEM_RELEASE) == 0)
+ elog(LOG, "failed to release reserved memory region (addr=%p): error code %lu",
+ ShmemProtectiveRegion, GetLastError());
+
+ ShmemProtectiveRegion = NULL;
+ }
+
+ /* Unmap the view, if it's mapped */
+ if (UsedShmemSegAddr != NULL)
+ {
+ if (!UnmapViewOfFile(UsedShmemSegAddr))
+ elog(LOG, "could not unmap view of shared memory: error code %lu",
+ GetLastError());
+
+ UsedShmemSegAddr = NULL;
+ }
+
+ /* And close the shmem handle, if we have one */
+ if (UsedShmemSegID != INVALID_HANDLE_VALUE)
+ {
+ if (!CloseHandle(UsedShmemSegID))
+ elog(LOG, "could not close handle to shared memory: error code %lu",
+ GetLastError());
+
+ UsedShmemSegID = INVALID_HANDLE_VALUE;
+ }
+}
+
+
+/*
+ * pgwin32_SharedMemoryDelete
+ *
+ * Detach from and delete the shared memory segment
+ * (called as an on_shmem_exit callback, hence funny argument list)
+ */
+static void
+pgwin32_SharedMemoryDelete(int status, Datum shmId)
+{
+ Assert(DatumGetPointer(shmId) == UsedShmemSegID);
+ PGSharedMemoryDetach();
+}
+
+/*
+ * pgwin32_ReserveSharedMemoryRegion(hChild)
+ *
+ * Reserve the memory region that will be used for shared memory in a child
+ * process. It is called before the child process starts, to make sure the
+ * memory is available.
+ *
+ * Once the child starts, DLLs loading in different order or threads getting
+ * scheduled differently may allocate memory which can conflict with the
+ * address space we need for our shared memory. Reserving the shared memory
+ * region before the child starts, and freeing it only just before we attempt
+ * to attach to the shared memory, forces these allocations into different
+ * address ranges that don't conflict.
+ *
+ * NOTE! This function executes in the postmaster, and should for this
+ * reason not use elog(FATAL) since that would take down the postmaster.
+ */
+int
+pgwin32_ReserveSharedMemoryRegion(HANDLE hChild)
+{
+ void *address;
+
+ Assert(ShmemProtectiveRegion != NULL);
+ Assert(UsedShmemSegAddr != NULL);
+ Assert(UsedShmemSegSize != 0);
+
+ /* ShmemProtectiveRegion */
+ address = VirtualAllocEx(hChild, ShmemProtectiveRegion,
+ PROTECTIVE_REGION_SIZE,
+ MEM_RESERVE, PAGE_NOACCESS);
+ if (address == NULL)
+ {
+ /* Don't use FATAL since we're running in the postmaster */
+ elog(LOG, "could not reserve shared memory region (addr=%p) for child %p: error code %lu",
+ ShmemProtectiveRegion, hChild, GetLastError());
+ return false;
+ }
+ if (address != ShmemProtectiveRegion)
+ {
+ /*
+ * Should never happen - in theory if allocation granularity causes
+ * strange effects it could, so check just in case.
+ *
+ * Don't use FATAL since we're running in the postmaster.
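+ * (VirtualAllocEx rounds a requested address down to the allocation
+ * granularity; the addresses we pass were themselves returned by
+ * VirtualAlloc, so they should already be suitably aligned.)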
+ */
+ elog(LOG, "reserved shared memory region got incorrect address %p, expected %p",
+ address, ShmemProtectiveRegion);
+ return false;
+ }
+
+ /* UsedShmemSegAddr */
+ address = VirtualAllocEx(hChild, UsedShmemSegAddr, UsedShmemSegSize,
+ MEM_RESERVE, PAGE_READWRITE);
+ if (address == NULL)
+ {
+ elog(LOG, "could not reserve shared memory region (addr=%p) for child %p: error code %lu",
+ UsedShmemSegAddr, hChild, GetLastError());
+ return false;
+ }
+ if (address != UsedShmemSegAddr)
+ {
+ elog(LOG, "reserved shared memory region got incorrect address %p, expected %p",
+ address, UsedShmemSegAddr);
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * This function is provided for consistency with sysv_shmem.c and does not
+ * provide any useful information for Windows. To obtain the large page size,
+ * use GetLargePageMinimum() instead.
+ */
+void
+GetHugePageSize(Size *hugepagesize, int *mmap_flags)
+{
+ if (hugepagesize)
+ *hugepagesize = 0;
+ if (mmap_flags)
+ *mmap_flags = 0;
+}