summaryrefslogtreecommitdiffstats
path: root/src/bin/pg_dump/parallel.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/bin/pg_dump/parallel.c')
-rw-r--r--src/bin/pg_dump/parallel.c1801
1 files changed, 1801 insertions, 0 deletions
diff --git a/src/bin/pg_dump/parallel.c b/src/bin/pg_dump/parallel.c
new file mode 100644
index 0000000..da0723a
--- /dev/null
+++ b/src/bin/pg_dump/parallel.c
@@ -0,0 +1,1801 @@
+/*-------------------------------------------------------------------------
+ *
+ * parallel.c
+ *
+ * Parallel support for pg_dump and pg_restore
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/bin/pg_dump/parallel.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * Parallel operation works like this:
+ *
+ * The original, leader process calls ParallelBackupStart(), which forks off
+ * the desired number of worker processes, which each enter WaitForCommands().
+ *
+ * The leader process dispatches an individual work item to one of the worker
+ * processes in DispatchJobForTocEntry(). We send a command string such as
+ * "DUMP 1234" or "RESTORE 1234", where 1234 is the TocEntry ID.
+ * The worker process receives and decodes the command and passes it to the
+ * routine pointed to by AH->WorkerJobDumpPtr or AH->WorkerJobRestorePtr,
+ * which are routines of the current archive format. That routine performs
+ * the required action (dump or restore) and returns an integer status code.
+ * This is passed back to the leader where we pass it to the
+ * ParallelCompletionPtr callback function that was passed to
+ * DispatchJobForTocEntry(). The callback function does state updating
+ * for the leader control logic in pg_backup_archiver.c.
+ *
+ * In principle additional archive-format-specific information might be needed
+ * in commands or worker status responses, but so far that hasn't proved
+ * necessary, since workers have full copies of the ArchiveHandle/TocEntry
+ * data structures. Remember that we have forked off the workers only after
+ * we have read in the catalog. That's why our worker processes can also
+ * access the catalog information. (In the Windows case, the workers are
+ * threads in the same process. To avoid problems, they work with cloned
+ * copies of the Archive data structure; see RunWorker().)
+ *
+ * In the leader process, the workerStatus field for each worker has one of
+ * the following values:
+ * WRKR_NOT_STARTED: we've not yet forked this worker
+ * WRKR_IDLE: it's waiting for a command
+ * WRKR_WORKING: it's working on a command
+ * WRKR_TERMINATED: process ended
+ * The pstate->te[] entry for each worker is valid when it's in WRKR_WORKING
+ * state, and must be NULL in other states.
+ */
+
+#include "postgres_fe.h"
+
+#ifndef WIN32
+#include <sys/select.h>
+#include <sys/wait.h>
+#include <signal.h>
+#include <unistd.h>
+#include <fcntl.h>
+#endif
+
+#include "fe_utils/string_utils.h"
+#include "parallel.h"
+#include "pg_backup_utils.h"
+#include "port/pg_bswap.h"
+
+/* Mnemonic macros for indexing the fd array returned by pipe(2) */
+#define PIPE_READ 0
+#define PIPE_WRITE 1
+
+#define NO_SLOT (-1) /* Failure result for GetIdleWorker() */
+
+/* Worker process statuses */
+typedef enum
+{
+ WRKR_NOT_STARTED = 0,
+ WRKR_IDLE,
+ WRKR_WORKING,
+ WRKR_TERMINATED
+} T_WorkerStatus;
+
+#define WORKER_IS_RUNNING(workerStatus) \
+ ((workerStatus) == WRKR_IDLE || (workerStatus) == WRKR_WORKING)
+
+/*
+ * Private per-parallel-worker state (typedef for this is in parallel.h).
+ *
+ * Much of this is valid only in the leader process (or, on Windows, should
+ * be touched only by the leader thread). But the AH field should be touched
+ * only by workers. The pipe descriptors are valid everywhere.
+ */
+struct ParallelSlot
+{
+ T_WorkerStatus workerStatus; /* see enum above */
+
+ /* These fields are valid if workerStatus == WRKR_WORKING: */
+ ParallelCompletionPtr callback; /* function to call on completion */
+ void *callback_data; /* passthrough data for it */
+
+ ArchiveHandle *AH; /* Archive data worker is using */
+
+ int pipeRead; /* leader's end of the pipes */
+ int pipeWrite;
+ int pipeRevRead; /* child's end of the pipes */
+ int pipeRevWrite;
+
+ /* Child process/thread identity info: */
+#ifdef WIN32
+ uintptr_t hThread;
+ unsigned int threadId;
+#else
+ pid_t pid;
+#endif
+};
+
+#ifdef WIN32
+
+/*
+ * Structure to hold info passed by _beginthreadex() to the function it calls
+ * via its single allowed argument.
+ */
+typedef struct
+{
+ ArchiveHandle *AH; /* leader database connection */
+ ParallelSlot *slot; /* this worker's parallel slot */
+} WorkerInfo;
+
+/* Windows implementation of pipe access */
+static int pgpipe(int handles[2]);
+#define piperead(a,b,c) recv(a,b,c,0)
+#define pipewrite(a,b,c) send(a,b,c,0)
+
+#else /* !WIN32 */
+
+/* Non-Windows implementation of pipe access */
+#define pgpipe(a) pipe(a)
+#define piperead(a,b,c) read(a,b,c)
+#define pipewrite(a,b,c) write(a,b,c)
+
+#endif /* WIN32 */
+
+/*
+ * State info for archive_close_connection() shutdown callback.
+ */
+typedef struct ShutdownInformation
+{
+ ParallelState *pstate;
+ Archive *AHX;
+} ShutdownInformation;
+
+static ShutdownInformation shutdown_info;
+
+/*
+ * State info for signal handling.
+ * We assume signal_info initializes to zeroes.
+ *
+ * On Unix, myAH is the leader DB connection in the leader process, and the
+ * worker's own connection in worker processes. On Windows, we have only one
+ * instance of signal_info, so myAH is the leader connection and the worker
+ * connections must be dug out of pstate->parallelSlot[].
+ */
+typedef struct DumpSignalInformation
+{
+ ArchiveHandle *myAH; /* database connection to issue cancel for */
+ ParallelState *pstate; /* parallel state, if any */
+ bool handler_set; /* signal handler set up in this process? */
+#ifndef WIN32
+ bool am_worker; /* am I a worker process? */
+#endif
+} DumpSignalInformation;
+
+static volatile DumpSignalInformation signal_info;
+
+#ifdef WIN32
+static CRITICAL_SECTION signal_info_lock;
+#endif
+
+/*
+ * Write a simple string to stderr --- must be safe in a signal handler.
+ * We ignore the write() result since there's not much we could do about it.
+ * Certain compilers make that harder than it ought to be.
+ */
+#define write_stderr(str) \
+ do { \
+ const char *str_ = (str); \
+ int rc_; \
+ rc_ = write(fileno(stderr), str_, strlen(str_)); \
+ (void) rc_; \
+ } while (0)
+
+
+#ifdef WIN32
+/* file-scope variables */
+static DWORD tls_index;
+
+/* globally visible variables (needed by exit_nicely) */
+bool parallel_init_done = false;
+DWORD mainThreadId;
+#endif /* WIN32 */
+
+/* Local function prototypes */
+static ParallelSlot *GetMyPSlot(ParallelState *pstate);
+static void archive_close_connection(int code, void *arg);
+static void ShutdownWorkersHard(ParallelState *pstate);
+static void WaitForTerminatingWorkers(ParallelState *pstate);
+static void setup_cancel_handler(void);
+static void set_cancel_pstate(ParallelState *pstate);
+static void set_cancel_slot_archive(ParallelSlot *slot, ArchiveHandle *AH);
+static void RunWorker(ArchiveHandle *AH, ParallelSlot *slot);
+static int GetIdleWorker(ParallelState *pstate);
+static bool HasEveryWorkerTerminated(ParallelState *pstate);
+static void lockTableForWorker(ArchiveHandle *AH, TocEntry *te);
+static void WaitForCommands(ArchiveHandle *AH, int pipefd[2]);
+static bool ListenToWorkers(ArchiveHandle *AH, ParallelState *pstate,
+ bool do_wait);
+static char *getMessageFromLeader(int pipefd[2]);
+static void sendMessageToLeader(int pipefd[2], const char *str);
+static int select_loop(int maxFd, fd_set *workerset);
+static char *getMessageFromWorker(ParallelState *pstate,
+ bool do_wait, int *worker);
+static void sendMessageToWorker(ParallelState *pstate,
+ int worker, const char *str);
+static char *readMessageFromPipe(int fd);
+
+#define messageStartsWith(msg, prefix) \
+ (strncmp(msg, prefix, strlen(prefix)) == 0)
+
+
+/*
+ * Initialize parallel dump support --- should be called early in process
+ * startup. (Currently, this is called whether or not we intend parallel
+ * activity.)
+ */
+void
+init_parallel_dump_utils(void)
+{
+#ifdef WIN32
+ if (!parallel_init_done)
+ {
+ WSADATA wsaData;
+ int err;
+
+ /* Prepare for threaded operation */
+ tls_index = TlsAlloc();
+ mainThreadId = GetCurrentThreadId();
+
+ /* Initialize socket access */
+ err = WSAStartup(MAKEWORD(2, 2), &wsaData);
+ if (err != 0)
+ pg_fatal("%s() failed: error code %d", "WSAStartup", err);
+
+ parallel_init_done = true;
+ }
+#endif
+}
+
+/*
+ * Find the ParallelSlot for the current worker process or thread.
+ *
+ * Returns NULL if no matching slot is found (this implies we're the leader).
+ */
+static ParallelSlot *
+GetMyPSlot(ParallelState *pstate)
+{
+ int i;
+
+ for (i = 0; i < pstate->numWorkers; i++)
+ {
+#ifdef WIN32
+ if (pstate->parallelSlot[i].threadId == GetCurrentThreadId())
+#else
+ if (pstate->parallelSlot[i].pid == getpid())
+#endif
+ return &(pstate->parallelSlot[i]);
+ }
+
+ return NULL;
+}
+
+/*
+ * A thread-local version of getLocalPQExpBuffer().
+ *
+ * Non-reentrant but reduces memory leakage: we'll consume one buffer per
+ * thread, which is much better than one per fmtId/fmtQualifiedId call.
+ */
+#ifdef WIN32
+static PQExpBuffer
+getThreadLocalPQExpBuffer(void)
+{
+ /*
+ * The Tls code goes awry if we use a static var, so we provide for both
+ * static and auto, and omit any use of the static var when using Tls. We
+ * rely on TlsGetValue() to return 0 if the value is not yet set.
+ */
+ static PQExpBuffer s_id_return = NULL;
+ PQExpBuffer id_return;
+
+ if (parallel_init_done)
+ id_return = (PQExpBuffer) TlsGetValue(tls_index);
+ else
+ id_return = s_id_return;
+
+ if (id_return) /* first time through? */
+ {
+ /* same buffer, just wipe contents */
+ resetPQExpBuffer(id_return);
+ }
+ else
+ {
+ /* new buffer */
+ id_return = createPQExpBuffer();
+ if (parallel_init_done)
+ TlsSetValue(tls_index, id_return);
+ else
+ s_id_return = id_return;
+ }
+
+ return id_return;
+}
+#endif /* WIN32 */
+
+/*
+ * pg_dump and pg_restore call this to register the cleanup handler
+ * as soon as they've created the ArchiveHandle.
+ */
+void
+on_exit_close_archive(Archive *AHX)
+{
+ shutdown_info.AHX = AHX;
+ on_exit_nicely(archive_close_connection, &shutdown_info);
+}
+
+/*
+ * on_exit_nicely handler for shutting down database connections and
+ * worker processes cleanly.
+ */
+static void
+archive_close_connection(int code, void *arg)
+{
+ ShutdownInformation *si = (ShutdownInformation *) arg;
+
+ if (si->pstate)
+ {
+ /* In parallel mode, must figure out who we are */
+ ParallelSlot *slot = GetMyPSlot(si->pstate);
+
+ if (!slot)
+ {
+ /*
+ * We're the leader. Forcibly shut down workers, then close our
+ * own database connection, if any.
+ */
+ ShutdownWorkersHard(si->pstate);
+
+ if (si->AHX)
+ DisconnectDatabase(si->AHX);
+ }
+ else
+ {
+ /*
+ * We're a worker. Shut down our own DB connection if any. On
+ * Windows, we also have to close our communication sockets, to
+ * emulate what will happen on Unix when the worker process exits.
+ * (Without this, if this is a premature exit, the leader would
+ * fail to detect it because there would be no EOF condition on
+ * the other end of the pipe.)
+ */
+ if (slot->AH)
+ DisconnectDatabase(&(slot->AH->public));
+
+#ifdef WIN32
+ closesocket(slot->pipeRevRead);
+ closesocket(slot->pipeRevWrite);
+#endif
+ }
+ }
+ else
+ {
+ /* Non-parallel operation: just kill the leader DB connection */
+ if (si->AHX)
+ DisconnectDatabase(si->AHX);
+ }
+}
+
+/*
+ * Forcibly shut down any remaining workers, waiting for them to finish.
+ *
+ * Note that we don't expect to come here during normal exit (the workers
+ * should be long gone, and the ParallelState too). We're only here in a
+ * pg_fatal() situation, so intervening to cancel active commands is
+ * appropriate.
+ */
+static void
+ShutdownWorkersHard(ParallelState *pstate)
+{
+ int i;
+
+ /*
+ * Close our write end of the sockets so that any workers waiting for
+ * commands know they can exit. (Note: some of the pipeWrite fields might
+ * still be zero, if we failed to initialize all the workers. Hence, just
+ * ignore errors here.)
+ */
+ for (i = 0; i < pstate->numWorkers; i++)
+ closesocket(pstate->parallelSlot[i].pipeWrite);
+
+ /*
+ * Force early termination of any commands currently in progress.
+ */
+#ifndef WIN32
+ /* On non-Windows, send SIGTERM to each worker process. */
+ for (i = 0; i < pstate->numWorkers; i++)
+ {
+ pid_t pid = pstate->parallelSlot[i].pid;
+
+ if (pid != 0)
+ kill(pid, SIGTERM);
+ }
+#else
+
+ /*
+ * On Windows, send query cancels directly to the workers' backends. Use
+ * a critical section to ensure worker threads don't change state.
+ */
+ EnterCriticalSection(&signal_info_lock);
+ for (i = 0; i < pstate->numWorkers; i++)
+ {
+ ArchiveHandle *AH = pstate->parallelSlot[i].AH;
+ char errbuf[1];
+
+ if (AH != NULL && AH->connCancel != NULL)
+ (void) PQcancel(AH->connCancel, errbuf, sizeof(errbuf));
+ }
+ LeaveCriticalSection(&signal_info_lock);
+#endif
+
+ /* Now wait for them to terminate. */
+ WaitForTerminatingWorkers(pstate);
+}
+
+/*
+ * Wait for all workers to terminate.
+ */
+static void
+WaitForTerminatingWorkers(ParallelState *pstate)
+{
+ while (!HasEveryWorkerTerminated(pstate))
+ {
+ ParallelSlot *slot = NULL;
+ int j;
+
+#ifndef WIN32
+ /* On non-Windows, use wait() to wait for next worker to end */
+ int status;
+ pid_t pid = wait(&status);
+
+ /* Find dead worker's slot, and clear the PID field */
+ for (j = 0; j < pstate->numWorkers; j++)
+ {
+ slot = &(pstate->parallelSlot[j]);
+ if (slot->pid == pid)
+ {
+ slot->pid = 0;
+ break;
+ }
+ }
+#else /* WIN32 */
+ /* On Windows, we must use WaitForMultipleObjects() */
+ HANDLE *lpHandles = pg_malloc(sizeof(HANDLE) * pstate->numWorkers);
+ int nrun = 0;
+ DWORD ret;
+ uintptr_t hThread;
+
+ for (j = 0; j < pstate->numWorkers; j++)
+ {
+ if (WORKER_IS_RUNNING(pstate->parallelSlot[j].workerStatus))
+ {
+ lpHandles[nrun] = (HANDLE) pstate->parallelSlot[j].hThread;
+ nrun++;
+ }
+ }
+ ret = WaitForMultipleObjects(nrun, lpHandles, false, INFINITE);
+ Assert(ret != WAIT_FAILED);
+ hThread = (uintptr_t) lpHandles[ret - WAIT_OBJECT_0];
+ free(lpHandles);
+
+ /* Find dead worker's slot, and clear the hThread field */
+ for (j = 0; j < pstate->numWorkers; j++)
+ {
+ slot = &(pstate->parallelSlot[j]);
+ if (slot->hThread == hThread)
+ {
+ /* For cleanliness, close handles for dead threads */
+ CloseHandle((HANDLE) slot->hThread);
+ slot->hThread = (uintptr_t) INVALID_HANDLE_VALUE;
+ break;
+ }
+ }
+#endif /* WIN32 */
+
+ /* On all platforms, update workerStatus and te[] as well */
+ Assert(j < pstate->numWorkers);
+ slot->workerStatus = WRKR_TERMINATED;
+ pstate->te[j] = NULL;
+ }
+}
+
+
+/*
+ * Code for responding to cancel interrupts (SIGINT, control-C, etc)
+ *
+ * This doesn't quite belong in this module, but it needs access to the
+ * ParallelState data, so there's not really a better place either.
+ *
+ * When we get a cancel interrupt, we could just die, but in pg_restore that
+ * could leave a SQL command (e.g., CREATE INDEX on a large table) running
+ * for a long time. Instead, we try to send a cancel request and then die.
+ * pg_dump probably doesn't really need this, but we might as well use it
+ * there too. Note that sending the cancel directly from the signal handler
+ * is safe because PQcancel() is written to make it so.
+ *
+ * In parallel operation on Unix, each process is responsible for canceling
+ * its own connection (this must be so because nobody else has access to it).
+ * Furthermore, the leader process should attempt to forward its signal to
+ * each child. In simple manual use of pg_dump/pg_restore, forwarding isn't
+ * needed because typing control-C at the console would deliver SIGINT to
+ * every member of the terminal process group --- but in other scenarios it
+ * might be that only the leader gets signaled.
+ *
+ * On Windows, the cancel handler runs in a separate thread, because that's
+ * how SetConsoleCtrlHandler works. We make it stop worker threads, send
+ * cancels on all active connections, and then return FALSE, which will allow
+ * the process to die. For safety's sake, we use a critical section to
+ * protect the PGcancel structures against being changed while the signal
+ * thread runs.
+ */
+
+#ifndef WIN32
+
+/*
+ * Signal handler (Unix only)
+ */
+static void
+sigTermHandler(SIGNAL_ARGS)
+{
+ int i;
+ char errbuf[1];
+
+ /*
+ * Some platforms allow delivery of new signals to interrupt an active
+ * signal handler. That could muck up our attempt to send PQcancel, so
+ * disable the signals that setup_cancel_handler enabled.
+ */
+ pqsignal(SIGINT, SIG_IGN);
+ pqsignal(SIGTERM, SIG_IGN);
+ pqsignal(SIGQUIT, SIG_IGN);
+
+ /*
+ * If we're in the leader, forward signal to all workers. (It seems best
+ * to do this before PQcancel; killing the leader transaction will result
+ * in invalid-snapshot errors from active workers, which maybe we can
+ * quiet by killing workers first.) Ignore any errors.
+ */
+ if (signal_info.pstate != NULL)
+ {
+ for (i = 0; i < signal_info.pstate->numWorkers; i++)
+ {
+ pid_t pid = signal_info.pstate->parallelSlot[i].pid;
+
+ if (pid != 0)
+ kill(pid, SIGTERM);
+ }
+ }
+
+ /*
+ * Send QueryCancel if we have a connection to send to. Ignore errors,
+ * there's not much we can do about them anyway.
+ */
+ if (signal_info.myAH != NULL && signal_info.myAH->connCancel != NULL)
+ (void) PQcancel(signal_info.myAH->connCancel, errbuf, sizeof(errbuf));
+
+ /*
+ * Report we're quitting, using nothing more complicated than write(2).
+ * When in parallel operation, only the leader process should do this.
+ */
+ if (!signal_info.am_worker)
+ {
+ if (progname)
+ {
+ write_stderr(progname);
+ write_stderr(": ");
+ }
+ write_stderr("terminated by user\n");
+ }
+
+ /*
+ * And die, using _exit() not exit() because the latter will invoke atexit
+ * handlers that can fail if we interrupted related code.
+ */
+ _exit(1);
+}
+
+/*
+ * Enable cancel interrupt handler, if not already done.
+ */
+static void
+setup_cancel_handler(void)
+{
+ /*
+ * When forking, signal_info.handler_set will propagate into the new
+ * process, but that's fine because the signal handler state does too.
+ */
+ if (!signal_info.handler_set)
+ {
+ signal_info.handler_set = true;
+
+ pqsignal(SIGINT, sigTermHandler);
+ pqsignal(SIGTERM, sigTermHandler);
+ pqsignal(SIGQUIT, sigTermHandler);
+ }
+}
+
+#else /* WIN32 */
+
+/*
+ * Console interrupt handler --- runs in a newly-started thread.
+ *
+ * After stopping other threads and sending cancel requests on all open
+ * connections, we return FALSE which will allow the default ExitProcess()
+ * action to be taken.
+ */
+static BOOL WINAPI
+consoleHandler(DWORD dwCtrlType)
+{
+ int i;
+ char errbuf[1];
+
+ if (dwCtrlType == CTRL_C_EVENT ||
+ dwCtrlType == CTRL_BREAK_EVENT)
+ {
+ /* Critical section prevents changing data we look at here */
+ EnterCriticalSection(&signal_info_lock);
+
+ /*
+ * If in parallel mode, stop worker threads and send QueryCancel to
+ * their connected backends. The main point of stopping the worker
+ * threads is to keep them from reporting the query cancels as errors,
+ * which would clutter the user's screen. We needn't stop the leader
+ * thread since it won't be doing much anyway. Do this before
+ * canceling the main transaction, else we might get invalid-snapshot
+ * errors reported before we can stop the workers. Ignore errors,
+ * there's not much we can do about them anyway.
+ */
+ if (signal_info.pstate != NULL)
+ {
+ for (i = 0; i < signal_info.pstate->numWorkers; i++)
+ {
+ ParallelSlot *slot = &(signal_info.pstate->parallelSlot[i]);
+ ArchiveHandle *AH = slot->AH;
+ HANDLE hThread = (HANDLE) slot->hThread;
+
+ /*
+ * Using TerminateThread here may leave some resources leaked,
+ * but it doesn't matter since we're about to end the whole
+ * process.
+ */
+ if (hThread != INVALID_HANDLE_VALUE)
+ TerminateThread(hThread, 0);
+
+ if (AH != NULL && AH->connCancel != NULL)
+ (void) PQcancel(AH->connCancel, errbuf, sizeof(errbuf));
+ }
+ }
+
+ /*
+ * Send QueryCancel to leader connection, if enabled. Ignore errors,
+ * there's not much we can do about them anyway.
+ */
+ if (signal_info.myAH != NULL && signal_info.myAH->connCancel != NULL)
+ (void) PQcancel(signal_info.myAH->connCancel,
+ errbuf, sizeof(errbuf));
+
+ LeaveCriticalSection(&signal_info_lock);
+
+ /*
+ * Report we're quitting, using nothing more complicated than
+ * write(2). (We might be able to get away with using pg_log_*()
+ * here, but since we terminated other threads uncleanly above, it
+ * seems better to assume as little as possible.)
+ */
+ if (progname)
+ {
+ write_stderr(progname);
+ write_stderr(": ");
+ }
+ write_stderr("terminated by user\n");
+ }
+
+ /* Always return FALSE to allow signal handling to continue */
+ return FALSE;
+}
+
+/*
+ * Enable cancel interrupt handler, if not already done.
+ */
+static void
+setup_cancel_handler(void)
+{
+ if (!signal_info.handler_set)
+ {
+ signal_info.handler_set = true;
+
+ InitializeCriticalSection(&signal_info_lock);
+
+ SetConsoleCtrlHandler(consoleHandler, TRUE);
+ }
+}
+
+#endif /* WIN32 */
+
+
+/*
+ * set_archive_cancel_info
+ *
+ * Fill AH->connCancel with cancellation info for the specified database
+ * connection; or clear it if conn is NULL.
+ */
+void
+set_archive_cancel_info(ArchiveHandle *AH, PGconn *conn)
+{
+ PGcancel *oldConnCancel;
+
+ /*
+ * Activate the interrupt handler if we didn't yet in this process. On
+ * Windows, this also initializes signal_info_lock; therefore it's
+ * important that this happen at least once before we fork off any
+ * threads.
+ */
+ setup_cancel_handler();
+
+ /*
+ * On Unix, we assume that storing a pointer value is atomic with respect
+ * to any possible signal interrupt. On Windows, use a critical section.
+ */
+
+#ifdef WIN32
+ EnterCriticalSection(&signal_info_lock);
+#endif
+
+ /* Free the old one if we have one */
+ oldConnCancel = AH->connCancel;
+ /* be sure interrupt handler doesn't use pointer while freeing */
+ AH->connCancel = NULL;
+
+ if (oldConnCancel != NULL)
+ PQfreeCancel(oldConnCancel);
+
+ /* Set the new one if specified */
+ if (conn)
+ AH->connCancel = PQgetCancel(conn);
+
+ /*
+ * On Unix, there's only ever one active ArchiveHandle per process, so we
+ * can just set signal_info.myAH unconditionally. On Windows, do that
+ * only in the main thread; worker threads have to make sure their
+ * ArchiveHandle appears in the pstate data, which is dealt with in
+ * RunWorker().
+ */
+#ifndef WIN32
+ signal_info.myAH = AH;
+#else
+ if (mainThreadId == GetCurrentThreadId())
+ signal_info.myAH = AH;
+#endif
+
+#ifdef WIN32
+ LeaveCriticalSection(&signal_info_lock);
+#endif
+}
+
+/*
+ * set_cancel_pstate
+ *
+ * Set signal_info.pstate to point to the specified ParallelState, if any.
+ * We need this mainly to have an interlock against Windows signal thread.
+ */
+static void
+set_cancel_pstate(ParallelState *pstate)
+{
+#ifdef WIN32
+ EnterCriticalSection(&signal_info_lock);
+#endif
+
+ signal_info.pstate = pstate;
+
+#ifdef WIN32
+ LeaveCriticalSection(&signal_info_lock);
+#endif
+}
+
+/*
+ * set_cancel_slot_archive
+ *
+ * Set ParallelSlot's AH field to point to the specified archive, if any.
+ * We need this mainly to have an interlock against Windows signal thread.
+ */
+static void
+set_cancel_slot_archive(ParallelSlot *slot, ArchiveHandle *AH)
+{
+#ifdef WIN32
+ EnterCriticalSection(&signal_info_lock);
+#endif
+
+ slot->AH = AH;
+
+#ifdef WIN32
+ LeaveCriticalSection(&signal_info_lock);
+#endif
+}
+
+
+/*
+ * This function is called by both Unix and Windows variants to set up
+ * and run a worker process. Caller should exit the process (or thread)
+ * upon return.
+ */
+static void
+RunWorker(ArchiveHandle *AH, ParallelSlot *slot)
+{
+ int pipefd[2];
+
+ /* fetch child ends of pipes */
+ pipefd[PIPE_READ] = slot->pipeRevRead;
+ pipefd[PIPE_WRITE] = slot->pipeRevWrite;
+
+ /*
+ * Clone the archive so that we have our own state to work with, and in
+ * particular our own database connection.
+ *
+ * We clone on Unix as well as Windows, even though technically we don't
+ * need to because fork() gives us a copy in our own address space
+ * already. But CloneArchive resets the state information and also clones
+ * the database connection which both seem kinda helpful.
+ */
+ AH = CloneArchive(AH);
+
+ /* Remember cloned archive where signal handler can find it */
+ set_cancel_slot_archive(slot, AH);
+
+ /*
+ * Call the setup worker function that's defined in the ArchiveHandle.
+ */
+ (AH->SetupWorkerPtr) ((Archive *) AH);
+
+ /*
+ * Execute commands until done.
+ */
+ WaitForCommands(AH, pipefd);
+
+ /*
+ * Disconnect from database and clean up.
+ */
+ set_cancel_slot_archive(slot, NULL);
+ DisconnectDatabase(&(AH->public));
+ DeCloneArchive(AH);
+}
+
+/*
+ * Thread base function for Windows
+ */
+#ifdef WIN32
+static unsigned __stdcall
+init_spawned_worker_win32(WorkerInfo *wi)
+{
+ ArchiveHandle *AH = wi->AH;
+ ParallelSlot *slot = wi->slot;
+
+ /* Don't need WorkerInfo anymore */
+ free(wi);
+
+ /* Run the worker ... */
+ RunWorker(AH, slot);
+
+ /* Exit the thread */
+ _endthreadex(0);
+ return 0;
+}
+#endif /* WIN32 */
+
+/*
+ * This function starts a parallel dump or restore by spawning off the worker
+ * processes. For Windows, it creates a number of threads; on Unix the
+ * workers are created with fork().
+ */
+ParallelState *
+ParallelBackupStart(ArchiveHandle *AH)
+{
+ ParallelState *pstate;
+ int i;
+
+ Assert(AH->public.numWorkers > 0);
+
+ pstate = (ParallelState *) pg_malloc(sizeof(ParallelState));
+
+ pstate->numWorkers = AH->public.numWorkers;
+ pstate->te = NULL;
+ pstate->parallelSlot = NULL;
+
+ if (AH->public.numWorkers == 1)
+ return pstate;
+
+ /* Create status arrays, being sure to initialize all fields to 0 */
+ pstate->te = (TocEntry **)
+ pg_malloc0(pstate->numWorkers * sizeof(TocEntry *));
+ pstate->parallelSlot = (ParallelSlot *)
+ pg_malloc0(pstate->numWorkers * sizeof(ParallelSlot));
+
+#ifdef WIN32
+ /* Make fmtId() and fmtQualifiedId() use thread-local storage */
+ getLocalPQExpBuffer = getThreadLocalPQExpBuffer;
+#endif
+
+ /*
+ * Set the pstate in shutdown_info, to tell the exit handler that it must
+ * clean up workers as well as the main database connection. But we don't
+ * set this in signal_info yet, because we don't want child processes to
+ * inherit non-NULL signal_info.pstate.
+ */
+ shutdown_info.pstate = pstate;
+
+ /*
+ * Temporarily disable query cancellation on the leader connection. This
+ * ensures that child processes won't inherit valid AH->connCancel
+ * settings and thus won't try to issue cancels against the leader's
+ * connection. No harm is done if we fail while it's disabled, because
+ * the leader connection is idle at this point anyway.
+ */
+ set_archive_cancel_info(AH, NULL);
+
+ /* Ensure stdio state is quiesced before forking */
+ fflush(NULL);
+
+ /* Create desired number of workers */
+ for (i = 0; i < pstate->numWorkers; i++)
+ {
+#ifdef WIN32
+ WorkerInfo *wi;
+ uintptr_t handle;
+#else
+ pid_t pid;
+#endif
+ ParallelSlot *slot = &(pstate->parallelSlot[i]);
+ int pipeMW[2],
+ pipeWM[2];
+
+ /* Create communication pipes for this worker */
+ if (pgpipe(pipeMW) < 0 || pgpipe(pipeWM) < 0)
+ pg_fatal("could not create communication channels: %m");
+
+ /* leader's ends of the pipes */
+ slot->pipeRead = pipeWM[PIPE_READ];
+ slot->pipeWrite = pipeMW[PIPE_WRITE];
+ /* child's ends of the pipes */
+ slot->pipeRevRead = pipeMW[PIPE_READ];
+ slot->pipeRevWrite = pipeWM[PIPE_WRITE];
+
+#ifdef WIN32
+ /* Create transient structure to pass args to worker function */
+ wi = (WorkerInfo *) pg_malloc(sizeof(WorkerInfo));
+
+ wi->AH = AH;
+ wi->slot = slot;
+
+ handle = _beginthreadex(NULL, 0, (void *) &init_spawned_worker_win32,
+ wi, 0, &(slot->threadId));
+ slot->hThread = handle;
+ slot->workerStatus = WRKR_IDLE;
+#else /* !WIN32 */
+ pid = fork();
+ if (pid == 0)
+ {
+ /* we are the worker */
+ int j;
+
+ /* this is needed for GetMyPSlot() */
+ slot->pid = getpid();
+
+ /* instruct signal handler that we're in a worker now */
+ signal_info.am_worker = true;
+
+ /* close read end of Worker -> Leader */
+ closesocket(pipeWM[PIPE_READ]);
+ /* close write end of Leader -> Worker */
+ closesocket(pipeMW[PIPE_WRITE]);
+
+ /*
+ * Close all inherited fds for communication of the leader with
+ * previously-forked workers.
+ */
+ for (j = 0; j < i; j++)
+ {
+ closesocket(pstate->parallelSlot[j].pipeRead);
+ closesocket(pstate->parallelSlot[j].pipeWrite);
+ }
+
+ /* Run the worker ... */
+ RunWorker(AH, slot);
+
+ /* We can just exit(0) when done */
+ exit(0);
+ }
+ else if (pid < 0)
+ {
+ /* fork failed */
+ pg_fatal("could not create worker process: %m");
+ }
+
+ /* In Leader after successful fork */
+ slot->pid = pid;
+ slot->workerStatus = WRKR_IDLE;
+
+ /* close read end of Leader -> Worker */
+ closesocket(pipeMW[PIPE_READ]);
+ /* close write end of Worker -> Leader */
+ closesocket(pipeWM[PIPE_WRITE]);
+#endif /* WIN32 */
+ }
+
+ /*
+ * Having forked off the workers, disable SIGPIPE so that leader isn't
+ * killed if it tries to send a command to a dead worker. We don't want
+ * the workers to inherit this setting, though.
+ */
+#ifndef WIN32
+ pqsignal(SIGPIPE, SIG_IGN);
+#endif
+
+ /*
+ * Re-establish query cancellation on the leader connection.
+ */
+ set_archive_cancel_info(AH, AH->connection);
+
+ /*
+ * Tell the cancel signal handler to forward signals to worker processes,
+ * too. (As with query cancel, we did not need this earlier because the
+ * workers have not yet been given anything to do; if we die before this
+ * point, any already-started workers will see EOF and quit promptly.)
+ */
+ set_cancel_pstate(pstate);
+
+ return pstate;
+}
+
+/*
+ * Close down a parallel dump or restore.
+ */
+void
+ParallelBackupEnd(ArchiveHandle *AH, ParallelState *pstate)
+{
+ int i;
+
+ /* No work if non-parallel */
+ if (pstate->numWorkers == 1)
+ return;
+
+ /* There should not be any unfinished jobs */
+ Assert(IsEveryWorkerIdle(pstate));
+
+ /* Close the sockets so that the workers know they can exit */
+ for (i = 0; i < pstate->numWorkers; i++)
+ {
+ closesocket(pstate->parallelSlot[i].pipeRead);
+ closesocket(pstate->parallelSlot[i].pipeWrite);
+ }
+
+ /* Wait for them to exit */
+ WaitForTerminatingWorkers(pstate);
+
+ /*
+ * Unlink pstate from shutdown_info, so the exit handler will not try to
+ * use it; and likewise unlink from signal_info.
+ */
+ shutdown_info.pstate = NULL;
+ set_cancel_pstate(NULL);
+
+ /* Release state (mere neatnik-ism, since we're about to terminate) */
+ free(pstate->te);
+ free(pstate->parallelSlot);
+ free(pstate);
+}
+
+/*
+ * These next four functions handle construction and parsing of the command
+ * strings and response strings for parallel workers.
+ *
+ * Currently, these can be the same regardless of which archive format we are
+ * processing. In future, we might want to let format modules override these
+ * functions to add format-specific data to a command or response.
+ */
+
+/*
+ * buildWorkerCommand: format a command string to send to a worker.
+ *
+ * The string is built in the caller-supplied buffer of size buflen.
+ */
+static void
+buildWorkerCommand(ArchiveHandle *AH, TocEntry *te, T_Action act,
+ char *buf, int buflen)
+{
+ if (act == ACT_DUMP)
+ snprintf(buf, buflen, "DUMP %d", te->dumpId);
+ else if (act == ACT_RESTORE)
+ snprintf(buf, buflen, "RESTORE %d", te->dumpId);
+ else
+ Assert(false);
+}
+
+/*
+ * parseWorkerCommand: interpret a command string in a worker.
+ */
+static void
+parseWorkerCommand(ArchiveHandle *AH, TocEntry **te, T_Action *act,
+ const char *msg)
+{
+ DumpId dumpId;
+ int nBytes;
+
+ if (messageStartsWith(msg, "DUMP "))
+ {
+ *act = ACT_DUMP;
+ sscanf(msg, "DUMP %d%n", &dumpId, &nBytes);
+ Assert(nBytes == strlen(msg));
+ *te = getTocEntryByDumpId(AH, dumpId);
+ Assert(*te != NULL);
+ }
+ else if (messageStartsWith(msg, "RESTORE "))
+ {
+ *act = ACT_RESTORE;
+ sscanf(msg, "RESTORE %d%n", &dumpId, &nBytes);
+ Assert(nBytes == strlen(msg));
+ *te = getTocEntryByDumpId(AH, dumpId);
+ Assert(*te != NULL);
+ }
+ else
+ pg_fatal("unrecognized command received from leader: \"%s\"",
+ msg);
+}
+
+/*
+ * buildWorkerResponse: format a response string to send to the leader.
+ *
+ * The string is built in the caller-supplied buffer of size buflen.
+ */
+static void
+buildWorkerResponse(ArchiveHandle *AH, TocEntry *te, T_Action act, int status,
+ char *buf, int buflen)
+{
+ snprintf(buf, buflen, "OK %d %d %d",
+ te->dumpId,
+ status,
+ status == WORKER_IGNORED_ERRORS ? AH->public.n_errors : 0);
+}
+
+/*
+ * parseWorkerResponse: parse the status message returned by a worker.
+ *
+ * Returns the integer status code, and may update fields of AH and/or te.
+ */
+static int
+parseWorkerResponse(ArchiveHandle *AH, TocEntry *te,
+ const char *msg)
+{
+ DumpId dumpId;
+ int nBytes,
+ n_errors;
+ int status = 0;
+
+ if (messageStartsWith(msg, "OK "))
+ {
+ sscanf(msg, "OK %d %d %d%n", &dumpId, &status, &n_errors, &nBytes);
+
+ Assert(dumpId == te->dumpId);
+ Assert(nBytes == strlen(msg));
+
+ AH->public.n_errors += n_errors;
+ }
+ else
+ pg_fatal("invalid message received from worker: \"%s\"",
+ msg);
+
+ return status;
+}
+
+/*
+ * Dispatch a job to some free worker.
+ *
+ * te is the TocEntry to be processed, act is the action to be taken on it.
+ * callback is the function to call on completion of the job.
+ *
+ * If no worker is currently available, this will block, and previously
+ * registered callback functions may be called.
+ */
+void
+DispatchJobForTocEntry(ArchiveHandle *AH,
+ ParallelState *pstate,
+ TocEntry *te,
+ T_Action act,
+ ParallelCompletionPtr callback,
+ void *callback_data)
+{
+ int worker;
+ char buf[256];
+
+ /* Get a worker, waiting if none are idle */
+ while ((worker = GetIdleWorker(pstate)) == NO_SLOT)
+ WaitForWorkers(AH, pstate, WFW_ONE_IDLE);
+
+ /* Construct and send command string */
+ buildWorkerCommand(AH, te, act, buf, sizeof(buf));
+
+ sendMessageToWorker(pstate, worker, buf);
+
+ /* Remember worker is busy, and which TocEntry it's working on */
+ pstate->parallelSlot[worker].workerStatus = WRKR_WORKING;
+ pstate->parallelSlot[worker].callback = callback;
+ pstate->parallelSlot[worker].callback_data = callback_data;
+ pstate->te[worker] = te;
+}
+
+/*
+ * Find an idle worker and return its slot number.
+ * Return NO_SLOT if none are idle.
+ */
+static int
+GetIdleWorker(ParallelState *pstate)
+{
+ int i;
+
+ for (i = 0; i < pstate->numWorkers; i++)
+ {
+ if (pstate->parallelSlot[i].workerStatus == WRKR_IDLE)
+ return i;
+ }
+ return NO_SLOT;
+}
+
+/*
+ * Return true iff no worker is running.
+ */
+static bool
+HasEveryWorkerTerminated(ParallelState *pstate)
+{
+ int i;
+
+ for (i = 0; i < pstate->numWorkers; i++)
+ {
+ if (WORKER_IS_RUNNING(pstate->parallelSlot[i].workerStatus))
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Return true iff every worker is in the WRKR_IDLE state.
+ */
+bool
+IsEveryWorkerIdle(ParallelState *pstate)
+{
+ int i;
+
+ for (i = 0; i < pstate->numWorkers; i++)
+ {
+ if (pstate->parallelSlot[i].workerStatus != WRKR_IDLE)
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Acquire lock on a table to be dumped by a worker process.
+ *
+ * The leader process is already holding an ACCESS SHARE lock. Ordinarily
+ * it's no problem for a worker to get one too, but if anything else besides
+ * pg_dump is running, there's a possible deadlock:
+ *
+ * 1) Leader dumps the schema and locks all tables in ACCESS SHARE mode.
+ * 2) Another process requests an ACCESS EXCLUSIVE lock (which is not granted
+ * because the leader holds a conflicting ACCESS SHARE lock).
+ * 3) A worker process also requests an ACCESS SHARE lock to read the table.
+ * The worker is enqueued behind the ACCESS EXCLUSIVE lock request.
+ * 4) Now we have a deadlock, since the leader is effectively waiting for
+ * the worker. The server cannot detect that, however.
+ *
+ * To prevent an infinite wait, prior to touching a table in a worker, request
+ * a lock in ACCESS SHARE mode but with NOWAIT. If we don't get the lock,
+ * then we know that somebody else has requested an ACCESS EXCLUSIVE lock and
+ * so we have a deadlock. We must fail the backup in that case.
+ */
+static void
+lockTableForWorker(ArchiveHandle *AH, TocEntry *te)
+{
+ const char *qualId;
+ PQExpBuffer query;
+ PGresult *res;
+
+ /* Nothing to do for BLOBS */
+ if (strcmp(te->desc, "BLOBS") == 0)
+ return;
+
+ query = createPQExpBuffer();
+
+ qualId = fmtQualifiedId(te->namespace, te->tag);
+
+ appendPQExpBuffer(query, "LOCK TABLE %s IN ACCESS SHARE MODE NOWAIT",
+ qualId);
+
+ res = PQexec(AH->connection, query->data);
+
+ if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
+ pg_fatal("could not obtain lock on relation \"%s\"\n"
+ "This usually means that someone requested an ACCESS EXCLUSIVE lock "
+ "on the table after the pg_dump parent process had gotten the "
+ "initial ACCESS SHARE lock on the table.", qualId);
+
+ PQclear(res);
+ destroyPQExpBuffer(query);
+}
+
+/*
+ * WaitForCommands: main routine for a worker process.
+ *
+ * Read and execute commands from the leader until we see EOF on the pipe.
+ */
+static void
+WaitForCommands(ArchiveHandle *AH, int pipefd[2])
+{
+ char *command;
+ TocEntry *te;
+ T_Action act;
+ int status = 0;
+ char buf[256];
+
+ for (;;)
+ {
+ if (!(command = getMessageFromLeader(pipefd)))
+ {
+ /* EOF, so done */
+ return;
+ }
+
+ /* Decode the command */
+ parseWorkerCommand(AH, &te, &act, command);
+
+ if (act == ACT_DUMP)
+ {
+ /* Acquire lock on this table within the worker's session */
+ lockTableForWorker(AH, te);
+
+ /* Perform the dump command */
+ status = (AH->WorkerJobDumpPtr) (AH, te);
+ }
+ else if (act == ACT_RESTORE)
+ {
+ /* Perform the restore command */
+ status = (AH->WorkerJobRestorePtr) (AH, te);
+ }
+ else
+ Assert(false);
+
+ /* Return status to leader */
+ buildWorkerResponse(AH, te, act, status, buf, sizeof(buf));
+
+ sendMessageToLeader(pipefd, buf);
+
+ /* command was pg_malloc'd and we are responsible for free()ing it. */
+ free(command);
+ }
+}
+
+/*
+ * Check for status messages from workers.
+ *
+ * If do_wait is true, wait to get a status message; otherwise, just return
+ * immediately if there is none available.
+ *
+ * When we get a status message, we pass the status code to the callback
+ * function that was specified to DispatchJobForTocEntry, then reset the
+ * worker status to IDLE.
+ *
+ * Returns true if we collected a status message, else false.
+ *
+ * XXX is it worth checking for more than one status message per call?
+ * It seems somewhat unlikely that multiple workers would finish at exactly
+ * the same time.
+ */
+static bool
+ListenToWorkers(ArchiveHandle *AH, ParallelState *pstate, bool do_wait)
+{
+ int worker;
+ char *msg;
+
+ /* Try to collect a status message */
+ msg = getMessageFromWorker(pstate, do_wait, &worker);
+
+ if (!msg)
+ {
+ /* If do_wait is true, we must have detected EOF on some socket */
+ if (do_wait)
+ pg_fatal("a worker process died unexpectedly");
+ return false;
+ }
+
+ /* Process it and update our idea of the worker's status */
+ if (messageStartsWith(msg, "OK "))
+ {
+ ParallelSlot *slot = &pstate->parallelSlot[worker];
+ TocEntry *te = pstate->te[worker];
+ int status;
+
+ status = parseWorkerResponse(AH, te, msg);
+ slot->callback(AH, te, status, slot->callback_data);
+ slot->workerStatus = WRKR_IDLE;
+ pstate->te[worker] = NULL;
+ }
+ else
+ pg_fatal("invalid message received from worker: \"%s\"",
+ msg);
+
+ /* Free the string returned from getMessageFromWorker */
+ free(msg);
+
+ return true;
+}
+
+/*
+ * Check for status results from workers, waiting if necessary.
+ *
+ * Available wait modes are:
+ * WFW_NO_WAIT: reap any available status, but don't block
+ * WFW_GOT_STATUS: wait for at least one more worker to finish
+ * WFW_ONE_IDLE: wait for at least one worker to be idle
+ * WFW_ALL_IDLE: wait for all workers to be idle
+ *
+ * Any received results are passed to the callback specified to
+ * DispatchJobForTocEntry.
+ *
+ * This function is executed in the leader process.
+ */
+void
+WaitForWorkers(ArchiveHandle *AH, ParallelState *pstate, WFW_WaitOption mode)
+{
+ bool do_wait = false;
+
+ /*
+ * In GOT_STATUS mode, always block waiting for a message, since we can't
+ * return till we get something. In other modes, we don't block the first
+ * time through the loop.
+ */
+ if (mode == WFW_GOT_STATUS)
+ {
+ /* Assert that caller knows what it's doing */
+ Assert(!IsEveryWorkerIdle(pstate));
+ do_wait = true;
+ }
+
+ for (;;)
+ {
+ /*
+ * Check for status messages, even if we don't need to block. We do
+ * not try very hard to reap all available messages, though, since
+ * there's unlikely to be more than one.
+ */
+ if (ListenToWorkers(AH, pstate, do_wait))
+ {
+ /*
+ * If we got a message, we are done by definition for GOT_STATUS
+ * mode, and we can also be certain that there's at least one idle
+ * worker. So we're done in all but ALL_IDLE mode.
+ */
+ if (mode != WFW_ALL_IDLE)
+ return;
+ }
+
+ /* Check whether we must wait for new status messages */
+ switch (mode)
+ {
+ case WFW_NO_WAIT:
+ return; /* never wait */
+ case WFW_GOT_STATUS:
+ Assert(false); /* can't get here, because we waited */
+ break;
+ case WFW_ONE_IDLE:
+ if (GetIdleWorker(pstate) != NO_SLOT)
+ return;
+ break;
+ case WFW_ALL_IDLE:
+ if (IsEveryWorkerIdle(pstate))
+ return;
+ break;
+ }
+
+ /* Loop back, and this time wait for something to happen */
+ do_wait = true;
+ }
+}
+
+/*
+ * Read one command message from the leader, blocking if necessary
+ * until one is available, and return it as a malloc'd string.
+ * On EOF, return NULL.
+ *
+ * This function is executed in worker processes.
+ */
+static char *
+getMessageFromLeader(int pipefd[2])
+{
+ return readMessageFromPipe(pipefd[PIPE_READ]);
+}
+
+/*
+ * Send a status message to the leader.
+ *
+ * This function is executed in worker processes.
+ */
+static void
+sendMessageToLeader(int pipefd[2], const char *str)
+{
+ int len = strlen(str) + 1;
+
+ if (pipewrite(pipefd[PIPE_WRITE], str, len) != len)
+ pg_fatal("could not write to the communication channel: %m");
+}
+
+/*
+ * Wait until some descriptor in "workerset" becomes readable.
+ * Returns -1 on error, else the number of readable descriptors.
+ */
+static int
+select_loop(int maxFd, fd_set *workerset)
+{
+ int i;
+ fd_set saveSet = *workerset;
+
+ for (;;)
+ {
+ *workerset = saveSet;
+ i = select(maxFd + 1, workerset, NULL, NULL, NULL);
+
+#ifndef WIN32
+ if (i < 0 && errno == EINTR)
+ continue;
+#else
+ if (i == SOCKET_ERROR && WSAGetLastError() == WSAEINTR)
+ continue;
+#endif
+ break;
+ }
+
+ return i;
+}
+
+
+/*
+ * Check for messages from worker processes.
+ *
+ * If a message is available, return it as a malloc'd string, and put the
+ * index of the sending worker in *worker.
+ *
+ * If nothing is available, wait if "do_wait" is true, else return NULL.
+ *
+ * If we detect EOF on any socket, we'll return NULL. It's not great that
+ * that's hard to distinguish from the no-data-available case, but for now
+ * our one caller is okay with that.
+ *
+ * This function is executed in the leader process.
+ */
+static char *
+getMessageFromWorker(ParallelState *pstate, bool do_wait, int *worker)
+{
+ int i;
+ fd_set workerset;
+ int maxFd = -1;
+ struct timeval nowait = {0, 0};
+
+ /* construct bitmap of socket descriptors for select() */
+ FD_ZERO(&workerset);
+ for (i = 0; i < pstate->numWorkers; i++)
+ {
+ if (!WORKER_IS_RUNNING(pstate->parallelSlot[i].workerStatus))
+ continue;
+ FD_SET(pstate->parallelSlot[i].pipeRead, &workerset);
+ if (pstate->parallelSlot[i].pipeRead > maxFd)
+ maxFd = pstate->parallelSlot[i].pipeRead;
+ }
+
+ if (do_wait)
+ {
+ i = select_loop(maxFd, &workerset);
+ Assert(i != 0);
+ }
+ else
+ {
+ if ((i = select(maxFd + 1, &workerset, NULL, NULL, &nowait)) == 0)
+ return NULL;
+ }
+
+ if (i < 0)
+ pg_fatal("%s() failed: %m", "select");
+
+ for (i = 0; i < pstate->numWorkers; i++)
+ {
+ char *msg;
+
+ if (!WORKER_IS_RUNNING(pstate->parallelSlot[i].workerStatus))
+ continue;
+ if (!FD_ISSET(pstate->parallelSlot[i].pipeRead, &workerset))
+ continue;
+
+ /*
+ * Read the message if any. If the socket is ready because of EOF,
+ * we'll return NULL instead (and the socket will stay ready, so the
+ * condition will persist).
+ *
+ * Note: because this is a blocking read, we'll wait if only part of
+ * the message is available. Waiting a long time would be bad, but
+ * since worker status messages are short and are always sent in one
+ * operation, it shouldn't be a problem in practice.
+ */
+ msg = readMessageFromPipe(pstate->parallelSlot[i].pipeRead);
+ *worker = i;
+ return msg;
+ }
+ Assert(false);
+ return NULL;
+}
+
+/*
+ * Send a command message to the specified worker process.
+ *
+ * This function is executed in the leader process.
+ */
+static void
+sendMessageToWorker(ParallelState *pstate, int worker, const char *str)
+{
+ int len = strlen(str) + 1;
+
+ if (pipewrite(pstate->parallelSlot[worker].pipeWrite, str, len) != len)
+ {
+ pg_fatal("could not write to the communication channel: %m");
+ }
+}
+
+/*
+ * Read one message from the specified pipe (fd), blocking if necessary
+ * until one is available, and return it as a malloc'd string.
+ * On EOF, return NULL.
+ *
+ * A "message" on the channel is just a null-terminated string.
+ */
+static char *
+readMessageFromPipe(int fd)
+{
+ char *msg;
+ int msgsize,
+ bufsize;
+ int ret;
+
+ /*
+ * In theory, if we let piperead() read multiple bytes, it might give us
+ * back fragments of multiple messages. (That can't actually occur, since
+ * neither leader nor workers send more than one message without waiting
+ * for a reply, but we don't wish to assume that here.) For simplicity,
+ * read a byte at a time until we get the terminating '\0'. This method
+ * is a bit inefficient, but since this is only used for relatively short
+ * command and status strings, it shouldn't matter.
+ */
+ bufsize = 64; /* could be any number */
+ msg = (char *) pg_malloc(bufsize);
+ msgsize = 0;
+ for (;;)
+ {
+ Assert(msgsize < bufsize);
+ ret = piperead(fd, msg + msgsize, 1);
+ if (ret <= 0)
+ break; /* error or connection closure */
+
+ Assert(ret == 1);
+
+ if (msg[msgsize] == '\0')
+ return msg; /* collected whole message */
+
+ msgsize++;
+ if (msgsize == bufsize) /* enlarge buffer if needed */
+ {
+ bufsize += 16; /* could be any number */
+ msg = (char *) pg_realloc(msg, bufsize);
+ }
+ }
+
+ /* Other end has closed the connection */
+ pg_free(msg);
+ return NULL;
+}
+
+#ifdef WIN32
+
+/*
+ * This is a replacement version of pipe(2) for Windows which allows the pipe
+ * handles to be used in select().
+ *
+ * Reads and writes on the pipe must go through piperead()/pipewrite().
+ *
+ * For consistency with Unix we declare the returned handles as "int".
+ * This is okay even on WIN64 because system handles are not more than
+ * 32 bits wide, but we do have to do some casting.
+ */
+static int
+pgpipe(int handles[2])
+{
+ pgsocket s,
+ tmp_sock;
+ struct sockaddr_in serv_addr;
+ int len = sizeof(serv_addr);
+
+ /* We have to use the Unix socket invalid file descriptor value here. */
+ handles[0] = handles[1] = -1;
+
+ /*
+ * setup listen socket
+ */
+ if ((s = socket(AF_INET, SOCK_STREAM, 0)) == PGINVALID_SOCKET)
+ {
+ pg_log_error("pgpipe: could not create socket: error code %d",
+ WSAGetLastError());
+ return -1;
+ }
+
+ memset(&serv_addr, 0, sizeof(serv_addr));
+ serv_addr.sin_family = AF_INET;
+ serv_addr.sin_port = pg_hton16(0);
+ serv_addr.sin_addr.s_addr = pg_hton32(INADDR_LOOPBACK);
+ if (bind(s, (SOCKADDR *) &serv_addr, len) == SOCKET_ERROR)
+ {
+ pg_log_error("pgpipe: could not bind: error code %d",
+ WSAGetLastError());
+ closesocket(s);
+ return -1;
+ }
+ if (listen(s, 1) == SOCKET_ERROR)
+ {
+ pg_log_error("pgpipe: could not listen: error code %d",
+ WSAGetLastError());
+ closesocket(s);
+ return -1;
+ }
+ if (getsockname(s, (SOCKADDR *) &serv_addr, &len) == SOCKET_ERROR)
+ {
+ pg_log_error("pgpipe: %s() failed: error code %d", "getsockname",
+ WSAGetLastError());
+ closesocket(s);
+ return -1;
+ }
+
+ /*
+ * setup pipe handles
+ */
+ if ((tmp_sock = socket(AF_INET, SOCK_STREAM, 0)) == PGINVALID_SOCKET)
+ {
+ pg_log_error("pgpipe: could not create second socket: error code %d",
+ WSAGetLastError());
+ closesocket(s);
+ return -1;
+ }
+ handles[1] = (int) tmp_sock;
+
+ if (connect(handles[1], (SOCKADDR *) &serv_addr, len) == SOCKET_ERROR)
+ {
+ pg_log_error("pgpipe: could not connect socket: error code %d",
+ WSAGetLastError());
+ closesocket(handles[1]);
+ handles[1] = -1;
+ closesocket(s);
+ return -1;
+ }
+ if ((tmp_sock = accept(s, (SOCKADDR *) &serv_addr, &len)) == PGINVALID_SOCKET)
+ {
+ pg_log_error("pgpipe: could not accept connection: error code %d",
+ WSAGetLastError());
+ closesocket(handles[1]);
+ handles[1] = -1;
+ closesocket(s);
+ return -1;
+ }
+ handles[0] = (int) tmp_sock;
+
+ closesocket(s);
+ return 0;
+}
+
+#endif /* WIN32 */