diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 13:44:03 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-13 13:44:03 +0000 |
commit | 293913568e6a7a86fd1479e1cff8e2ecb58d6568 (patch) | |
tree | fc3b469a3ec5ab71b36ea97cc7aaddb838423a0c /src/backend/postmaster | |
parent | Initial commit. (diff) | |
download | postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.tar.xz postgresql-16-293913568e6a7a86fd1479e1cff8e2ecb58d6568.zip |
Adding upstream version 16.2. (ref: upstream/16.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/postmaster')
-rw-r--r-- | src/backend/postmaster/Makefile | 29 | ||||
-rw-r--r-- | src/backend/postmaster/autovacuum.c | 3492 | ||||
-rw-r--r-- | src/backend/postmaster/auxprocess.c | 183 | ||||
-rw-r--r-- | src/backend/postmaster/bgworker.c | 1311 | ||||
-rw-r--r-- | src/backend/postmaster/bgwriter.c | 346 | ||||
-rw-r--r-- | src/backend/postmaster/checkpointer.c | 1353 | ||||
-rw-r--r-- | src/backend/postmaster/fork_process.c | 126 | ||||
-rw-r--r-- | src/backend/postmaster/interrupt.c | 117 | ||||
-rw-r--r-- | src/backend/postmaster/meson.build | 16 | ||||
-rw-r--r-- | src/backend/postmaster/pgarch.c | 869 | ||||
-rw-r--r-- | src/backend/postmaster/postmaster.c | 6549 | ||||
-rw-r--r-- | src/backend/postmaster/startup.c | 402 | ||||
-rw-r--r-- | src/backend/postmaster/syslogger.c | 1651 | ||||
-rw-r--r-- | src/backend/postmaster/walwriter.c | 300 |
14 files changed, 16744 insertions, 0 deletions
diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile new file mode 100644 index 0000000..047448b --- /dev/null +++ b/src/backend/postmaster/Makefile @@ -0,0 +1,29 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for src/backend/postmaster +# +# IDENTIFICATION +# src/backend/postmaster/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/postmaster +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + autovacuum.o \ + auxprocess.o \ + bgworker.o \ + bgwriter.o \ + checkpointer.o \ + fork_process.o \ + interrupt.o \ + pgarch.o \ + postmaster.o \ + startup.o \ + syslogger.o \ + walwriter.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c new file mode 100644 index 0000000..ae9be9b --- /dev/null +++ b/src/backend/postmaster/autovacuum.c @@ -0,0 +1,3492 @@ +/*------------------------------------------------------------------------- + * + * autovacuum.c + * + * PostgreSQL Integrated Autovacuum Daemon + * + * The autovacuum system is structured in two different kinds of processes: the + * autovacuum launcher and the autovacuum worker. The launcher is an + * always-running process, started by the postmaster when the autovacuum GUC + * parameter is set. The launcher schedules autovacuum workers to be started + * when appropriate. The workers are the processes which execute the actual + * vacuuming; they connect to a database as determined in the launcher, and + * once connected they examine the catalogs to select the tables to vacuum. 
+ * + * The autovacuum launcher cannot start the worker processes by itself, + * because doing so would cause robustness issues (namely, failure to shut + * them down on exceptional conditions, and also, since the launcher is + * connected to shared memory and is thus subject to corruption there, it is + * not as robust as the postmaster). So it leaves that task to the postmaster. + * + * There is an autovacuum shared memory area, where the launcher stores + * information about the database it wants vacuumed. When it wants a new + * worker to start, it sets a flag in shared memory and sends a signal to the + * postmaster. Then postmaster knows nothing more than it must start a worker; + * so it forks a new child, which turns into a worker. This new process + * connects to shared memory, and there it can inspect the information that the + * launcher has set up. + * + * If the fork() call fails in the postmaster, it sets a flag in the shared + * memory area, and sends a signal to the launcher. The launcher, upon + * noticing the flag, can try starting the worker again by resending the + * signal. Note that the failure can only be transient (fork failure due to + * high load, memory pressure, too many processes, etc); more permanent + * problems, like failure to connect to a database, are detected later in the + * worker and dealt with just by having the worker exit normally. The launcher + * will launch a new worker again later, per schedule. + * + * When the worker is done vacuuming it sends SIGUSR2 to the launcher. The + * launcher then wakes up and is able to launch another worker, if the schedule + * is so tight that a new worker is needed immediately. At this time the + * launcher can also balance the settings for the various remaining workers' + * cost-based vacuum delay feature. + * + * Note that there can be more than one worker in a database concurrently. 
+ * They will store the table they are currently vacuuming in shared memory, so + * that other workers avoid being blocked waiting for the vacuum lock for that + * table. They will also fetch the last time the table was vacuumed from + * pgstats just before vacuuming each table, to avoid vacuuming a table that + * was just finished being vacuumed by another worker and thus is no longer + * noted in shared memory. However, there is a small window (due to not yet + * holding the relation lock) during which a worker may choose a table that was + * already vacuumed; this is a bug in the current design. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/postmaster/autovacuum.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <sys/time.h> +#include <unistd.h> + +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/multixact.h" +#include "access/reloptions.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/xact.h" +#include "catalog/dependency.h" +#include "catalog/namespace.h" +#include "catalog/pg_database.h" +#include "commands/dbcommands.h" +#include "commands/vacuum.h" +#include "lib/ilist.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "pgstat.h" +#include "postmaster/autovacuum.h" +#include "postmaster/fork_process.h" +#include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" +#include "storage/bufmgr.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/lmgr.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/sinvaladt.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" +#include "utils/fmgroids.h" +#include "utils/fmgrprotos.h" 
+#include "utils/guc_hooks.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" + + +/* + * GUC parameters + */ +bool autovacuum_start_daemon = false; +int autovacuum_max_workers; +int autovacuum_work_mem = -1; +int autovacuum_naptime; +int autovacuum_vac_thresh; +double autovacuum_vac_scale; +int autovacuum_vac_ins_thresh; +double autovacuum_vac_ins_scale; +int autovacuum_anl_thresh; +double autovacuum_anl_scale; +int autovacuum_freeze_max_age; +int autovacuum_multixact_freeze_max_age; + +double autovacuum_vac_cost_delay; +int autovacuum_vac_cost_limit; + +int Log_autovacuum_min_duration = 600000; + +/* the minimum allowed time between two awakenings of the launcher */ +#define MIN_AUTOVAC_SLEEPTIME 100.0 /* milliseconds */ +#define MAX_AUTOVAC_SLEEPTIME 300 /* seconds */ + +/* Flags to tell if we are in an autovacuum process */ +static bool am_autovacuum_launcher = false; +static bool am_autovacuum_worker = false; + +/* + * Variables to save the cost-related storage parameters for the current + * relation being vacuumed by this autovacuum worker. Using these, we can + * ensure we don't overwrite the values of vacuum_cost_delay and + * vacuum_cost_limit after reloading the configuration file. They are + * initialized to "invalid" values to indicate that no cost-related storage + * parameters were specified and will be set in do_autovacuum() after checking + * the storage parameters in table_recheck_autovac(). 
+ */ +static double av_storage_param_cost_delay = -1; +static int av_storage_param_cost_limit = -1; + +/* Flags set by signal handlers */ +static volatile sig_atomic_t got_SIGUSR2 = false; + +/* Comparison points for determining whether freeze_max_age is exceeded */ +static TransactionId recentXid; +static MultiXactId recentMulti; + +/* Default freeze ages to use for autovacuum (varies by database) */ +static int default_freeze_min_age; +static int default_freeze_table_age; +static int default_multixact_freeze_min_age; +static int default_multixact_freeze_table_age; + +/* Memory context for long-lived data */ +static MemoryContext AutovacMemCxt; + +/* struct to keep track of databases in launcher */ +typedef struct avl_dbase +{ + Oid adl_datid; /* hash key -- must be first */ + TimestampTz adl_next_worker; + int adl_score; + dlist_node adl_node; +} avl_dbase; + +/* struct to keep track of databases in worker */ +typedef struct avw_dbase +{ + Oid adw_datid; + char *adw_name; + TransactionId adw_frozenxid; + MultiXactId adw_minmulti; + PgStat_StatDBEntry *adw_entry; +} avw_dbase; + +/* struct to keep track of tables to vacuum and/or analyze, in 1st pass */ +typedef struct av_relation +{ + Oid ar_toastrelid; /* hash key - must be first */ + Oid ar_relid; + bool ar_hasrelopts; + AutoVacOpts ar_reloptions; /* copy of AutoVacOpts from the main table's + * reloptions, or NULL if none */ +} av_relation; + +/* struct to keep track of tables to vacuum and/or analyze, after rechecking */ +typedef struct autovac_table +{ + Oid at_relid; + VacuumParams at_params; + double at_storage_param_vac_cost_delay; + int at_storage_param_vac_cost_limit; + bool at_dobalance; + bool at_sharedrel; + char *at_relname; + char *at_nspname; + char *at_datname; +} autovac_table; + +/*------------- + * This struct holds information about a single worker's whereabouts. We keep + * an array of these in shared memory, sized according to + * autovacuum_max_workers. 
+ * + * wi_links entry into free list or running list + * wi_dboid OID of the database this worker is supposed to work on + * wi_tableoid OID of the table currently being vacuumed, if any + * wi_sharedrel flag indicating whether table is marked relisshared + * wi_proc pointer to PGPROC of the running worker, NULL if not started + * wi_launchtime Time at which this worker was launched + * wi_dobalance Whether this worker should be included in balance calculations + * + * All fields are protected by AutovacuumLock, except for wi_tableoid and + * wi_sharedrel which are protected by AutovacuumScheduleLock (note these + * two fields are read-only for everyone except that worker itself). + *------------- + */ +typedef struct WorkerInfoData +{ + dlist_node wi_links; + Oid wi_dboid; + Oid wi_tableoid; + PGPROC *wi_proc; + TimestampTz wi_launchtime; + pg_atomic_flag wi_dobalance; + bool wi_sharedrel; +} WorkerInfoData; + +typedef struct WorkerInfoData *WorkerInfo; + +/* + * Possible signals received by the launcher from remote processes. These are + * stored atomically in shared memory so that other processes can set them + * without locking. + */ +typedef enum +{ + AutoVacForkFailed, /* failed trying to start a worker */ + AutoVacRebalance, /* rebalance the cost limits */ + AutoVacNumSignals /* must be last */ +} AutoVacuumSignal; + +/* + * Autovacuum workitem array, stored in AutoVacuumShmem->av_workItems. This + * list is mostly protected by AutovacuumLock, except that if an item is + * marked 'active' other processes must not modify the work-identifying + * members. + */ +typedef struct AutoVacuumWorkItem +{ + AutoVacuumWorkItemType avw_type; + bool avw_used; /* below data is valid */ + bool avw_active; /* being processed */ + Oid avw_database; + Oid avw_relation; + BlockNumber avw_blockNumber; +} AutoVacuumWorkItem; + +#define NUM_WORKITEMS 256 + +/*------------- + * The main autovacuum shmem struct. 
On shared memory we store this main + * struct and the array of WorkerInfo structs. This struct keeps: + * + * av_signal set by other processes to indicate various conditions + * av_launcherpid the PID of the autovacuum launcher + * av_freeWorkers the WorkerInfo freelist + * av_runningWorkers the WorkerInfo non-free queue + * av_startingWorker pointer to WorkerInfo currently being started (cleared by + * the worker itself as soon as it's up and running) + * av_workItems work item array + * av_nworkersForBalance the number of autovacuum workers to use when + * calculating the per worker cost limit + * + * This struct is protected by AutovacuumLock, except for av_signal and parts + * of the worker list (see above). + *------------- + */ +typedef struct +{ + sig_atomic_t av_signal[AutoVacNumSignals]; + pid_t av_launcherpid; + dlist_head av_freeWorkers; + dlist_head av_runningWorkers; + WorkerInfo av_startingWorker; + AutoVacuumWorkItem av_workItems[NUM_WORKITEMS]; + pg_atomic_uint32 av_nworkersForBalance; +} AutoVacuumShmemStruct; + +static AutoVacuumShmemStruct *AutoVacuumShmem; + +/* + * the database list (of avl_dbase elements) in the launcher, and the context + * that contains it + */ +static dlist_head DatabaseList = DLIST_STATIC_INIT(DatabaseList); +static MemoryContext DatabaseListCxt = NULL; + +/* Pointer to my own WorkerInfo, valid on each worker */ +static WorkerInfo MyWorkerInfo = NULL; + +/* PID of launcher, valid only in worker while shutting down */ +int AutovacuumLauncherPid = 0; + +#ifdef EXEC_BACKEND +static pid_t avlauncher_forkexec(void); +static pid_t avworker_forkexec(void); +#endif +NON_EXEC_STATIC void AutoVacWorkerMain(int argc, char *argv[]) pg_attribute_noreturn(); +NON_EXEC_STATIC void AutoVacLauncherMain(int argc, char *argv[]) pg_attribute_noreturn(); + +static Oid do_start_worker(void); +static void HandleAutoVacLauncherInterrupts(void); +static void AutoVacLauncherShutdown(void) pg_attribute_noreturn(); +static void 
launcher_determine_sleep(bool canlaunch, bool recursing, + struct timeval *nap); +static void launch_worker(TimestampTz now); +static List *get_database_list(void); +static void rebuild_database_list(Oid newdb); +static int db_comparator(const void *a, const void *b); +static void autovac_recalculate_workers_for_balance(void); + +static void do_autovacuum(void); +static void FreeWorkerInfo(int code, Datum arg); + +static autovac_table *table_recheck_autovac(Oid relid, HTAB *table_toast_map, + TupleDesc pg_class_desc, + int effective_multixact_freeze_max_age); +static void recheck_relation_needs_vacanalyze(Oid relid, AutoVacOpts *avopts, + Form_pg_class classForm, + int effective_multixact_freeze_max_age, + bool *dovacuum, bool *doanalyze, bool *wraparound); +static void relation_needs_vacanalyze(Oid relid, AutoVacOpts *relopts, + Form_pg_class classForm, + PgStat_StatTabEntry *tabentry, + int effective_multixact_freeze_max_age, + bool *dovacuum, bool *doanalyze, bool *wraparound); + +static void autovacuum_do_vac_analyze(autovac_table *tab, + BufferAccessStrategy bstrategy); +static AutoVacOpts *extract_autovac_opts(HeapTuple tup, + TupleDesc pg_class_desc); +static void perform_work_item(AutoVacuumWorkItem *workitem); +static void autovac_report_activity(autovac_table *tab); +static void autovac_report_workitem(AutoVacuumWorkItem *workitem, + const char *nspname, const char *relname); +static void avl_sigusr2_handler(SIGNAL_ARGS); + + + +/******************************************************************** + * AUTOVACUUM LAUNCHER CODE + ********************************************************************/ + +#ifdef EXEC_BACKEND +/* + * forkexec routine for the autovacuum launcher process. + * + * Format up the arglist, then fork and exec. 
+ */ +static pid_t +avlauncher_forkexec(void) +{ + char *av[10]; + int ac = 0; + + av[ac++] = "postgres"; + av[ac++] = "--forkavlauncher"; + av[ac++] = NULL; /* filled in by postmaster_forkexec */ + av[ac] = NULL; + + Assert(ac < lengthof(av)); + + return postmaster_forkexec(ac, av); +} + +/* + * We need this set from the outside, before InitProcess is called + */ +void +AutovacuumLauncherIAm(void) +{ + am_autovacuum_launcher = true; +} +#endif + +/* + * Main entry point for autovacuum launcher process, to be called from the + * postmaster. + */ +int +StartAutoVacLauncher(void) +{ + pid_t AutoVacPID; + +#ifdef EXEC_BACKEND + switch ((AutoVacPID = avlauncher_forkexec())) +#else + switch ((AutoVacPID = fork_process())) +#endif + { + case -1: + ereport(LOG, + (errmsg("could not fork autovacuum launcher process: %m"))); + return 0; + +#ifndef EXEC_BACKEND + case 0: + /* in postmaster child ... */ + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(false); + + AutoVacLauncherMain(0, NULL); + break; +#endif + default: + return (int) AutoVacPID; + } + + /* shouldn't get here */ + return 0; +} + +/* + * Main loop for the autovacuum launcher process. + */ +NON_EXEC_STATIC void +AutoVacLauncherMain(int argc, char *argv[]) +{ + sigjmp_buf local_sigjmp_buf; + + am_autovacuum_launcher = true; + + MyBackendType = B_AUTOVAC_LAUNCHER; + init_ps_display(NULL); + + ereport(DEBUG1, + (errmsg_internal("autovacuum launcher started"))); + + if (PostAuthDelay) + pg_usleep(PostAuthDelay * 1000000L); + + SetProcessingMode(InitProcessing); + + /* + * Set up signal handlers. We operate on databases much like a regular + * backend, so we use the same signal handling. See equivalent code in + * tcop/postgres.c. 
+ */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGINT, StatementCancelHandler); + pqsignal(SIGTERM, SignalHandlerForShutdownRequest); + /* SIGQUIT handler was already set up by InitPostmasterChild */ + + InitializeTimeouts(); /* establishes SIGALRM handler */ + + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, avl_sigusr2_handler); + pqsignal(SIGFPE, FloatExceptionHandler); + pqsignal(SIGCHLD, SIG_DFL); + + /* + * Create a per-backend PGPROC struct in shared memory, except in the + * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do + * this before we can use LWLocks (and in the EXEC_BACKEND case we already + * had to do some stuff with LWLocks). + */ +#ifndef EXEC_BACKEND + InitProcess(); +#endif + + /* Early initialization */ + BaseInit(); + + InitPostgres(NULL, InvalidOid, NULL, InvalidOid, false, false, NULL); + + SetProcessingMode(NormalProcessing); + + /* + * Create a memory context that we will do all our work in. We do this so + * that we can reset the context during error recovery and thereby avoid + * possible memory leaks. + */ + AutovacMemCxt = AllocSetContextCreate(TopMemoryContext, + "Autovacuum Launcher", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(AutovacMemCxt); + + /* + * If an exception is encountered, processing resumes here. + * + * This code is a stripped down version of PostgresMain error recovery. + * + * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask + * (to wit, BlockSig) will be restored when longjmp'ing to here. Thus, + * signals other than SIGQUIT will be blocked until we complete error + * recovery. It might seem that this policy makes the HOLD_INTERRUPTS() + * call redundant, but it is not since InterruptPending might be set + * already. 
+ */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevents interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* Forget any pending QueryCancel or timeout request */ + disable_all_timeouts(false); + QueryCancelPending = false; /* second to avoid race condition */ + + /* Report the error to the server log */ + EmitErrorReport(); + + /* Abort the current transaction in order to recover */ + AbortCurrentTransaction(); + + /* + * Release any other resources, for the case where we were not in a + * transaction. + */ + LWLockReleaseAll(); + pgstat_report_wait_end(); + UnlockBuffers(); + /* this is probably dead code, but let's be safe: */ + if (AuxProcessResourceOwner) + ReleaseAuxProcessResources(false); + AtEOXact_Buffers(false); + AtEOXact_SMgr(); + AtEOXact_Files(false); + AtEOXact_HashTables(false); + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. + */ + MemoryContextSwitchTo(AutovacMemCxt); + FlushErrorState(); + + /* Flush any leaked data in the top-level context */ + MemoryContextResetAndDeleteChildren(AutovacMemCxt); + + /* don't leave dangling pointers to freed memory */ + DatabaseListCxt = NULL; + dlist_init(&DatabaseList); + + /* Now we can allow interrupts again */ + RESUME_INTERRUPTS(); + + /* if in shutdown mode, no need for anything further; just go away */ + if (ShutdownRequestPending) + AutoVacLauncherShutdown(); + + /* + * Sleep at least 1 second after any error. We don't want to be + * filling the error logs as fast as we can. + */ + pg_usleep(1000000L); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + /* must unblock signals before calling rebuild_database_list */ + sigprocmask(SIG_SETMASK, &UnBlockSig, NULL); + + /* + * Set always-secure search path. Launcher doesn't connect to a database, + * so this has no effect. 
+ */ + SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Force zero_damaged_pages OFF in the autovac process, even if it is set + * in postgresql.conf. We don't really want such a dangerous option being + * applied non-interactively. + */ + SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Force settable timeouts off to avoid letting these settings prevent + * regular maintenance from being executed. + */ + SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE); + SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE); + SetConfigOption("idle_in_transaction_session_timeout", "0", + PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Force default_transaction_isolation to READ COMMITTED. We don't want + * to pay the overhead of serializable mode, nor add any risk of causing + * deadlocks or delaying other transactions. + */ + SetConfigOption("default_transaction_isolation", "read committed", + PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Even when system is configured to use a different fetch consistency, + * for autovac we always want fresh stats. + */ + SetConfigOption("stats_fetch_consistency", "none", PGC_SUSET, PGC_S_OVERRIDE); + + /* + * In emergency mode, just start a worker (unless shutdown was requested) + * and go away. + */ + if (!AutoVacuumingActive()) + { + if (!ShutdownRequestPending) + do_start_worker(); + proc_exit(0); /* done */ + } + + AutoVacuumShmem->av_launcherpid = MyProcPid; + + /* + * Create the initial database list. The invariant we want this list to + * keep is that it's ordered by decreasing next_time. As soon as an entry + * is updated to a higher time, it will be moved to the front (which is + * correct because the only operation is to add autovacuum_naptime to the + * entry, and time always increases). 
+ */ + rebuild_database_list(InvalidOid); + + /* loop until shutdown request */ + while (!ShutdownRequestPending) + { + struct timeval nap; + TimestampTz current_time = 0; + bool can_launch; + + /* + * This loop is a bit different from the normal use of WaitLatch, + * because we'd like to sleep before the first launch of a child + * process. So it's WaitLatch, then ResetLatch, then check for + * wakening conditions. + */ + + launcher_determine_sleep(!dlist_is_empty(&AutoVacuumShmem->av_freeWorkers), + false, &nap); + + /* + * Wait until naptime expires or we get some type of signal (all the + * signal handlers will wake us by calling SetLatch). + */ + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L), + WAIT_EVENT_AUTOVACUUM_MAIN); + + ResetLatch(MyLatch); + + HandleAutoVacLauncherInterrupts(); + + /* + * a worker finished, or postmaster signaled failure to start a worker + */ + if (got_SIGUSR2) + { + got_SIGUSR2 = false; + + /* rebalance cost limits, if needed */ + if (AutoVacuumShmem->av_signal[AutoVacRebalance]) + { + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + AutoVacuumShmem->av_signal[AutoVacRebalance] = false; + autovac_recalculate_workers_for_balance(); + LWLockRelease(AutovacuumLock); + } + + if (AutoVacuumShmem->av_signal[AutoVacForkFailed]) + { + /* + * If the postmaster failed to start a new worker, we sleep + * for a little while and resend the signal. The new worker's + * state is still in memory, so this is sufficient. After + * that, we restart the main loop. + * + * XXX should we put a limit to the number of times we retry? + * I don't think it makes much sense, because a future start + * of a worker will continue to fail in the same way. 
+ */ + AutoVacuumShmem->av_signal[AutoVacForkFailed] = false; + pg_usleep(1000000L); /* 1s */ + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER); + continue; + } + } + + /* + * There are some conditions that we need to check before trying to + * start a worker. First, we need to make sure that there is a worker + * slot available. Second, we need to make sure that no other worker + * failed while starting up. + */ + + current_time = GetCurrentTimestamp(); + LWLockAcquire(AutovacuumLock, LW_SHARED); + + can_launch = !dlist_is_empty(&AutoVacuumShmem->av_freeWorkers); + + if (AutoVacuumShmem->av_startingWorker != NULL) + { + int waittime; + WorkerInfo worker = AutoVacuumShmem->av_startingWorker; + + /* + * We can't launch another worker when another one is still + * starting up (or failed while doing so), so just sleep for a bit + * more; that worker will wake us up again as soon as it's ready. + * We will only wait autovacuum_naptime seconds (up to a maximum + * of 60 seconds) for this to happen however. Note that failure + * to connect to a particular database is not a problem here, + * because the worker removes itself from the startingWorker + * pointer before trying to connect. Problems detected by the + * postmaster (like fork() failure) are also reported and handled + * differently. The only problems that may cause this code to + * fire are errors in the earlier sections of AutoVacWorkerMain, + * before the worker removes the WorkerInfo from the + * startingWorker pointer. + */ + waittime = Min(autovacuum_naptime, 60) * 1000; + if (TimestampDifferenceExceeds(worker->wi_launchtime, current_time, + waittime)) + { + LWLockRelease(AutovacuumLock); + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + + /* + * No other process can put a worker in starting mode, so if + * startingWorker is still set (non-NULL) after exchanging our lock, + * we assume it's the same one we saw above (so we don't + * recheck the launch time). 
+ */ + if (AutoVacuumShmem->av_startingWorker != NULL) + { + worker = AutoVacuumShmem->av_startingWorker; + worker->wi_dboid = InvalidOid; + worker->wi_tableoid = InvalidOid; + worker->wi_sharedrel = false; + worker->wi_proc = NULL; + worker->wi_launchtime = 0; + dlist_push_head(&AutoVacuumShmem->av_freeWorkers, + &worker->wi_links); + AutoVacuumShmem->av_startingWorker = NULL; + ereport(WARNING, + errmsg("autovacuum worker took too long to start; canceled")); + } + } + else + can_launch = false; + } + LWLockRelease(AutovacuumLock); /* either shared or exclusive */ + + /* if we can't do anything, just go back to sleep */ + if (!can_launch) + continue; + + /* We're OK to start a new worker */ + + if (dlist_is_empty(&DatabaseList)) + { + /* + * Special case when the list is empty: start a worker right away. + * This covers the initial case, when no database is in pgstats + * (thus the list is empty). Note that the constraints in + * launcher_determine_sleep keep us from starting workers too + * quickly (at most once every autovacuum_naptime when the list is + * empty). + */ + launch_worker(current_time); + } + else + { + /* + * because rebuild_database_list constructs a list with most + * distant adl_next_worker first, we obtain our database from the + * tail of the list. + */ + avl_dbase *avdb; + + avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList); + + /* + * launch a worker if next_worker is right now or it is in the + * past + */ + if (TimestampDifferenceExceeds(avdb->adl_next_worker, + current_time, 0)) + launch_worker(current_time); + } + } + + AutoVacLauncherShutdown(); +} + +/* + * Process any new interrupts. + */ +static void +HandleAutoVacLauncherInterrupts(void) +{ + /* the normal shutdown case */ + if (ShutdownRequestPending) + AutoVacLauncherShutdown(); + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + + /* shutdown requested in config file? 
*/ + if (!AutoVacuumingActive()) + AutoVacLauncherShutdown(); + + /* rebuild the list in case the naptime changed */ + rebuild_database_list(InvalidOid); + } + + /* Process barrier events */ + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); + + /* Perform logging of memory contexts of this process */ + if (LogMemoryContextPending) + ProcessLogMemoryContextInterrupt(); + + /* Process sinval catchup interrupts that happened while sleeping */ + ProcessCatchupInterrupt(); +} + +/* + * Perform a normal exit from the autovac launcher. + */ +static void +AutoVacLauncherShutdown(void) +{ + ereport(DEBUG1, + (errmsg_internal("autovacuum launcher shutting down"))); + AutoVacuumShmem->av_launcherpid = 0; + + proc_exit(0); /* done */ +} + +/* + * Determine the time to sleep, based on the database list. + * + * The "canlaunch" parameter indicates whether we can start a worker right now, + * for example due to the workers being all busy. If this is false, we will + * cause a long sleep, which will be interrupted when a worker exits. + */ +static void +launcher_determine_sleep(bool canlaunch, bool recursing, struct timeval *nap) +{ + /* + * We sleep until the next scheduled vacuum. We trust that when the + * database list was built, care was taken so that no entries have times + * in the past; if the first entry has too close a next_worker value, or a + * time in the past, we will sleep a small nominal time. 
+ */ + if (!canlaunch) + { + nap->tv_sec = autovacuum_naptime; + nap->tv_usec = 0; + } + else if (!dlist_is_empty(&DatabaseList)) + { + TimestampTz current_time = GetCurrentTimestamp(); + TimestampTz next_wakeup; + avl_dbase *avdb; + long secs; + int usecs; + + avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList); + + next_wakeup = avdb->adl_next_worker; + TimestampDifference(current_time, next_wakeup, &secs, &usecs); + + nap->tv_sec = secs; + nap->tv_usec = usecs; + } + else + { + /* list is empty, sleep for whole autovacuum_naptime seconds */ + nap->tv_sec = autovacuum_naptime; + nap->tv_usec = 0; + } + + /* + * If the result is exactly zero, it means a database had an entry with + * time in the past. Rebuild the list so that the databases are evenly + * distributed again, and recalculate the time to sleep. This can happen + * if there are more tables needing vacuum than workers, and they all take + * longer to vacuum than autovacuum_naptime. + * + * We only recurse once. rebuild_database_list should always return times + * in the future, but it seems best not to trust too much on that. + */ + if (nap->tv_sec == 0 && nap->tv_usec == 0 && !recursing) + { + rebuild_database_list(InvalidOid); + launcher_determine_sleep(canlaunch, true, nap); + return; + } + + /* The smallest time we'll allow the launcher to sleep. */ + if (nap->tv_sec <= 0 && nap->tv_usec <= MIN_AUTOVAC_SLEEPTIME * 1000) + { + nap->tv_sec = 0; + nap->tv_usec = MIN_AUTOVAC_SLEEPTIME * 1000; + } + + /* + * If the sleep time is too large, clamp it to an arbitrary maximum (plus + * any fractional seconds, for simplicity). This avoids an essentially + * infinite sleep in strange cases like the system clock going backwards a + * few years. + */ + if (nap->tv_sec > MAX_AUTOVAC_SLEEPTIME) + nap->tv_sec = MAX_AUTOVAC_SLEEPTIME; +} + +/* + * Build an updated DatabaseList. 
It must only contain databases that appear + * in pgstats, and must be sorted by next_worker from highest to lowest, + * distributed regularly across the next autovacuum_naptime interval. + * + * Receives the Oid of the database that made this list be generated (we call + * this the "new" database, because when the database was already present on + * the list, we expect that this function is not called at all). The + * preexisting list, if any, will be used to preserve the order of the + * databases in the autovacuum_naptime period. The new database is put at the + * end of the interval. The actual values are not saved, which should not be + * much of a problem. + */ +static void +rebuild_database_list(Oid newdb) +{ + List *dblist; + ListCell *cell; + MemoryContext newcxt; + MemoryContext oldcxt; + MemoryContext tmpcxt; + HASHCTL hctl; + int score; + int nelems; + HTAB *dbhash; + dlist_iter iter; + + newcxt = AllocSetContextCreate(AutovacMemCxt, + "Autovacuum database list", + ALLOCSET_DEFAULT_SIZES); + tmpcxt = AllocSetContextCreate(newcxt, + "Autovacuum database list (tmp)", + ALLOCSET_DEFAULT_SIZES); + oldcxt = MemoryContextSwitchTo(tmpcxt); + + /* + * Implementing this is not as simple as it sounds, because we need to put + * the new database at the end of the list; next the databases that were + * already on the list, and finally (at the tail of the list) all the + * other databases that are not on the existing list. + * + * To do this, we build an empty hash table of scored databases. We will + * start with the lowest score (zero) for the new database, then + * increasing scores for the databases in the existing list, in order, and + * lastly increasing scores for all databases gotten via + * get_database_list() that are not already on the hash. + * + * Then we will put all the hash elements into an array, sort the array by + * score, and finally put the array elements into the new doubly linked + * list. 
+ */ + hctl.keysize = sizeof(Oid); + hctl.entrysize = sizeof(avl_dbase); + hctl.hcxt = tmpcxt; + dbhash = hash_create("autovacuum db hash", 20, &hctl, /* magic number here + * FIXME */ + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* start by inserting the new database */ + score = 0; + if (OidIsValid(newdb)) + { + avl_dbase *db; + PgStat_StatDBEntry *entry; + + /* only consider this database if it has a pgstat entry */ + entry = pgstat_fetch_stat_dbentry(newdb); + if (entry != NULL) + { + /* we assume it isn't found because the hash was just created */ + db = hash_search(dbhash, &newdb, HASH_ENTER, NULL); + + /* hash_search already filled in the key */ + db->adl_score = score++; + /* next_worker is filled in later */ + } + } + + /* Now insert the databases from the existing list */ + dlist_foreach(iter, &DatabaseList) + { + avl_dbase *avdb = dlist_container(avl_dbase, adl_node, iter.cur); + avl_dbase *db; + bool found; + PgStat_StatDBEntry *entry; + + /* + * skip databases with no stat entries -- in particular, this gets rid + * of dropped databases + */ + entry = pgstat_fetch_stat_dbentry(avdb->adl_datid); + if (entry == NULL) + continue; + + db = hash_search(dbhash, &(avdb->adl_datid), HASH_ENTER, &found); + + if (!found) + { + /* hash_search already filled in the key */ + db->adl_score = score++; + /* next_worker is filled in later */ + } + } + + /* finally, insert all qualifying databases not previously inserted */ + dblist = get_database_list(); + foreach(cell, dblist) + { + avw_dbase *avdb = lfirst(cell); + avl_dbase *db; + bool found; + PgStat_StatDBEntry *entry; + + /* only consider databases with a pgstat entry */ + entry = pgstat_fetch_stat_dbentry(avdb->adw_datid); + if (entry == NULL) + continue; + + db = hash_search(dbhash, &(avdb->adw_datid), HASH_ENTER, &found); + /* only update the score if the database was not already on the hash */ + if (!found) + { + /* hash_search already filled in the key */ + db->adl_score = score++; + /* next_worker is 
filled in later */ + } + } + nelems = score; + + /* from here on, the allocated memory belongs to the new list */ + MemoryContextSwitchTo(newcxt); + dlist_init(&DatabaseList); + + if (nelems > 0) + { + TimestampTz current_time; + int millis_increment; + avl_dbase *dbary; + avl_dbase *db; + HASH_SEQ_STATUS seq; + int i; + + /* put all the hash elements into an array */ + dbary = palloc(nelems * sizeof(avl_dbase)); + + i = 0; + hash_seq_init(&seq, dbhash); + while ((db = hash_seq_search(&seq)) != NULL) + memcpy(&(dbary[i++]), db, sizeof(avl_dbase)); + + /* sort the array */ + qsort(dbary, nelems, sizeof(avl_dbase), db_comparator); + + /* + * Determine the time interval between databases in the schedule. If + * we see that the configured naptime would take us to sleep times + * lower than our min sleep time (which launcher_determine_sleep is + * coded not to allow), silently use a larger naptime (but don't touch + * the GUC variable). + */ + millis_increment = 1000.0 * autovacuum_naptime / nelems; + if (millis_increment <= MIN_AUTOVAC_SLEEPTIME) + millis_increment = MIN_AUTOVAC_SLEEPTIME * 1.1; + + current_time = GetCurrentTimestamp(); + + /* + * move the elements from the array into the dlist, setting the + * next_worker while walking the array + */ + for (i = 0; i < nelems; i++) + { + db = &(dbary[i]); + + current_time = TimestampTzPlusMilliseconds(current_time, + millis_increment); + db->adl_next_worker = current_time; + + /* later elements should go closer to the head of the list */ + dlist_push_head(&DatabaseList, &db->adl_node); + } + } + + /* all done, clean up memory */ + if (DatabaseListCxt != NULL) + MemoryContextDelete(DatabaseListCxt); + MemoryContextDelete(tmpcxt); + DatabaseListCxt = newcxt; + MemoryContextSwitchTo(oldcxt); +} + +/* qsort comparator for avl_dbase, using adl_score */ +static int +db_comparator(const void *a, const void *b) +{ + if (((const avl_dbase *) a)->adl_score == ((const avl_dbase *) b)->adl_score) + return 0; + else + return 
(((const avl_dbase *) a)->adl_score < ((const avl_dbase *) b)->adl_score) ? 1 : -1; +} + +/* + * do_start_worker + * + * Bare-bones procedure for starting an autovacuum worker from the launcher. + * It determines what database to work on, sets up shared memory stuff and + * signals postmaster to start the worker. It fails gracefully if invoked when + * autovacuum_workers are already active. + * + * Return value is the OID of the database that the worker is going to process, + * or InvalidOid if no worker was actually started. + */ +static Oid +do_start_worker(void) +{ + List *dblist; + ListCell *cell; + TransactionId xidForceLimit; + MultiXactId multiForceLimit; + bool for_xid_wrap; + bool for_multi_wrap; + avw_dbase *avdb; + TimestampTz current_time; + bool skipit = false; + Oid retval = InvalidOid; + MemoryContext tmpcxt, + oldcxt; + + /* return quickly when there are no free workers */ + LWLockAcquire(AutovacuumLock, LW_SHARED); + if (dlist_is_empty(&AutoVacuumShmem->av_freeWorkers)) + { + LWLockRelease(AutovacuumLock); + return InvalidOid; + } + LWLockRelease(AutovacuumLock); + + /* + * Create and switch to a temporary context to avoid leaking the memory + * allocated for the database list. + */ + tmpcxt = AllocSetContextCreate(CurrentMemoryContext, + "Autovacuum start worker (tmp)", + ALLOCSET_DEFAULT_SIZES); + oldcxt = MemoryContextSwitchTo(tmpcxt); + + /* Get a list of databases */ + dblist = get_database_list(); + + /* + * Determine the oldest datfrozenxid/relfrozenxid that we will allow to + * pass without forcing a vacuum. (This limit can be tightened for + * particular tables, but not loosened.) 
+ */ + recentXid = ReadNextTransactionId(); + xidForceLimit = recentXid - autovacuum_freeze_max_age; + /* ensure it's a "normal" XID, else TransactionIdPrecedes misbehaves */ + /* this can cause the limit to go backwards by 3, but that's OK */ + if (xidForceLimit < FirstNormalTransactionId) + xidForceLimit -= FirstNormalTransactionId; + + /* Also determine the oldest datminmxid we will consider. */ + recentMulti = ReadNextMultiXactId(); + multiForceLimit = recentMulti - MultiXactMemberFreezeThreshold(); + if (multiForceLimit < FirstMultiXactId) + multiForceLimit -= FirstMultiXactId; + + /* + * Choose a database to connect to. We pick the database that was least + * recently auto-vacuumed, or one that needs vacuuming to prevent Xid + * wraparound-related data loss. If any db at risk of Xid wraparound is + * found, we pick the one with oldest datfrozenxid, independently of + * autovacuum times; similarly we pick the one with the oldest datminmxid + * if any is in MultiXactId wraparound. Note that those in Xid wraparound + * danger are given more priority than those in multi wraparound danger. + * + * Note that a database with no stats entry is not considered, except for + * Xid wraparound purposes. The theory is that if no one has ever + * connected to it since the stats were last initialized, it doesn't need + * vacuuming. + * + * XXX This could be improved if we had more info about whether it needs + * vacuuming before connecting to it. Perhaps look through the pgstats + * data for the database's tables? One idea is to keep track of the + * number of new and dead tuples per database in pgstats. However it + * isn't clear how to construct a metric that measures that and not cause + * starvation for less busy databases. 
+ */ + avdb = NULL; + for_xid_wrap = false; + for_multi_wrap = false; + current_time = GetCurrentTimestamp(); + foreach(cell, dblist) + { + avw_dbase *tmp = lfirst(cell); + dlist_iter iter; + + /* Check to see if this one is at risk of wraparound */ + if (TransactionIdPrecedes(tmp->adw_frozenxid, xidForceLimit)) + { + if (avdb == NULL || + TransactionIdPrecedes(tmp->adw_frozenxid, + avdb->adw_frozenxid)) + avdb = tmp; + for_xid_wrap = true; + continue; + } + else if (for_xid_wrap) + continue; /* ignore not-at-risk DBs */ + else if (MultiXactIdPrecedes(tmp->adw_minmulti, multiForceLimit)) + { + if (avdb == NULL || + MultiXactIdPrecedes(tmp->adw_minmulti, avdb->adw_minmulti)) + avdb = tmp; + for_multi_wrap = true; + continue; + } + else if (for_multi_wrap) + continue; /* ignore not-at-risk DBs */ + + /* Find pgstat entry if any */ + tmp->adw_entry = pgstat_fetch_stat_dbentry(tmp->adw_datid); + + /* + * Skip a database with no pgstat entry; it means it hasn't seen any + * activity. + */ + if (!tmp->adw_entry) + continue; + + /* + * Also, skip a database that appears on the database list as having + * been processed recently (less than autovacuum_naptime seconds ago). + * We do this so that we don't select a database which we just + * selected, but that pgstat hasn't gotten around to updating the last + * autovacuum time yet. + */ + skipit = false; + + dlist_reverse_foreach(iter, &DatabaseList) + { + avl_dbase *dbp = dlist_container(avl_dbase, adl_node, iter.cur); + + if (dbp->adl_datid == tmp->adw_datid) + { + /* + * Skip this database if its next_worker value falls between + * the current time and the current time plus naptime. + */ + if (!TimestampDifferenceExceeds(dbp->adl_next_worker, + current_time, 0) && + !TimestampDifferenceExceeds(current_time, + dbp->adl_next_worker, + autovacuum_naptime * 1000)) + skipit = true; + + break; + } + } + if (skipit) + continue; + + /* + * Remember the db with oldest autovac time. 
(If we are here, both + * tmp->entry and db->entry must be non-null.) + */ + if (avdb == NULL || + tmp->adw_entry->last_autovac_time < avdb->adw_entry->last_autovac_time) + avdb = tmp; + } + + /* Found a database -- process it */ + if (avdb != NULL) + { + WorkerInfo worker; + dlist_node *wptr; + + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + + /* + * Get a worker entry from the freelist. We checked above, so there + * really should be a free slot. + */ + wptr = dlist_pop_head_node(&AutoVacuumShmem->av_freeWorkers); + + worker = dlist_container(WorkerInfoData, wi_links, wptr); + worker->wi_dboid = avdb->adw_datid; + worker->wi_proc = NULL; + worker->wi_launchtime = GetCurrentTimestamp(); + + AutoVacuumShmem->av_startingWorker = worker; + + LWLockRelease(AutovacuumLock); + + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER); + + retval = avdb->adw_datid; + } + else if (skipit) + { + /* + * If we skipped all databases on the list, rebuild it, because it + * probably contains a dropped database. + */ + rebuild_database_list(InvalidOid); + } + + MemoryContextSwitchTo(oldcxt); + MemoryContextDelete(tmpcxt); + + return retval; +} + +/* + * launch_worker + * + * Wrapper for starting a worker from the launcher. Besides actually starting + * it, update the database list to reflect the next time that another one will + * need to be started on the selected database. The actual database choice is + * left to do_start_worker. + * + * This routine is also expected to insert an entry into the database list if + * the selected database was previously absent from the list. + */ +static void +launch_worker(TimestampTz now) +{ + Oid dbid; + dlist_iter iter; + + dbid = do_start_worker(); + if (OidIsValid(dbid)) + { + bool found = false; + + /* + * Walk the database list and update the corresponding entry. If the + * database is not on the list, we'll recreate the list. 
+ */ + dlist_foreach(iter, &DatabaseList) + { + avl_dbase *avdb = dlist_container(avl_dbase, adl_node, iter.cur); + + if (avdb->adl_datid == dbid) + { + found = true; + + /* + * add autovacuum_naptime seconds to the current time, and use + * that as the new "next_worker" field for this database. + */ + avdb->adl_next_worker = + TimestampTzPlusMilliseconds(now, autovacuum_naptime * 1000); + + dlist_move_head(&DatabaseList, iter.cur); + break; + } + } + + /* + * If the database was not present in the database list, we rebuild + * the list. It's possible that the database does not get into the + * list anyway, for example if it's a database that doesn't have a + * pgstat entry, but this is not a problem because we don't want to + * schedule workers regularly into those in any case. + */ + if (!found) + rebuild_database_list(dbid); + } +} + +/* + * Called from postmaster to signal a failure to fork a process to become + * worker. The postmaster should kill(SIGUSR2) the launcher shortly + * after calling this function. + */ +void +AutoVacWorkerFailed(void) +{ + AutoVacuumShmem->av_signal[AutoVacForkFailed] = true; +} + +/* SIGUSR2: a worker is up and running, or just finished, or failed to fork */ +static void +avl_sigusr2_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + got_SIGUSR2 = true; + SetLatch(MyLatch); + + errno = save_errno; +} + + +/******************************************************************** + * AUTOVACUUM WORKER CODE + ********************************************************************/ + +#ifdef EXEC_BACKEND +/* + * forkexec routines for the autovacuum worker. + * + * Format up the arglist, then fork and exec. 
+ */ +static pid_t +avworker_forkexec(void) +{ + char *av[10]; + int ac = 0; + + av[ac++] = "postgres"; + av[ac++] = "--forkavworker"; + av[ac++] = NULL; /* filled in by postmaster_forkexec */ + av[ac] = NULL; + + Assert(ac < lengthof(av)); + + return postmaster_forkexec(ac, av); +} + +/* + * We need this set from the outside, before InitProcess is called + */ +void +AutovacuumWorkerIAm(void) +{ + am_autovacuum_worker = true; +} +#endif + +/* + * Main entry point for autovacuum worker process. + * + * This code is heavily based on pgarch.c, q.v. + */ +int +StartAutoVacWorker(void) +{ + pid_t worker_pid; + +#ifdef EXEC_BACKEND + switch ((worker_pid = avworker_forkexec())) +#else + switch ((worker_pid = fork_process())) +#endif + { + case -1: + ereport(LOG, + (errmsg("could not fork autovacuum worker process: %m"))); + return 0; + +#ifndef EXEC_BACKEND + case 0: + /* in postmaster child ... */ + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(false); + + AutoVacWorkerMain(0, NULL); + break; +#endif + default: + return (int) worker_pid; + } + + /* shouldn't get here */ + return 0; +} + +/* + * AutoVacWorkerMain + */ +NON_EXEC_STATIC void +AutoVacWorkerMain(int argc, char *argv[]) +{ + sigjmp_buf local_sigjmp_buf; + Oid dbid; + + am_autovacuum_worker = true; + + MyBackendType = B_AUTOVAC_WORKER; + init_ps_display(NULL); + + SetProcessingMode(InitProcessing); + + /* + * Set up signal handlers. We operate on databases much like a regular + * backend, so we use the same signal handling. See equivalent code in + * tcop/postgres.c. + */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + + /* + * SIGINT is used to signal canceling the current table's vacuum; SIGTERM + * means abort and exit cleanly, and SIGQUIT means abandon ship. 
+ */ + pqsignal(SIGINT, StatementCancelHandler); + pqsignal(SIGTERM, die); + /* SIGQUIT handler was already set up by InitPostmasterChild */ + + InitializeTimeouts(); /* establishes SIGALRM handler */ + + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, SIG_IGN); + pqsignal(SIGFPE, FloatExceptionHandler); + pqsignal(SIGCHLD, SIG_DFL); + + /* + * Create a per-backend PGPROC struct in shared memory, except in the + * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do + * this before we can use LWLocks (and in the EXEC_BACKEND case we already + * had to do some stuff with LWLocks). + */ +#ifndef EXEC_BACKEND + InitProcess(); +#endif + + /* Early initialization */ + BaseInit(); + + /* + * If an exception is encountered, processing resumes here. + * + * Unlike most auxiliary processes, we don't attempt to continue + * processing after an error; we just clean up and exit. The autovac + * launcher is responsible for spawning another worker later. + * + * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask + * (to wit, BlockSig) will be restored when longjmp'ing to here. Thus, + * signals other than SIGQUIT will be blocked until we exit. It might + * seem that this policy makes the HOLD_INTERRUPTS() call redundant, but + * it is not since InterruptPending might be set already. + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevents interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* Report the error to the server log */ + EmitErrorReport(); + + /* + * We can now go away. Note that because we called InitProcess, a + * callback was registered to do ProcKill, which will clean up + * necessary state. 
+ */ + proc_exit(0); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + sigprocmask(SIG_SETMASK, &UnBlockSig, NULL); + + /* + * Set always-secure search path, so malicious users can't redirect user + * code (e.g. pg_index.indexprs). (That code runs in a + * SECURITY_RESTRICTED_OPERATION sandbox, so malicious users could not + * take control of the entire autovacuum worker in any case.) + */ + SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Force zero_damaged_pages OFF in the autovac process, even if it is set + * in postgresql.conf. We don't really want such a dangerous option being + * applied non-interactively. + */ + SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Force settable timeouts off to avoid letting these settings prevent + * regular maintenance from being executed. + */ + SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE); + SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE); + SetConfigOption("idle_in_transaction_session_timeout", "0", + PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Force default_transaction_isolation to READ COMMITTED. We don't want + * to pay the overhead of serializable mode, nor add any risk of causing + * deadlocks or delaying other transactions. + */ + SetConfigOption("default_transaction_isolation", "read committed", + PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Force synchronous replication off to allow regular maintenance even if + * we are waiting for standbys to connect. This is important to ensure we + * aren't blocked from performing anti-wraparound tasks. + */ + if (synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH) + SetConfigOption("synchronous_commit", "local", + PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Even when system is configured to use a different fetch consistency, + * for autovac we always want fresh stats. 
+ */ + SetConfigOption("stats_fetch_consistency", "none", PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Get the info about the database we're going to work on. + */ + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + + /* + * beware of startingWorker being INVALID; this should normally not + * happen, but if a worker fails after forking and before this, the + * launcher might have decided to remove it from the queue and start + * again. + */ + if (AutoVacuumShmem->av_startingWorker != NULL) + { + MyWorkerInfo = AutoVacuumShmem->av_startingWorker; + dbid = MyWorkerInfo->wi_dboid; + MyWorkerInfo->wi_proc = MyProc; + + /* insert into the running list */ + dlist_push_head(&AutoVacuumShmem->av_runningWorkers, + &MyWorkerInfo->wi_links); + + /* + * remove from the "starting" pointer, so that the launcher can start + * a new worker if required + */ + AutoVacuumShmem->av_startingWorker = NULL; + LWLockRelease(AutovacuumLock); + + on_shmem_exit(FreeWorkerInfo, 0); + + /* wake up the launcher */ + if (AutoVacuumShmem->av_launcherpid != 0) + kill(AutoVacuumShmem->av_launcherpid, SIGUSR2); + } + else + { + /* no worker entry for me, go away */ + elog(WARNING, "autovacuum worker started without a worker entry"); + dbid = InvalidOid; + LWLockRelease(AutovacuumLock); + } + + if (OidIsValid(dbid)) + { + char dbname[NAMEDATALEN]; + + /* + * Report autovac startup to the cumulative stats system. We + * deliberately do this before InitPostgres, so that the + * last_autovac_time will get updated even if the connection attempt + * fails. This is to prevent autovac from getting "stuck" repeatedly + * selecting an unopenable database, rather than making any progress + * on stuff it can connect to. + */ + pgstat_report_autovac(dbid); + + /* + * Connect to the selected database, specifying no particular user + * + * Note: if we have selected a just-deleted database (due to using + * stale stats info), we'll fail and exit here. 
+ */ + InitPostgres(NULL, dbid, NULL, InvalidOid, false, false, + dbname); + SetProcessingMode(NormalProcessing); + set_ps_display(dbname); + ereport(DEBUG1, + (errmsg_internal("autovacuum: processing database \"%s\"", dbname))); + + if (PostAuthDelay) + pg_usleep(PostAuthDelay * 1000000L); + + /* And do an appropriate amount of work */ + recentXid = ReadNextTransactionId(); + recentMulti = ReadNextMultiXactId(); + do_autovacuum(); + } + + /* + * The launcher will be notified of my death in ProcKill, *if* we managed + * to get a worker slot at all + */ + + /* All done, go away */ + proc_exit(0); +} + +/* + * Return a WorkerInfo to the free list + */ +static void +FreeWorkerInfo(int code, Datum arg) +{ + if (MyWorkerInfo != NULL) + { + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + + /* + * Wake the launcher up so that he can launch a new worker immediately + * if required. We only save the launcher's PID in local memory here; + * the actual signal will be sent when the PGPROC is recycled. Note + * that we always do this, so that the launcher can rebalance the cost + * limit setting of the remaining workers. + * + * We somewhat ignore the risk that the launcher changes its PID + * between us reading it and the actual kill; we expect ProcKill to be + * called shortly after us, and we assume that PIDs are not reused too + * quickly after a process exits. 
+ */ + AutovacuumLauncherPid = AutoVacuumShmem->av_launcherpid; + + dlist_delete(&MyWorkerInfo->wi_links); + MyWorkerInfo->wi_dboid = InvalidOid; + MyWorkerInfo->wi_tableoid = InvalidOid; + MyWorkerInfo->wi_sharedrel = false; + MyWorkerInfo->wi_proc = NULL; + MyWorkerInfo->wi_launchtime = 0; + pg_atomic_clear_flag(&MyWorkerInfo->wi_dobalance); + dlist_push_head(&AutoVacuumShmem->av_freeWorkers, + &MyWorkerInfo->wi_links); + /* not mine anymore */ + MyWorkerInfo = NULL; + + /* + * now that we're inactive, cause a rebalancing of the surviving + * workers + */ + AutoVacuumShmem->av_signal[AutoVacRebalance] = true; + LWLockRelease(AutovacuumLock); + } +} + +/* + * Update vacuum cost-based delay-related parameters for autovacuum workers and + * backends executing VACUUM or ANALYZE using the value of relevant GUCs and + * global state. This must be called during setup for vacuum and after every + * config reload to ensure up-to-date values. + */ +void +VacuumUpdateCosts(void) +{ + if (MyWorkerInfo) + { + if (av_storage_param_cost_delay >= 0) + vacuum_cost_delay = av_storage_param_cost_delay; + else if (autovacuum_vac_cost_delay >= 0) + vacuum_cost_delay = autovacuum_vac_cost_delay; + else + /* fall back to VacuumCostDelay */ + vacuum_cost_delay = VacuumCostDelay; + + AutoVacuumUpdateCostLimit(); + } + else + { + /* Must be explicit VACUUM or ANALYZE */ + vacuum_cost_delay = VacuumCostDelay; + vacuum_cost_limit = VacuumCostLimit; + } + + /* + * If configuration changes are allowed to impact VacuumCostActive, make + * sure it is updated. + */ + if (VacuumFailsafeActive) + Assert(!VacuumCostActive); + else if (vacuum_cost_delay > 0) + VacuumCostActive = true; + else + { + VacuumCostActive = false; + VacuumCostBalance = 0; + } + + /* + * Since the cost logging requires a lock, avoid rendering the log message + * in case we are using a message level where the log wouldn't be emitted. 
+ */ + if (MyWorkerInfo && message_level_is_interesting(DEBUG2)) + { + Oid dboid, + tableoid; + + Assert(!LWLockHeldByMe(AutovacuumLock)); + + LWLockAcquire(AutovacuumLock, LW_SHARED); + dboid = MyWorkerInfo->wi_dboid; + tableoid = MyWorkerInfo->wi_tableoid; + LWLockRelease(AutovacuumLock); + + elog(DEBUG2, + "Autovacuum VacuumUpdateCosts(db=%u, rel=%u, dobalance=%s, cost_limit=%d, cost_delay=%g active=%s failsafe=%s)", + dboid, tableoid, pg_atomic_unlocked_test_flag(&MyWorkerInfo->wi_dobalance) ? "no" : "yes", + vacuum_cost_limit, vacuum_cost_delay, + vacuum_cost_delay > 0 ? "yes" : "no", + VacuumFailsafeActive ? "yes" : "no"); + } +} + +/* + * Update vacuum_cost_limit with the correct value for an autovacuum worker, + * given the value of other relevant cost limit parameters and the number of + * workers across which the limit must be balanced. Autovacuum workers must + * call this regularly in case av_nworkersForBalance has been updated by + * another worker or by the autovacuum launcher. They must also call it after a + * config reload. + */ +void +AutoVacuumUpdateCostLimit(void) +{ + if (!MyWorkerInfo) + return; + + /* + * note: in cost_limit, zero also means use value from elsewhere, because + * zero is not a valid value. 
+ */ + + if (av_storage_param_cost_limit > 0) + vacuum_cost_limit = av_storage_param_cost_limit; + else + { + int nworkers_for_balance; + + if (autovacuum_vac_cost_limit > 0) + vacuum_cost_limit = autovacuum_vac_cost_limit; + else + vacuum_cost_limit = VacuumCostLimit; + + /* Only balance limit if no cost-related storage parameters specified */ + if (pg_atomic_unlocked_test_flag(&MyWorkerInfo->wi_dobalance)) + return; + + Assert(vacuum_cost_limit > 0); + + nworkers_for_balance = pg_atomic_read_u32(&AutoVacuumShmem->av_nworkersForBalance); + + /* There is at least 1 autovac worker (this worker) */ + if (nworkers_for_balance <= 0) + elog(ERROR, "nworkers_for_balance must be > 0"); + + vacuum_cost_limit = Max(vacuum_cost_limit / nworkers_for_balance, 1); + } +} + +/* + * autovac_recalculate_workers_for_balance + * Recalculate the number of workers to consider, given cost-related + * storage parameters and the current number of active workers. + * + * Caller must hold the AutovacuumLock in at least shared mode to access + * worker->wi_proc. + */ +static void +autovac_recalculate_workers_for_balance(void) +{ + dlist_iter iter; + int orig_nworkers_for_balance; + int nworkers_for_balance = 0; + + Assert(LWLockHeldByMe(AutovacuumLock)); + + orig_nworkers_for_balance = + pg_atomic_read_u32(&AutoVacuumShmem->av_nworkersForBalance); + + dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers) + { + WorkerInfo worker = dlist_container(WorkerInfoData, wi_links, iter.cur); + + if (worker->wi_proc == NULL || + pg_atomic_unlocked_test_flag(&worker->wi_dobalance)) + continue; + + nworkers_for_balance++; + } + + if (nworkers_for_balance != orig_nworkers_for_balance) + pg_atomic_write_u32(&AutoVacuumShmem->av_nworkersForBalance, + nworkers_for_balance); +} + +/* + * get_database_list + * Return a list of all databases found in pg_database. 
+ * + * The list and associated data is allocated in the caller's memory context, + * which is in charge of ensuring that it's properly cleaned up afterwards. + * + * Note: this is the only function in which the autovacuum launcher uses a + * transaction. Although we aren't attached to any particular database and + * therefore can't access most catalogs, we do have enough infrastructure + * to do a seqscan on pg_database. + */ +static List * +get_database_list(void) +{ + List *dblist = NIL; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + MemoryContext resultcxt; + + /* This is the context that we will allocate our output data in */ + resultcxt = CurrentMemoryContext; + + /* + * Start a transaction so we can access pg_database, and get a snapshot. + * We don't have a use for the snapshot itself, but we're interested in + * the secondary effect that it sets RecentGlobalXmin. (This is critical + * for anything that reads heap pages, because HOT may decide to prune + * them even if the process doesn't attempt to modify any tuples.) + * + * FIXME: This comment is inaccurate / the code buggy. A snapshot that is + * not pushed/active does not reliably prevent HOT pruning (->xmin could + * e.g. be cleared when cache invalidations are processed). + */ + StartTransactionCommand(); + (void) GetTransactionSnapshot(); + + rel = table_open(DatabaseRelationId, AccessShareLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_database pgdatabase = (Form_pg_database) GETSTRUCT(tup); + avw_dbase *avdb; + MemoryContext oldcxt; + + /* + * If database has partially been dropped, we can't, nor need to, + * vacuum it. + */ + if (database_is_invalid_form(pgdatabase)) + { + elog(DEBUG2, + "autovacuum: skipping invalid database \"%s\"", + NameStr(pgdatabase->datname)); + continue; + } + + /* + * Allocate our results in the caller's context, not the + * transaction's. 
We do this inside the loop, and restore the original + * context at the end, so that leaky things like heap_getnext() are + * not called in a potentially long-lived context. + */ + oldcxt = MemoryContextSwitchTo(resultcxt); + + avdb = (avw_dbase *) palloc(sizeof(avw_dbase)); + + avdb->adw_datid = pgdatabase->oid; + avdb->adw_name = pstrdup(NameStr(pgdatabase->datname)); + avdb->adw_frozenxid = pgdatabase->datfrozenxid; + avdb->adw_minmulti = pgdatabase->datminmxid; + /* this gets set later: */ + avdb->adw_entry = NULL; + + dblist = lappend(dblist, avdb); + MemoryContextSwitchTo(oldcxt); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + /* Be sure to restore caller's memory context */ + MemoryContextSwitchTo(resultcxt); + + return dblist; +} + +/* + * Process a database table-by-table + * + * Note that CHECK_FOR_INTERRUPTS is supposed to be used in certain spots in + * order not to ignore shutdown commands for too long. + */ +static void +do_autovacuum(void) +{ + Relation classRel; + HeapTuple tuple; + TableScanDesc relScan; + Form_pg_database dbForm; + List *table_oids = NIL; + List *orphan_oids = NIL; + HASHCTL ctl; + HTAB *table_toast_map; + ListCell *volatile cell; + BufferAccessStrategy bstrategy; + ScanKeyData key; + TupleDesc pg_class_desc; + int effective_multixact_freeze_max_age; + bool did_vacuum = false; + bool found_concurrent_worker = false; + int i; + + /* + * StartTransactionCommand and CommitTransactionCommand will automatically + * switch to other contexts. We need this one to keep the list of + * relations to vacuum/analyze across transactions. + */ + AutovacMemCxt = AllocSetContextCreate(TopMemoryContext, + "Autovacuum worker", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(AutovacMemCxt); + + /* Start a transaction so our commands have one to play into. */ + StartTransactionCommand(); + + /* + * Compute the multixact age for which freezing is urgent. 
This is + * normally autovacuum_multixact_freeze_max_age, but may be less if we are + * short of multixact member space. + */ + effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold(); + + /* + * Find the pg_database entry and select the default freeze ages. We use + * zero in template and nonconnectable databases, else the system-wide + * default. + */ + tuple = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for database %u", MyDatabaseId); + dbForm = (Form_pg_database) GETSTRUCT(tuple); + + if (dbForm->datistemplate || !dbForm->datallowconn) + { + default_freeze_min_age = 0; + default_freeze_table_age = 0; + default_multixact_freeze_min_age = 0; + default_multixact_freeze_table_age = 0; + } + else + { + default_freeze_min_age = vacuum_freeze_min_age; + default_freeze_table_age = vacuum_freeze_table_age; + default_multixact_freeze_min_age = vacuum_multixact_freeze_min_age; + default_multixact_freeze_table_age = vacuum_multixact_freeze_table_age; + } + + ReleaseSysCache(tuple); + + /* StartTransactionCommand changed the current memory context; switch back */ + MemoryContextSwitchTo(AutovacMemCxt); + + classRel = table_open(RelationRelationId, AccessShareLock); + + /* create a copy so we can use it after closing pg_class */ + pg_class_desc = CreateTupleDescCopy(RelationGetDescr(classRel)); + + /* create hash table for toast <-> main relid mapping */ + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(av_relation); + + table_toast_map = hash_create("TOAST to main relid map", + 100, + &ctl, + HASH_ELEM | HASH_BLOBS); + + /* + * Scan pg_class to determine which tables to vacuum. + * + * We do this in two passes: on the first one we collect the list of plain + * relations and materialized views, and on the second one we collect + * TOAST tables.
The reason for doing the second pass is that during it we + * want to use the main relation's pg_class.reloptions entry if the TOAST + * table does not have any, and we cannot obtain it unless we know + * beforehand what's the main table OID. + * + * We need to check TOAST tables separately because in cases with short, + * wide tables there might be proportionally much more activity in the + * TOAST table than in its parent. + */ + relScan = table_beginscan_catalog(classRel, 0, NULL); + + /* + * On the first pass, we collect main tables to vacuum, and also the main + * table relid to TOAST relid mapping. + */ + while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL) + { + Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple); + PgStat_StatTabEntry *tabentry; + AutoVacOpts *relopts; + Oid relid; + bool dovacuum; + bool doanalyze; + bool wraparound; + + if (classForm->relkind != RELKIND_RELATION && + classForm->relkind != RELKIND_MATVIEW) + continue; + + relid = classForm->oid; + + /* + * Check if it is a temp table (presumably, of some other backend's). + * We cannot safely process other backends' temp tables. + */ + if (classForm->relpersistence == RELPERSISTENCE_TEMP) + { + /* + * We just ignore it if the owning backend is still active and + * using the temporary schema. Also, for safety, ignore it if the + * namespace doesn't exist or isn't a temp namespace after all. + */ + if (checkTempNamespaceStatus(classForm->relnamespace) == TEMP_NAMESPACE_IDLE) + { + /* + * The table seems to be orphaned -- although it might be that + * the owning backend has already deleted it and exited; our + * pg_class scan snapshot is not necessarily up-to-date + * anymore, so we could be looking at a committed-dead entry. + * Remember it so we can try to delete it later. 
+ */ + orphan_oids = lappend_oid(orphan_oids, relid); + } + continue; + } + + /* Fetch reloptions and the pgstat entry for this table */ + relopts = extract_autovac_opts(tuple, pg_class_desc); + tabentry = pgstat_fetch_stat_tabentry_ext(classForm->relisshared, + relid); + + /* Check if it needs vacuum or analyze */ + relation_needs_vacanalyze(relid, relopts, classForm, tabentry, + effective_multixact_freeze_max_age, + &dovacuum, &doanalyze, &wraparound); + + /* Relations that need work are added to table_oids */ + if (dovacuum || doanalyze) + table_oids = lappend_oid(table_oids, relid); + + /* + * Remember TOAST associations for the second pass. Note: we must do + * this whether or not the table is going to be vacuumed, because we + * don't automatically vacuum toast tables along with the parent table. + */ + if (OidIsValid(classForm->reltoastrelid)) + { + av_relation *hentry; + bool found; + + hentry = hash_search(table_toast_map, + &classForm->reltoastrelid, + HASH_ENTER, &found); + + if (!found) + { + /* hash_search already filled in the key */ + hentry->ar_relid = relid; + hentry->ar_hasrelopts = false; + if (relopts != NULL) + { + hentry->ar_hasrelopts = true; + memcpy(&hentry->ar_reloptions, relopts, + sizeof(AutoVacOpts)); + } + } + } + } + + table_endscan(relScan); + + /* second pass: check TOAST tables */ + ScanKeyInit(&key, + Anum_pg_class_relkind, + BTEqualStrategyNumber, F_CHAREQ, + CharGetDatum(RELKIND_TOASTVALUE)); + + relScan = table_beginscan_catalog(classRel, 1, &key); + while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL) + { + Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple); + PgStat_StatTabEntry *tabentry; + Oid relid; + AutoVacOpts *relopts = NULL; + bool dovacuum; + bool doanalyze; + bool wraparound; + + /* + * We cannot safely process other backends' temp tables, so skip 'em.
+ */ + if (classForm->relpersistence == RELPERSISTENCE_TEMP) + continue; + + relid = classForm->oid; + + /* + * fetch reloptions -- if this toast table does not have them, try the + * main rel + */ + relopts = extract_autovac_opts(tuple, pg_class_desc); + if (relopts == NULL) + { + av_relation *hentry; + bool found; + + hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found); + if (found && hentry->ar_hasrelopts) + relopts = &hentry->ar_reloptions; + } + + /* Fetch the pgstat entry for this table */ + tabentry = pgstat_fetch_stat_tabentry_ext(classForm->relisshared, + relid); + + relation_needs_vacanalyze(relid, relopts, classForm, tabentry, + effective_multixact_freeze_max_age, + &dovacuum, &doanalyze, &wraparound); + + /* ignore analyze for toast tables */ + if (dovacuum) + table_oids = lappend_oid(table_oids, relid); + } + + table_endscan(relScan); + table_close(classRel, AccessShareLock); + + /* + * Recheck orphan temporary tables, and if they still seem orphaned, drop + * them. We'll eat a transaction per dropped table, which might seem + * excessive, but we should only need to do anything as a result of a + * previous backend crash, so this should not happen often enough to + * justify "optimizing". Using separate transactions ensures that we + * don't bloat the lock table if there are many temp tables to be dropped, + * and it ensures that we don't lose work if a deletion attempt fails. + */ + foreach(cell, orphan_oids) + { + Oid relid = lfirst_oid(cell); + Form_pg_class classForm; + ObjectAddress object; + + /* + * Check for user-requested abort. + */ + CHECK_FOR_INTERRUPTS(); + + /* + * Try to lock the table. If we can't get the lock immediately, + * somebody else is using (or dropping) the table, so it's not our + * concern anymore. Having the lock prevents race conditions below. 
+ */ + if (!ConditionalLockRelationOid(relid, AccessExclusiveLock)) + continue; + + /* + * Re-fetch the pg_class tuple and re-check whether it still seems to + * be an orphaned temp table. If it's not there or no longer the same + * relation, ignore it. + */ + tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tuple)) + { + /* be sure to drop useless lock so we don't bloat lock table */ + UnlockRelationOid(relid, AccessExclusiveLock); + continue; + } + classForm = (Form_pg_class) GETSTRUCT(tuple); + + /* + * Make all the same tests made in the loop above. In event of OID + * counter wraparound, the pg_class entry we have now might be + * completely unrelated to the one we saw before. + */ + if (!((classForm->relkind == RELKIND_RELATION || + classForm->relkind == RELKIND_MATVIEW) && + classForm->relpersistence == RELPERSISTENCE_TEMP)) + { + UnlockRelationOid(relid, AccessExclusiveLock); + continue; + } + + if (checkTempNamespaceStatus(classForm->relnamespace) != TEMP_NAMESPACE_IDLE) + { + UnlockRelationOid(relid, AccessExclusiveLock); + continue; + } + + /* OK, let's delete it */ + ereport(LOG, + (errmsg("autovacuum: dropping orphan temp table \"%s.%s.%s\"", + get_database_name(MyDatabaseId), + get_namespace_name(classForm->relnamespace), + NameStr(classForm->relname)))); + + object.classId = RelationRelationId; + object.objectId = relid; + object.objectSubId = 0; + performDeletion(&object, DROP_CASCADE, + PERFORM_DELETION_INTERNAL | + PERFORM_DELETION_QUIETLY | + PERFORM_DELETION_SKIP_EXTENSIONS); + + /* + * To commit the deletion, end current transaction and start a new + * one. Note this also releases the lock we took. + */ + CommitTransactionCommand(); + StartTransactionCommand(); + + /* StartTransactionCommand changed current memory context */ + MemoryContextSwitchTo(AutovacMemCxt); + } + + /* + * Optionally, create a buffer access strategy object for VACUUM to use. 
+ * We use the same BufferAccessStrategy object for all tables VACUUMed by + * this worker to prevent autovacuum from blowing out shared buffers. + * + * VacuumBufferUsageLimit being set to 0 results in + * GetAccessStrategyWithSize returning NULL, effectively meaning we can + * use up to all of shared buffers. + * + * If we later enter failsafe mode on any of the tables being vacuumed, we + * will cease use of the BufferAccessStrategy only for that table. + * + * XXX should we consider adding code to adjust the size of this if + * VacuumBufferUsageLimit changes? + */ + bstrategy = GetAccessStrategyWithSize(BAS_VACUUM, VacuumBufferUsageLimit); + + /* + * create a memory context to act as fake PortalContext, so that the + * contexts created in the vacuum code are cleaned up for each table. + */ + PortalContext = AllocSetContextCreate(AutovacMemCxt, + "Autovacuum Portal", + ALLOCSET_DEFAULT_SIZES); + + /* + * Perform operations on collected tables. + */ + foreach(cell, table_oids) + { + Oid relid = lfirst_oid(cell); + HeapTuple classTup; + autovac_table *tab; + bool isshared; + bool skipit; + dlist_iter iter; + + CHECK_FOR_INTERRUPTS(); + + /* + * Check for config changes before processing each collected table. + */ + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + + /* + * You might be tempted to bail out if we see autovacuum is now + * disabled. Must resist that temptation -- this might be a + * for-wraparound emergency worker, in which case that would be + * entirely inappropriate. + */ + } + + /* + * Find out whether the table is shared or not. (It's slightly + * annoying to fetch the syscache entry just for this, but in typical + * cases it adds little cost because table_recheck_autovac would + * refetch the entry anyway. We could buy that back by copying the + * tuple here and passing it to table_recheck_autovac, but that + * increases the odds of that function working with stale data.) 
+ */ + classTup = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(classTup)) + continue; /* somebody deleted the rel, forget it */ + isshared = ((Form_pg_class) GETSTRUCT(classTup))->relisshared; + ReleaseSysCache(classTup); + + /* + * Hold schedule lock from here until we've claimed the table. We + * also need the AutovacuumLock to walk the worker array, but that one + * can just be a shared lock. + */ + LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE); + LWLockAcquire(AutovacuumLock, LW_SHARED); + + /* + * Check whether the table is being vacuumed concurrently by another + * worker. + */ + skipit = false; + dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers) + { + WorkerInfo worker = dlist_container(WorkerInfoData, wi_links, iter.cur); + + /* ignore myself */ + if (worker == MyWorkerInfo) + continue; + + /* ignore workers in other databases (unless table is shared) */ + if (!worker->wi_sharedrel && worker->wi_dboid != MyDatabaseId) + continue; + + if (worker->wi_tableoid == relid) + { + skipit = true; + found_concurrent_worker = true; + break; + } + } + LWLockRelease(AutovacuumLock); + if (skipit) + { + LWLockRelease(AutovacuumScheduleLock); + continue; + } + + /* + * Store the table's OID in shared memory before releasing the + * schedule lock, so that other workers don't try to vacuum it + * concurrently. (We claim it here so as not to hold + * AutovacuumScheduleLock while rechecking the stats.) + */ + MyWorkerInfo->wi_tableoid = relid; + MyWorkerInfo->wi_sharedrel = isshared; + LWLockRelease(AutovacuumScheduleLock); + + /* + * Check whether pgstat data still says we need to vacuum this table. + * It could have changed if something else processed the table while + * we weren't looking. This doesn't entirely close the race condition, + * but it is very small. 
+ */ + MemoryContextSwitchTo(AutovacMemCxt); + tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc, + effective_multixact_freeze_max_age); + if (tab == NULL) + { + /* someone else vacuumed the table, or it went away */ + LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE); + MyWorkerInfo->wi_tableoid = InvalidOid; + MyWorkerInfo->wi_sharedrel = false; + LWLockRelease(AutovacuumScheduleLock); + continue; + } + + /* + * Save the cost-related storage parameter values in global variables + * for reference when updating vacuum_cost_delay and vacuum_cost_limit + * during vacuuming this table. + */ + av_storage_param_cost_delay = tab->at_storage_param_vac_cost_delay; + av_storage_param_cost_limit = tab->at_storage_param_vac_cost_limit; + + /* + * We only expect this worker to ever set the flag, so don't bother + * checking the return value. We shouldn't have to retry. + */ + if (tab->at_dobalance) + pg_atomic_test_set_flag(&MyWorkerInfo->wi_dobalance); + else + pg_atomic_clear_flag(&MyWorkerInfo->wi_dobalance); + + LWLockAcquire(AutovacuumLock, LW_SHARED); + autovac_recalculate_workers_for_balance(); + LWLockRelease(AutovacuumLock); + + /* + * We wait until this point to update cost delay and cost limit + * values, even though we reloaded the configuration file above, so + * that we can take into account the cost-related storage parameters. + */ + VacuumUpdateCosts(); + + + /* clean up memory before each iteration */ + MemoryContextResetAndDeleteChildren(PortalContext); + + /* + * Save the relation name for a possible error message, to avoid a + * catalog lookup in case of an error. If any of these return NULL, + * then the relation has been dropped since last we checked; skip it. + * Note: they must live in a long-lived memory context because we call + * vacuum and analyze in different transactions. 
+ */ + + tab->at_relname = get_rel_name(tab->at_relid); + tab->at_nspname = get_namespace_name(get_rel_namespace(tab->at_relid)); + tab->at_datname = get_database_name(MyDatabaseId); + if (!tab->at_relname || !tab->at_nspname || !tab->at_datname) + goto deleted; + + /* + * We will abort vacuuming the current table if something errors out, + * and continue with the next one in schedule; in particular, this + * happens if we are interrupted with SIGINT. + */ + PG_TRY(); + { + /* Use PortalContext for any per-table allocations */ + MemoryContextSwitchTo(PortalContext); + + /* have at it */ + autovacuum_do_vac_analyze(tab, bstrategy); + + /* + * Clear a possible query-cancel signal, to avoid a late reaction + * to an automatically-sent signal because of vacuuming the + * current table (we're done with it, so it would make no sense to + * cancel at this point.) + */ + QueryCancelPending = false; + } + PG_CATCH(); + { + /* + * Abort the transaction, start a new one, and proceed with the + * next table in our list. 
+ */ + HOLD_INTERRUPTS(); + if (tab->at_params.options & VACOPT_VACUUM) + errcontext("automatic vacuum of table \"%s.%s.%s\"", + tab->at_datname, tab->at_nspname, tab->at_relname); + else + errcontext("automatic analyze of table \"%s.%s.%s\"", + tab->at_datname, tab->at_nspname, tab->at_relname); + EmitErrorReport(); + + /* this resets ProcGlobal->statusFlags[i] too */ + AbortOutOfAnyTransaction(); + FlushErrorState(); + MemoryContextResetAndDeleteChildren(PortalContext); + + /* restart our transaction for the following operations */ + StartTransactionCommand(); + RESUME_INTERRUPTS(); + } + PG_END_TRY(); + + /* Make sure we're back in AutovacMemCxt */ + MemoryContextSwitchTo(AutovacMemCxt); + + did_vacuum = true; + + /* ProcGlobal->statusFlags[i] are reset at the next end of xact */ + + /* be tidy */ +deleted: + if (tab->at_datname != NULL) + pfree(tab->at_datname); + if (tab->at_nspname != NULL) + pfree(tab->at_nspname); + if (tab->at_relname != NULL) + pfree(tab->at_relname); + pfree(tab); + + /* + * Remove my info from shared memory. We set wi_dobalance on the + * assumption that we are more likely than not to vacuum a table with + * no cost-related storage parameters next, so we want to claim our + * share of I/O as soon as possible to avoid thrashing the global + * balance. + */ + LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE); + MyWorkerInfo->wi_tableoid = InvalidOid; + MyWorkerInfo->wi_sharedrel = false; + LWLockRelease(AutovacuumScheduleLock); + pg_atomic_test_set_flag(&MyWorkerInfo->wi_dobalance); + } + + /* + * Perform additional work items, as requested by backends. 
+ */ + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + for (i = 0; i < NUM_WORKITEMS; i++) + { + AutoVacuumWorkItem *workitem = &AutoVacuumShmem->av_workItems[i]; + + if (!workitem->avw_used) + continue; + if (workitem->avw_active) + continue; + if (workitem->avw_database != MyDatabaseId) + continue; + + /* claim this one, and release lock while performing it */ + workitem->avw_active = true; + LWLockRelease(AutovacuumLock); + + perform_work_item(workitem); + + /* + * Check for config changes before acquiring lock for further jobs. + */ + CHECK_FOR_INTERRUPTS(); + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + VacuumUpdateCosts(); + } + + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + + /* and mark it done */ + workitem->avw_active = false; + workitem->avw_used = false; + } + LWLockRelease(AutovacuumLock); + + /* + * We leak table_toast_map here (among other things), but since we're + * going away soon, it's not a problem. + */ + + /* + * Update pg_database.datfrozenxid, and truncate pg_xact if possible. We + * only need to do this once, not after each table. + * + * Even if we didn't vacuum anything, it may still be important to do + * this, because one indirect effect of vac_update_datfrozenxid() is to + * update ShmemVariableCache->xidVacLimit. That might need to be done + * even if we haven't vacuumed anything, because relations with older + * relfrozenxid values or other databases with older datfrozenxid values + * might have been dropped, allowing xidVacLimit to advance. + * + * However, it's also important not to do this blindly in all cases, + * because when autovacuum=off this will restart the autovacuum launcher. + * If we're not careful, an infinite loop can result, where workers find + * no work to do and restart the launcher, which starts another worker in + * the same database that finds no work to do. 
To prevent that, we skip + * this if (1) we found no work to do and (2) we skipped at least one + * table due to concurrent autovacuum activity. In that case, the other + * worker has already done it, or will do so when it finishes. + */ + if (did_vacuum || !found_concurrent_worker) + vac_update_datfrozenxid(); + + /* Finally close out the last transaction. */ + CommitTransactionCommand(); +} + +/* + * Execute a previously registered work item. + */ +static void +perform_work_item(AutoVacuumWorkItem *workitem) +{ + char *cur_datname = NULL; + char *cur_nspname = NULL; + char *cur_relname = NULL; + + /* + * Note we do not store table info in MyWorkerInfo, since this is not + * vacuuming proper. + */ + + /* + * Save the relation name for a possible error message, to avoid a catalog + * lookup in case of an error. If any of these return NULL, then the + * relation has been dropped since last we checked; skip it. + */ + Assert(CurrentMemoryContext == AutovacMemCxt); + + cur_relname = get_rel_name(workitem->avw_relation); + cur_nspname = get_namespace_name(get_rel_namespace(workitem->avw_relation)); + cur_datname = get_database_name(MyDatabaseId); + if (!cur_relname || !cur_nspname || !cur_datname) + goto deleted2; + + autovac_report_workitem(workitem, cur_nspname, cur_relname); + + /* clean up memory before each work item */ + MemoryContextResetAndDeleteChildren(PortalContext); + + /* + * We will abort the current work item if something errors out, and + * continue with the next one; in particular, this happens if we are + * interrupted with SIGINT. Note that this means that the work item list + * can be lossy. + */ + PG_TRY(); + { + /* Use PortalContext for any per-work-item allocations */ + MemoryContextSwitchTo(PortalContext); + + /* + * Have at it. Functions called here are responsible for any required + * user switch and sandbox. 
+ */ + switch (workitem->avw_type) + { + case AVW_BRINSummarizeRange: + DirectFunctionCall2(brin_summarize_range, + ObjectIdGetDatum(workitem->avw_relation), + Int64GetDatum((int64) workitem->avw_blockNumber)); + break; + default: + elog(WARNING, "unrecognized work item found: type %d", + workitem->avw_type); + break; + } + + /* + * Clear a possible query-cancel signal, to avoid a late reaction to + * an automatically-sent signal because of processing the current work + * item (we're done with it, so it would make no sense to cancel at this + * point.) + */ + QueryCancelPending = false; + } + PG_CATCH(); + { + /* + * Abort the transaction, start a new one, and proceed with the next + * work item. + */ + HOLD_INTERRUPTS(); + errcontext("processing work entry for relation \"%s.%s.%s\"", + cur_datname, cur_nspname, cur_relname); + EmitErrorReport(); + + /* this resets ProcGlobal->statusFlags[i] too */ + AbortOutOfAnyTransaction(); + FlushErrorState(); + MemoryContextResetAndDeleteChildren(PortalContext); + + /* restart our transaction for the following operations */ + StartTransactionCommand(); + RESUME_INTERRUPTS(); + } + PG_END_TRY(); + + /* Make sure we're back in AutovacMemCxt */ + MemoryContextSwitchTo(AutovacMemCxt); + + /* We intentionally do not set did_vacuum here */ + + /* be tidy */ +deleted2: + if (cur_datname) + pfree(cur_datname); + if (cur_nspname) + pfree(cur_nspname); + if (cur_relname) + pfree(cur_relname); +} + +/* + * extract_autovac_opts + * + * Given a relation's pg_class tuple, return the AutoVacOpts portion of + * reloptions, if set; otherwise, return NULL. + * + * Note: callers do not have a relation lock on the table at this point, + * so the table could have been dropped, and its catalog rows gone, after + * we acquired the pg_class row. If pg_class had a TOAST table, this would + * be a risk; fortunately, it doesn't.
+ */ +static AutoVacOpts * +extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc) +{ + bytea *relopts; + AutoVacOpts *av; + + Assert(((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_RELATION || + ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_MATVIEW || + ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_TOASTVALUE); + + relopts = extractRelOptions(tup, pg_class_desc, NULL); + if (relopts == NULL) + return NULL; + + av = palloc(sizeof(AutoVacOpts)); + memcpy(av, &(((StdRdOptions *) relopts)->autovacuum), sizeof(AutoVacOpts)); + pfree(relopts); + + return av; +} + + +/* + * table_recheck_autovac + * + * Recheck whether a table still needs vacuum or analyze. Return value is a + * valid autovac_table pointer if it does, NULL otherwise. + * + * Note that the returned autovac_table does not have the name fields set. + */ +static autovac_table * +table_recheck_autovac(Oid relid, HTAB *table_toast_map, + TupleDesc pg_class_desc, + int effective_multixact_freeze_max_age) +{ + Form_pg_class classForm; + HeapTuple classTup; + bool dovacuum; + bool doanalyze; + autovac_table *tab = NULL; + bool wraparound; + AutoVacOpts *avopts; + + /* fetch a copy of the relation's pg_class tuple from the syscache */ + classTup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(classTup)) + return NULL; + classForm = (Form_pg_class) GETSTRUCT(classTup); + + /* + * Get the applicable reloptions. If it is a TOAST table, try to get the + * main table reloptions if the toast table itself doesn't have any.
+ */ + avopts = extract_autovac_opts(classTup, pg_class_desc); + if (classForm->relkind == RELKIND_TOASTVALUE && + avopts == NULL && table_toast_map != NULL) + { + av_relation *hentry; + bool found; + + hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found); + if (found && hentry->ar_hasrelopts) + avopts = &hentry->ar_reloptions; + } + + recheck_relation_needs_vacanalyze(relid, avopts, classForm, + effective_multixact_freeze_max_age, + &dovacuum, &doanalyze, &wraparound); + + /* OK, it needs something done */ + if (doanalyze || dovacuum) + { + int freeze_min_age; + int freeze_table_age; + int multixact_freeze_min_age; + int multixact_freeze_table_age; + int log_min_duration; + + /* + * Calculate the vacuum cost parameters and the freeze ages. If there + * are options set in pg_class.reloptions, use them; in the case of a + * toast table, try the main table too. Otherwise use the GUC + * defaults, autovacuum's own first and plain vacuum second. + */ + + /* -1 in autovac setting means use log_autovacuum_min_duration */ + log_min_duration = (avopts && avopts->log_min_duration >= 0) + ? avopts->log_min_duration + : Log_autovacuum_min_duration; + + /* these do not have autovacuum-specific settings */ + freeze_min_age = (avopts && avopts->freeze_min_age >= 0) + ? avopts->freeze_min_age + : default_freeze_min_age; + + freeze_table_age = (avopts && avopts->freeze_table_age >= 0) + ? avopts->freeze_table_age + : default_freeze_table_age; + + multixact_freeze_min_age = (avopts && + avopts->multixact_freeze_min_age >= 0) + ? avopts->multixact_freeze_min_age + : default_multixact_freeze_min_age; + + multixact_freeze_table_age = (avopts && + avopts->multixact_freeze_table_age >= 0) + ? avopts->multixact_freeze_table_age + : default_multixact_freeze_table_age; + + tab = palloc(sizeof(autovac_table)); + tab->at_relid = relid; + tab->at_sharedrel = classForm->relisshared; + + /* + * Select VACUUM options. 
Note we don't say VACOPT_PROCESS_TOAST, so + * that vacuum() skips toast relations. Also note we tell vacuum() to + * skip vac_update_datfrozenxid(); we'll do that separately. + */ + tab->at_params.options = + (dovacuum ? (VACOPT_VACUUM | + VACOPT_PROCESS_MAIN | + VACOPT_SKIP_DATABASE_STATS) : 0) | + (doanalyze ? VACOPT_ANALYZE : 0) | + (!wraparound ? VACOPT_SKIP_LOCKED : 0); + + /* + * index_cleanup and truncate are unspecified at first in autovacuum. + * They will be filled in with usable values using their reloptions + * (or reloption defaults) later. + */ + tab->at_params.index_cleanup = VACOPTVALUE_UNSPECIFIED; + tab->at_params.truncate = VACOPTVALUE_UNSPECIFIED; + /* As of now, we don't support parallel vacuum for autovacuum */ + tab->at_params.nworkers = -1; + tab->at_params.freeze_min_age = freeze_min_age; + tab->at_params.freeze_table_age = freeze_table_age; + tab->at_params.multixact_freeze_min_age = multixact_freeze_min_age; + tab->at_params.multixact_freeze_table_age = multixact_freeze_table_age; + tab->at_params.is_wraparound = wraparound; + tab->at_params.log_min_duration = log_min_duration; + tab->at_storage_param_vac_cost_limit = avopts ? + avopts->vacuum_cost_limit : 0; + tab->at_storage_param_vac_cost_delay = avopts ? + avopts->vacuum_cost_delay : -1; + tab->at_relname = NULL; + tab->at_nspname = NULL; + tab->at_datname = NULL; + + /* + * If any of the cost delay parameters has been set individually for + * this table, disable the balancing algorithm. + */ + tab->at_dobalance = + !(avopts && (avopts->vacuum_cost_limit > 0 || + avopts->vacuum_cost_delay >= 0)); + } + + heap_freetuple(classTup); + return tab; +} + +/* + * recheck_relation_needs_vacanalyze + * + * Subroutine for table_recheck_autovac. + * + * Fetch the pgstat of a relation and recheck whether a relation + * needs to be vacuumed or analyzed. 
+ */ +static void +recheck_relation_needs_vacanalyze(Oid relid, + AutoVacOpts *avopts, + Form_pg_class classForm, + int effective_multixact_freeze_max_age, + bool *dovacuum, + bool *doanalyze, + bool *wraparound) +{ + PgStat_StatTabEntry *tabentry; + + /* fetch the pgstat table entry */ + tabentry = pgstat_fetch_stat_tabentry_ext(classForm->relisshared, + relid); + + relation_needs_vacanalyze(relid, avopts, classForm, tabentry, + effective_multixact_freeze_max_age, + dovacuum, doanalyze, wraparound); + + /* ignore ANALYZE for toast tables */ + if (classForm->relkind == RELKIND_TOASTVALUE) + *doanalyze = false; +} + +/* + * relation_needs_vacanalyze + * + * Check whether a relation needs to be vacuumed or analyzed; return each into + * "dovacuum" and "doanalyze", respectively. Also return whether the vacuum is + * being forced because of Xid or multixact wraparound. + * + * relopts is a pointer to the AutoVacOpts options (either for itself in the + * case of a plain table, or for either itself or its parent table in the case + * of a TOAST table), NULL if none; tabentry is the pgstats entry, which can be + * NULL. + * + * A table needs to be vacuumed if the number of dead tuples exceeds a + * threshold. This threshold is calculated as + * + * threshold = vac_base_thresh + vac_scale_factor * reltuples + * + * For analyze, the analysis done is that the number of tuples inserted, + * deleted and updated since the last analyze exceeds a threshold calculated + * in the same fashion as above. Note that the cumulative stats system stores + * the number of tuples (both live and dead) that there were as of the last + * analyze. This is asymmetric to the VACUUM case. + * + * We also force vacuum if the table's relfrozenxid is more than freeze_max_age + * transactions back, and if its relminmxid is more than + * multixact_freeze_max_age multixacts back. 
+ * + * A table whose autovacuum_enabled option is false is + * automatically skipped (unless we have to vacuum it due to freeze_max_age). + * Thus autovacuum can be disabled for specific tables. Also, when the cumulative + * stats system does not have data about a table, it will be skipped. + * + * A table whose vac_base_thresh value is < 0 takes the base value from the + * autovacuum_vacuum_threshold GUC variable. Similarly, a vac_scale_factor + * value < 0 is substituted with the value of + * autovacuum_vacuum_scale_factor GUC variable. Ditto for analyze. + */ +static void +relation_needs_vacanalyze(Oid relid, + AutoVacOpts *relopts, + Form_pg_class classForm, + PgStat_StatTabEntry *tabentry, + int effective_multixact_freeze_max_age, + /* output params below */ + bool *dovacuum, + bool *doanalyze, + bool *wraparound) +{ + bool force_vacuum; + bool av_enabled; + float4 reltuples; /* pg_class.reltuples */ + + /* constants from reloptions or GUC variables */ + int vac_base_thresh, + vac_ins_base_thresh, + anl_base_thresh; + float4 vac_scale_factor, + vac_ins_scale_factor, + anl_scale_factor; + + /* thresholds calculated from above constants */ + float4 vacthresh, + vacinsthresh, + anlthresh; + + /* number of vacuum (resp. analyze) tuples at this time */ + float4 vactuples, + instuples, + anltuples; + + /* freeze parameters */ + int freeze_max_age; + int multixact_freeze_max_age; + TransactionId xidForceLimit; + MultiXactId multiForceLimit; + + Assert(classForm != NULL); + Assert(OidIsValid(relid)); + + /* + * Determine vacuum/analyze equation parameters. We have two possible + * sources: the passed reloptions (which could be a main table or a toast + * table), or the autovacuum GUC variables. + */ + + /* -1 in autovac setting means use plain vacuum_scale_factor */ + vac_scale_factor = (relopts && relopts->vacuum_scale_factor >= 0) + ? relopts->vacuum_scale_factor + : autovacuum_vac_scale; + + vac_base_thresh = (relopts && relopts->vacuum_threshold >= 0) + ? 
relopts->vacuum_threshold + : autovacuum_vac_thresh; + + vac_ins_scale_factor = (relopts && relopts->vacuum_ins_scale_factor >= 0) + ? relopts->vacuum_ins_scale_factor + : autovacuum_vac_ins_scale; + + /* -1 is used to disable insert vacuums */ + vac_ins_base_thresh = (relopts && relopts->vacuum_ins_threshold >= -1) + ? relopts->vacuum_ins_threshold + : autovacuum_vac_ins_thresh; + + anl_scale_factor = (relopts && relopts->analyze_scale_factor >= 0) + ? relopts->analyze_scale_factor + : autovacuum_anl_scale; + + anl_base_thresh = (relopts && relopts->analyze_threshold >= 0) + ? relopts->analyze_threshold + : autovacuum_anl_thresh; + + freeze_max_age = (relopts && relopts->freeze_max_age >= 0) + ? Min(relopts->freeze_max_age, autovacuum_freeze_max_age) + : autovacuum_freeze_max_age; + + multixact_freeze_max_age = (relopts && relopts->multixact_freeze_max_age >= 0) + ? Min(relopts->multixact_freeze_max_age, effective_multixact_freeze_max_age) + : effective_multixact_freeze_max_age; + + av_enabled = (relopts ? relopts->enabled : true); + + /* Force vacuum if table is at risk of wraparound */ + xidForceLimit = recentXid - freeze_max_age; + if (xidForceLimit < FirstNormalTransactionId) + xidForceLimit -= FirstNormalTransactionId; + force_vacuum = (TransactionIdIsNormal(classForm->relfrozenxid) && + TransactionIdPrecedes(classForm->relfrozenxid, + xidForceLimit)); + if (!force_vacuum) + { + multiForceLimit = recentMulti - multixact_freeze_max_age; + if (multiForceLimit < FirstMultiXactId) + multiForceLimit -= FirstMultiXactId; + force_vacuum = MultiXactIdIsValid(classForm->relminmxid) && + MultiXactIdPrecedes(classForm->relminmxid, multiForceLimit); + } + *wraparound = force_vacuum; + + /* User disabled it in pg_class.reloptions? 
(But ignore if at risk) */ + if (!av_enabled && !force_vacuum) + { + *doanalyze = false; + *dovacuum = false; + return; + } + + /* + * If we found stats for the table, and autovacuum is currently enabled, + * make a threshold-based decision whether to vacuum and/or analyze. If + * autovacuum is currently disabled, we must be here for anti-wraparound + * vacuuming only, so don't vacuum (or analyze) anything that's not being + * forced. + */ + if (PointerIsValid(tabentry) && AutoVacuumingActive()) + { + reltuples = classForm->reltuples; + vactuples = tabentry->dead_tuples; + instuples = tabentry->ins_since_vacuum; + anltuples = tabentry->mod_since_analyze; + + /* If the table hasn't yet been vacuumed, take reltuples as zero */ + if (reltuples < 0) + reltuples = 0; + + vacthresh = (float4) vac_base_thresh + vac_scale_factor * reltuples; + vacinsthresh = (float4) vac_ins_base_thresh + vac_ins_scale_factor * reltuples; + anlthresh = (float4) anl_base_thresh + anl_scale_factor * reltuples; + + /* + * Note that we don't need to take special consideration for stat + * reset, because if that happens, the last vacuum and analyze counts + * will be reset too. + */ + if (vac_ins_base_thresh >= 0) + elog(DEBUG3, "%s: vac: %.0f (threshold %.0f), ins: %.0f (threshold %.0f), anl: %.0f (threshold %.0f)", + NameStr(classForm->relname), + vactuples, vacthresh, instuples, vacinsthresh, anltuples, anlthresh); + else + elog(DEBUG3, "%s: vac: %.0f (threshold %.0f), ins: (disabled), anl: %.0f (threshold %.0f)", + NameStr(classForm->relname), + vactuples, vacthresh, anltuples, anlthresh); + + /* Determine if this table needs vacuum or analyze. */ + *dovacuum = force_vacuum || (vactuples > vacthresh) || + (vac_ins_base_thresh >= 0 && instuples > vacinsthresh); + *doanalyze = (anltuples > anlthresh); + } + else + { + /* + * Skip a table not found in stat hash, unless we have to force vacuum + * for anti-wrap purposes. If it's not acted upon, there's no need to + * vacuum it. 
+ */ + *dovacuum = force_vacuum; + *doanalyze = false; + } + + /* ANALYZE refuses to work with pg_statistic */ + if (relid == StatisticRelationId) + *doanalyze = false; +} + +/* + * autovacuum_do_vac_analyze + * Vacuum and/or analyze the specified table + * + * We expect the caller to have switched into a memory context that won't + * disappear at transaction commit. + */ +static void +autovacuum_do_vac_analyze(autovac_table *tab, BufferAccessStrategy bstrategy) +{ + RangeVar *rangevar; + VacuumRelation *rel; + List *rel_list; + MemoryContext vac_context; + + /* Let pgstat know what we're doing */ + autovac_report_activity(tab); + + /* Set up one VacuumRelation target, identified by OID, for vacuum() */ + rangevar = makeRangeVar(tab->at_nspname, tab->at_relname, -1); + rel = makeVacuumRelation(rangevar, tab->at_relid, NIL); + rel_list = list_make1(rel); + + vac_context = AllocSetContextCreate(CurrentMemoryContext, + "Vacuum", + ALLOCSET_DEFAULT_SIZES); + + vacuum(rel_list, &tab->at_params, bstrategy, vac_context, true); + + MemoryContextDelete(vac_context); +} + +/* + * autovac_report_activity + * Report to pgstat what autovacuum is doing + * + * We send a SQL string corresponding to what the user would see if the + * equivalent command was to be issued manually. + * + * Note we assume that we are going to report the next command as soon as we're + * done with the current one, and exit right after the last one, so we don't + * bother to report "<IDLE>" or some such. + */ +static void +autovac_report_activity(autovac_table *tab) +{ +#define MAX_AUTOVAC_ACTIV_LEN (NAMEDATALEN * 2 + 56) + char activity[MAX_AUTOVAC_ACTIV_LEN]; + int len; + + /* Report the command and possible options */ + if (tab->at_params.options & VACOPT_VACUUM) + snprintf(activity, MAX_AUTOVAC_ACTIV_LEN, + "autovacuum: VACUUM%s", + tab->at_params.options & VACOPT_ANALYZE ? 
" ANALYZE" : ""); + else + snprintf(activity, MAX_AUTOVAC_ACTIV_LEN, + "autovacuum: ANALYZE"); + + /* + * Report the qualified name of the relation. + */ + len = strlen(activity); + + snprintf(activity + len, MAX_AUTOVAC_ACTIV_LEN - len, + " %s.%s%s", tab->at_nspname, tab->at_relname, + tab->at_params.is_wraparound ? " (to prevent wraparound)" : ""); + + /* Set statement_timestamp() to current time for pg_stat_activity */ + SetCurrentStatementStartTimestamp(); + + pgstat_report_activity(STATE_RUNNING, activity); +} + +/* + * autovac_report_workitem + * Report to pgstat that autovacuum is processing a work item + */ +static void +autovac_report_workitem(AutoVacuumWorkItem *workitem, + const char *nspname, const char *relname) +{ + char activity[MAX_AUTOVAC_ACTIV_LEN + 12 + 2]; + char blk[12 + 2]; + int len; + + switch (workitem->avw_type) + { + case AVW_BRINSummarizeRange: + snprintf(activity, MAX_AUTOVAC_ACTIV_LEN, + "autovacuum: BRIN summarize"); + break; + } + + /* + * Report the qualified name of the relation, and the block number if any + */ + len = strlen(activity); + + if (BlockNumberIsValid(workitem->avw_blockNumber)) + snprintf(blk, sizeof(blk), " %u", workitem->avw_blockNumber); + else + blk[0] = '\0'; + + snprintf(activity + len, MAX_AUTOVAC_ACTIV_LEN - len, + " %s.%s%s", nspname, relname, blk); + + /* Set statement_timestamp() to current time for pg_stat_activity */ + SetCurrentStatementStartTimestamp(); + + pgstat_report_activity(STATE_RUNNING, activity); +} + +/* + * AutoVacuumingActive + * Check GUC vars and report whether the autovacuum process should be + * running. + */ +bool +AutoVacuumingActive(void) +{ + if (!autovacuum_start_daemon || !pgstat_track_counts) + return false; + return true; +} + +/* + * Request one work item to the next autovacuum run processing our database. + * Return false if the request can't be recorded. 
+ */ +bool +AutoVacuumRequestWork(AutoVacuumWorkItemType type, Oid relationId, + BlockNumber blkno) +{ + int i; + bool result = false; + + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + + /* + * Locate an unused work item and fill it with the given data. + */ + for (i = 0; i < NUM_WORKITEMS; i++) + { + AutoVacuumWorkItem *workitem = &AutoVacuumShmem->av_workItems[i]; + + if (workitem->avw_used) + continue; + + workitem->avw_used = true; + workitem->avw_active = false; + workitem->avw_type = type; + workitem->avw_database = MyDatabaseId; + workitem->avw_relation = relationId; + workitem->avw_blockNumber = blkno; + result = true; + + /* done */ + break; + } + + LWLockRelease(AutovacuumLock); + + return result; +} + +/* + * autovac_init + * This is called at postmaster initialization. + * + * All we do here is annoy the user if he got it wrong. + */ +void +autovac_init(void) +{ + if (autovacuum_start_daemon && !pgstat_track_counts) + ereport(WARNING, + (errmsg("autovacuum not started because of misconfiguration"), + errhint("Enable the \"track_counts\" option."))); +} + +/* + * IsAutoVacuum functions + * Return whether this is either a launcher autovacuum process or a worker + * process. + */ +bool +IsAutoVacuumLauncherProcess(void) +{ + return am_autovacuum_launcher; +} + +bool +IsAutoVacuumWorkerProcess(void) +{ + return am_autovacuum_worker; +} + + +/* + * AutoVacuumShmemSize + * Compute space needed for autovacuum-related shared memory + */ +Size +AutoVacuumShmemSize(void) +{ + Size size; + + /* + * Need the fixed struct and the array of WorkerInfoData. 
+ */ + size = sizeof(AutoVacuumShmemStruct); + size = MAXALIGN(size); + size = add_size(size, mul_size(autovacuum_max_workers, + sizeof(WorkerInfoData))); + return size; +} + +/* + * AutoVacuumShmemInit + * Allocate and initialize autovacuum-related shared memory + */ +void +AutoVacuumShmemInit(void) +{ + bool found; + + AutoVacuumShmem = (AutoVacuumShmemStruct *) + ShmemInitStruct("AutoVacuum Data", + AutoVacuumShmemSize(), + &found); + + if (!IsUnderPostmaster) + { + WorkerInfo worker; + int i; + + Assert(!found); + + AutoVacuumShmem->av_launcherpid = 0; + dlist_init(&AutoVacuumShmem->av_freeWorkers); + dlist_init(&AutoVacuumShmem->av_runningWorkers); + AutoVacuumShmem->av_startingWorker = NULL; + memset(AutoVacuumShmem->av_workItems, 0, + sizeof(AutoVacuumWorkItem) * NUM_WORKITEMS); + + worker = (WorkerInfo) ((char *) AutoVacuumShmem + + MAXALIGN(sizeof(AutoVacuumShmemStruct))); + + /* initialize the WorkerInfo free list */ + for (i = 0; i < autovacuum_max_workers; i++) + { + dlist_push_head(&AutoVacuumShmem->av_freeWorkers, + &worker[i].wi_links); + pg_atomic_init_flag(&worker[i].wi_dobalance); + } + + pg_atomic_init_u32(&AutoVacuumShmem->av_nworkersForBalance, 0); + + } + else + Assert(found); +} + +/* + * GUC check_hook for autovacuum_work_mem + */ +bool +check_autovacuum_work_mem(int *newval, void **extra, GucSource source) +{ + /* + * -1 indicates fallback. + * + * If we haven't yet changed the boot_val default of -1, just let it be. + * Autovacuum will look to maintenance_work_mem instead. + */ + if (*newval == -1) + return true; + + /* + * We clamp manually-set values to at least 1MB. Since + * maintenance_work_mem is always set to at least this value, do the same + * here. 
+ */ + if (*newval < 1024) + *newval = 1024; + + return true; +} diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c new file mode 100644 index 0000000..cae6feb --- /dev/null +++ b/src/backend/postmaster/auxprocess.c @@ -0,0 +1,183 @@ +/*------------------------------------------------------------------------- + * auxprocess.c + * functions related to auxiliary processes. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/postmaster/auxprocess.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <unistd.h> +#include <signal.h> + +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/auxprocess.h" +#include "postmaster/bgwriter.h" +#include "postmaster/startup.h" +#include "postmaster/walwriter.h" +#include "replication/walreceiver.h" +#include "storage/bufmgr.h" +#include "storage/bufpage.h" +#include "storage/condition_variable.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "tcop/tcopprot.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/rel.h" + + +static void ShutdownAuxiliaryProcess(int code, Datum arg); + + +/* ---------------- + * global variables + * ---------------- + */ + +AuxProcType MyAuxProcType = NotAnAuxProcess; /* declared in miscadmin.h */ + + +/* + * AuxiliaryProcessMain + * + * The main entry point for auxiliary processes, such as the bgwriter, + * walwriter, walreceiver, bootstrapper and the shared memory checker code. + * + * This code is here just because of historical reasons. 
+ */ +void +AuxiliaryProcessMain(AuxProcType auxtype) +{ + Assert(IsUnderPostmaster); + + MyAuxProcType = auxtype; + + switch (MyAuxProcType) + { + case StartupProcess: + MyBackendType = B_STARTUP; + break; + case ArchiverProcess: + MyBackendType = B_ARCHIVER; + break; + case BgWriterProcess: + MyBackendType = B_BG_WRITER; + break; + case CheckpointerProcess: + MyBackendType = B_CHECKPOINTER; + break; + case WalWriterProcess: + MyBackendType = B_WAL_WRITER; + break; + case WalReceiverProcess: + MyBackendType = B_WAL_RECEIVER; + break; + default: + elog(PANIC, "unrecognized process type: %d", (int) MyAuxProcType); + MyBackendType = B_INVALID; + } + + init_ps_display(NULL); + + SetProcessingMode(BootstrapProcessing); + IgnoreSystemIndexes = true; + + /* + * As an auxiliary process, we aren't going to do the full InitPostgres + * pushups, but there are a couple of things that need to get lit up even + * in an auxiliary process. + */ + + /* + * Create a PGPROC so we can use LWLocks. In the EXEC_BACKEND case, this + * was already done by SubPostmasterMain(). + */ +#ifndef EXEC_BACKEND + InitAuxiliaryProcess(); +#endif + + BaseInit(); + + /* + * Assign the ProcSignalSlot for an auxiliary process. Since it doesn't + * have a BackendId, the slot is statically allocated based on the + * auxiliary process type (MyAuxProcType). Backends use slots indexed in + * the range from 1 to MaxBackends (inclusive), so we use MaxBackends + + * AuxProcType + 1 as the index of the slot for an auxiliary process. + * + * This will need rethinking if we ever want more than one of a particular + * auxiliary process type. + */ + ProcSignalInit(MaxBackends + MyAuxProcType + 1); + + /* + * Auxiliary processes don't run transactions, but they may need a + * resource owner anyway to manage buffer pins acquired outside + * transactions (and, perhaps, other things in future). 
+ */ + CreateAuxProcessResourceOwner(); + + + /* Initialize backend status information */ + pgstat_beinit(); + pgstat_bestart(); + + /* register a before-shutdown callback for LWLock cleanup */ + before_shmem_exit(ShutdownAuxiliaryProcess, 0); + + SetProcessingMode(NormalProcessing); + + switch (MyAuxProcType) + { + case StartupProcess: + StartupProcessMain(); + proc_exit(1); + + case ArchiverProcess: + PgArchiverMain(); + proc_exit(1); + + case BgWriterProcess: + BackgroundWriterMain(); + proc_exit(1); + + case CheckpointerProcess: + CheckpointerMain(); + proc_exit(1); + + case WalWriterProcess: + WalWriterMain(); + proc_exit(1); + + case WalReceiverProcess: + WalReceiverMain(); + proc_exit(1); + + default: + elog(PANIC, "unrecognized process type: %d", (int) MyAuxProcType); + proc_exit(1); + } +} + +/* + * Begin shutdown of an auxiliary process. This is approximately the equivalent + * of ShutdownPostgres() in postinit.c. We can't run transactions in an + * auxiliary process, so most of the work of AbortTransaction() is not needed, + * but we do need to make sure we've released any LWLocks we are holding. + * (This is only critical during an error exit.) 
+ */ +static void +ShutdownAuxiliaryProcess(int code, Datum arg) +{ + LWLockReleaseAll(); + ConditionVariableCancelSleep(); + pgstat_report_wait_end(); +} diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c new file mode 100644 index 0000000..0dd22b2 --- /dev/null +++ b/src/backend/postmaster/bgworker.c @@ -0,0 +1,1311 @@ +/*-------------------------------------------------------------------- + * bgworker.c + * POSTGRES pluggable background workers implementation + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/postmaster/bgworker.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/parallel.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "port/atomics.h" +#include "postmaster/bgworker_internals.h" +#include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" +#include "replication/logicallauncher.h" +#include "replication/logicalworker.h" +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/lwlock.h" +#include "storage/pg_shmem.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" +#include "tcop/tcopprot.h" +#include "utils/ascii.h" +#include "utils/ps_status.h" +#include "utils/timeout.h" + +/* + * The postmaster's list of registered background workers, in private memory. + */ +slist_head BackgroundWorkerList = SLIST_STATIC_INIT(BackgroundWorkerList); + +/* + * BackgroundWorkerSlots exist in shared memory and can be accessed (via + * the BackgroundWorkerArray) by both the postmaster and by regular backends. + * However, the postmaster cannot take locks, even spinlocks, because this + * might allow it to crash or become wedged if shared memory gets corrupted. + * Such an outcome is intolerable. 
Therefore, we need a lockless protocol + * for coordinating access to this data. + * + * The 'in_use' flag is used to hand off responsibility for the slot between + * the postmaster and the rest of the system. When 'in_use' is false, + * the postmaster will ignore the slot entirely, except for the 'in_use' flag + * itself, which it may read. In this state, regular backends may modify the + * slot. Once a backend sets 'in_use' to true, the slot becomes the + * responsibility of the postmaster. Regular backends may no longer modify it, + * but the postmaster may examine it. Thus, a backend initializing a slot + * must fully initialize the slot - and insert a write memory barrier - before + * marking it as in use. + * + * As an exception, however, even when the slot is in use, regular backends + * may set the 'terminate' flag for a slot, telling the postmaster not + * to restart it. Once the background worker is no longer running, the slot + * will be released for reuse. + * + * In addition to coordinating with the postmaster, backends modifying this + * data structure must coordinate with each other. Since they can take locks, + * this is straightforward: any backend wishing to manipulate a slot must + * take BackgroundWorkerLock in exclusive mode. Backends wishing to read + * data that might get concurrently modified by other backends should take + * this lock in shared mode. No matter what, backends reading this data + * structure must be able to tolerate concurrent modifications by the + * postmaster. + */ +typedef struct BackgroundWorkerSlot +{ + bool in_use; + bool terminate; + pid_t pid; /* InvalidPid = not started yet; 0 = dead */ + uint64 generation; /* incremented when slot is recycled */ + BackgroundWorker worker; +} BackgroundWorkerSlot; + +/* + * In order to limit the total number of parallel workers (according to + * max_parallel_workers GUC), we maintain the number of active parallel + * workers. 
Since the postmaster cannot take locks, two variables are used for + * this purpose: the number of registered parallel workers (modified by the + * backends, protected by BackgroundWorkerLock) and the number of terminated + * parallel workers (modified only by the postmaster, lockless). The active + * number of parallel workers is the number of registered workers minus the + * terminated ones. These counters can of course overflow, but it's not + * important here since the subtraction will still give the right number. + */ +typedef struct BackgroundWorkerArray +{ + int total_slots; + uint32 parallel_register_count; + uint32 parallel_terminate_count; + BackgroundWorkerSlot slot[FLEXIBLE_ARRAY_MEMBER]; +} BackgroundWorkerArray; + +struct BackgroundWorkerHandle +{ + int slot; + uint64 generation; +}; + +static BackgroundWorkerArray *BackgroundWorkerData; + +/* + * List of internal background worker entry points. We need this for + * reasons explained in LookupBackgroundWorkerFunction(), below. + */ +static const struct +{ + const char *fn_name; + bgworker_main_type fn_addr; +} InternalBGWorkers[] = + +{ + { + "ParallelWorkerMain", ParallelWorkerMain + }, + { + "ApplyLauncherMain", ApplyLauncherMain + }, + { + "ApplyWorkerMain", ApplyWorkerMain + }, + { + "ParallelApplyWorkerMain", ParallelApplyWorkerMain + } +}; + +/* Private functions. */ +static bgworker_main_type LookupBackgroundWorkerFunction(const char *libraryname, const char *funcname); + + +/* + * Calculate shared memory needed. + */ +Size +BackgroundWorkerShmemSize(void) +{ + Size size; + + /* Array of workers is variably sized. */ + size = offsetof(BackgroundWorkerArray, slot); + size = add_size(size, mul_size(max_worker_processes, + sizeof(BackgroundWorkerSlot))); + + return size; +} + +/* + * Initialize shared memory. 
+ */ +void +BackgroundWorkerShmemInit(void) +{ + bool found; + + BackgroundWorkerData = ShmemInitStruct("Background Worker Data", + BackgroundWorkerShmemSize(), + &found); + if (!IsUnderPostmaster) + { + slist_iter siter; + int slotno = 0; + + BackgroundWorkerData->total_slots = max_worker_processes; + BackgroundWorkerData->parallel_register_count = 0; + BackgroundWorkerData->parallel_terminate_count = 0; + + /* + * Copy contents of worker list into shared memory. Record the shared + * memory slot assigned to each worker. This ensures a 1-to-1 + * correspondence between the postmaster's private list and the array + * in shared memory. + */ + slist_foreach(siter, &BackgroundWorkerList) + { + BackgroundWorkerSlot *slot = &BackgroundWorkerData->slot[slotno]; + RegisteredBgWorker *rw; + + rw = slist_container(RegisteredBgWorker, rw_lnode, siter.cur); + Assert(slotno < max_worker_processes); + slot->in_use = true; + slot->terminate = false; + slot->pid = InvalidPid; + slot->generation = 0; + rw->rw_shmem_slot = slotno; + rw->rw_worker.bgw_notify_pid = 0; /* might be reinit after crash */ + memcpy(&slot->worker, &rw->rw_worker, sizeof(BackgroundWorker)); + ++slotno; + } + + /* + * Mark any remaining slots as not in use. + */ + while (slotno < max_worker_processes) + { + BackgroundWorkerSlot *slot = &BackgroundWorkerData->slot[slotno]; + + slot->in_use = false; + ++slotno; + } + } + else + Assert(found); +} + +/* + * Search the postmaster's backend-private list of RegisteredBgWorker objects + * for the one that maps to the given slot number. + */ +static RegisteredBgWorker * +FindRegisteredWorkerBySlotNumber(int slotno) +{ + slist_iter siter; + + slist_foreach(siter, &BackgroundWorkerList) + { + RegisteredBgWorker *rw; + + rw = slist_container(RegisteredBgWorker, rw_lnode, siter.cur); + if (rw->rw_shmem_slot == slotno) + return rw; + } + + return NULL; +} + +/* + * Notice changes to shared memory made by other backends. 
+ * Accept new worker requests only if allow_new_workers is true. + * + * This code runs in the postmaster, so we must be very careful not to assume + * that shared memory contents are sane. Otherwise, a rogue backend could + * take out the postmaster. + */ +void +BackgroundWorkerStateChange(bool allow_new_workers) +{ + int slotno; + + /* + * The total number of slots stored in shared memory should match our + * notion of max_worker_processes. If it does not, something is very + * wrong. Further down, we always refer to this value as + * max_worker_processes, in case shared memory gets corrupted while we're + * looping. + */ + if (max_worker_processes != BackgroundWorkerData->total_slots) + { + ereport(LOG, + (errmsg("inconsistent background worker state (max_worker_processes=%d, total_slots=%d)", + max_worker_processes, + BackgroundWorkerData->total_slots))); + return; + } + + /* + * Iterate through slots, looking for newly-registered workers or workers + * who must die. + */ + for (slotno = 0; slotno < max_worker_processes; ++slotno) + { + BackgroundWorkerSlot *slot = &BackgroundWorkerData->slot[slotno]; + RegisteredBgWorker *rw; + + if (!slot->in_use) + continue; + + /* + * Make sure we don't see the in_use flag before the updated slot + * contents. + */ + pg_read_barrier(); + + /* See whether we already know about this worker. */ + rw = FindRegisteredWorkerBySlotNumber(slotno); + if (rw != NULL) + { + /* + * In general, the worker data can't change after it's initially + * registered. However, someone can set the terminate flag. + */ + if (slot->terminate && !rw->rw_terminate) + { + rw->rw_terminate = true; + if (rw->rw_pid != 0) + kill(rw->rw_pid, SIGTERM); + else + { + /* Report never-started, now-terminated worker as dead. */ + ReportBackgroundWorkerPID(rw); + } + } + continue; + } + + /* + * If we aren't allowing new workers, then immediately mark it for + * termination; the next stanza will take care of cleaning it up. 
+ * Doing this ensures that any process waiting for the worker will get + * awoken, even though the worker will never be allowed to run. + */ + if (!allow_new_workers) + slot->terminate = true; + + /* + * If the worker is marked for termination, we don't need to add it to + * the registered workers list; we can just free the slot. However, if + * bgw_notify_pid is set, the process that registered the worker may + * need to know that we've processed the terminate request, so be sure + * to signal it. + */ + if (slot->terminate) + { + int notify_pid; + + /* + * We need a memory barrier here to make sure that the load of + * bgw_notify_pid and the update of parallel_terminate_count + * complete before the store to in_use. + */ + notify_pid = slot->worker.bgw_notify_pid; + if ((slot->worker.bgw_flags & BGWORKER_CLASS_PARALLEL) != 0) + BackgroundWorkerData->parallel_terminate_count++; + slot->pid = 0; + + pg_memory_barrier(); + slot->in_use = false; + + if (notify_pid != 0) + kill(notify_pid, SIGUSR1); + + continue; + } + + /* + * Copy the registration data into the registered workers list. + */ + rw = malloc(sizeof(RegisteredBgWorker)); + if (rw == NULL) + { + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + return; + } + + /* + * Copy strings in a paranoid way. If shared memory is corrupted, the + * source data might not even be NUL-terminated. + */ + ascii_safe_strlcpy(rw->rw_worker.bgw_name, + slot->worker.bgw_name, BGW_MAXLEN); + ascii_safe_strlcpy(rw->rw_worker.bgw_type, + slot->worker.bgw_type, BGW_MAXLEN); + ascii_safe_strlcpy(rw->rw_worker.bgw_library_name, + slot->worker.bgw_library_name, BGW_MAXLEN); + ascii_safe_strlcpy(rw->rw_worker.bgw_function_name, + slot->worker.bgw_function_name, BGW_MAXLEN); + + /* + * Copy various fixed-size fields. + * + * flags, start_time, and restart_time are examined by the postmaster, + * but nothing too bad will happen if they are corrupted. 
The + * remaining fields will only be examined by the child process. It + * might crash, but we won't. + */ + rw->rw_worker.bgw_flags = slot->worker.bgw_flags; + rw->rw_worker.bgw_start_time = slot->worker.bgw_start_time; + rw->rw_worker.bgw_restart_time = slot->worker.bgw_restart_time; + rw->rw_worker.bgw_main_arg = slot->worker.bgw_main_arg; + memcpy(rw->rw_worker.bgw_extra, slot->worker.bgw_extra, BGW_EXTRALEN); + + /* + * Copy the PID to be notified about state changes, but only if the + * postmaster knows about a backend with that PID. It isn't an error + * if the postmaster doesn't know about the PID, because the backend + * that requested the worker could have died (or been killed) just + * after doing so. Nonetheless, at least until we get some experience + * with how this plays out in the wild, log a message at a relative + * high debug level. + */ + rw->rw_worker.bgw_notify_pid = slot->worker.bgw_notify_pid; + if (!PostmasterMarkPIDForWorkerNotify(rw->rw_worker.bgw_notify_pid)) + { + elog(DEBUG1, "worker notification PID %d is not valid", + (int) rw->rw_worker.bgw_notify_pid); + rw->rw_worker.bgw_notify_pid = 0; + } + + /* Initialize postmaster bookkeeping. */ + rw->rw_backend = NULL; + rw->rw_pid = 0; + rw->rw_child_slot = 0; + rw->rw_crashed_at = 0; + rw->rw_shmem_slot = slotno; + rw->rw_terminate = false; + + /* Log it! */ + ereport(DEBUG1, + (errmsg_internal("registering background worker \"%s\"", + rw->rw_worker.bgw_name))); + + slist_push_head(&BackgroundWorkerList, &rw->rw_lnode); + } +} + +/* + * Forget about a background worker that's no longer needed. + * + * The worker must be identified by passing an slist_mutable_iter that + * points to it. This convention allows deletion of workers during + * searches of the worker list, and saves having to search the list again. + * + * Caller is responsible for notifying bgw_notify_pid, if appropriate. + * + * This function must be invoked only in the postmaster. 
+ */ +void +ForgetBackgroundWorker(slist_mutable_iter *cur) +{ + RegisteredBgWorker *rw; + BackgroundWorkerSlot *slot; + + rw = slist_container(RegisteredBgWorker, rw_lnode, cur->cur); + + Assert(rw->rw_shmem_slot < max_worker_processes); + slot = &BackgroundWorkerData->slot[rw->rw_shmem_slot]; + Assert(slot->in_use); + + /* + * We need a memory barrier here to make sure that the update of + * parallel_terminate_count completes before the store to in_use. + */ + if ((rw->rw_worker.bgw_flags & BGWORKER_CLASS_PARALLEL) != 0) + BackgroundWorkerData->parallel_terminate_count++; + + pg_memory_barrier(); + slot->in_use = false; + + ereport(DEBUG1, + (errmsg_internal("unregistering background worker \"%s\"", + rw->rw_worker.bgw_name))); + + slist_delete_current(cur); + free(rw); +} + +/* + * Report the PID of a newly-launched background worker in shared memory. + * + * This function should only be called from the postmaster. + */ +void +ReportBackgroundWorkerPID(RegisteredBgWorker *rw) +{ + BackgroundWorkerSlot *slot; + + Assert(rw->rw_shmem_slot < max_worker_processes); + slot = &BackgroundWorkerData->slot[rw->rw_shmem_slot]; + slot->pid = rw->rw_pid; + + if (rw->rw_worker.bgw_notify_pid != 0) + kill(rw->rw_worker.bgw_notify_pid, SIGUSR1); +} + +/* + * Report that the PID of a background worker is now zero because a + * previously-running background worker has exited. + * + * This function should only be called from the postmaster. + */ +void +ReportBackgroundWorkerExit(slist_mutable_iter *cur) +{ + RegisteredBgWorker *rw; + BackgroundWorkerSlot *slot; + int notify_pid; + + rw = slist_container(RegisteredBgWorker, rw_lnode, cur->cur); + + Assert(rw->rw_shmem_slot < max_worker_processes); + slot = &BackgroundWorkerData->slot[rw->rw_shmem_slot]; + slot->pid = rw->rw_pid; + notify_pid = rw->rw_worker.bgw_notify_pid; + + /* + * If this worker is slated for deregistration, do that before notifying + * the process which started it. 
Otherwise, if that process tries to + * reuse the slot immediately, it might not be available yet. In theory + * that could happen anyway if the process checks slot->pid at just the + * wrong moment, but this makes the window narrower. + */ + if (rw->rw_terminate || + rw->rw_worker.bgw_restart_time == BGW_NEVER_RESTART) + ForgetBackgroundWorker(cur); + + if (notify_pid != 0) + kill(notify_pid, SIGUSR1); +} + +/* + * Cancel SIGUSR1 notifications for a PID belonging to an exiting backend. + * + * This function should only be called from the postmaster. + */ +void +BackgroundWorkerStopNotifications(pid_t pid) +{ + slist_iter siter; + + slist_foreach(siter, &BackgroundWorkerList) + { + RegisteredBgWorker *rw; + + rw = slist_container(RegisteredBgWorker, rw_lnode, siter.cur); + if (rw->rw_worker.bgw_notify_pid == pid) + rw->rw_worker.bgw_notify_pid = 0; + } +} + +/* + * Cancel any not-yet-started worker requests that have waiting processes. + * + * This is called during a normal ("smart" or "fast") database shutdown. + * After this point, no new background workers will be started, so anything + * that might be waiting for them needs to be kicked off its wait. We do + * that by canceling the bgworker registration entirely, which is perhaps + * overkill, but since we're shutting down it does not matter whether the + * registration record sticks around. + * + * This function should only be called from the postmaster. + */ +void +ForgetUnstartedBackgroundWorkers(void) +{ + slist_mutable_iter iter; + + slist_foreach_modify(iter, &BackgroundWorkerList) + { + RegisteredBgWorker *rw; + BackgroundWorkerSlot *slot; + + rw = slist_container(RegisteredBgWorker, rw_lnode, iter.cur); + Assert(rw->rw_shmem_slot < max_worker_processes); + slot = &BackgroundWorkerData->slot[rw->rw_shmem_slot]; + + /* If it's not yet started, and there's someone waiting ... */ + if (slot->pid == InvalidPid && + rw->rw_worker.bgw_notify_pid != 0) + { + /* ... 
then zap it, and notify the waiter */ + int notify_pid = rw->rw_worker.bgw_notify_pid; + /* ForgetBackgroundWorker frees rw, hence the copy taken above. */ + ForgetBackgroundWorker(&iter); + if (notify_pid != 0) + kill(notify_pid, SIGUSR1); + } + } +} + +/* + * Reset background worker crash state. + * + * We assume that, after a crash-and-restart cycle, background workers without + * the never-restart flag should be restarted immediately, instead of waiting + * for bgw_restart_time to elapse. On the other hand, workers with that flag + * should be forgotten immediately, since we won't ever restart them. + * + * Here the never-restart flag means bgw_restart_time == BGW_NEVER_RESTART. + * + * This function should only be called from the postmaster. + */ +void +ResetBackgroundWorkerCrashTimes(void) +{ + slist_mutable_iter iter; + + slist_foreach_modify(iter, &BackgroundWorkerList) + { + RegisteredBgWorker *rw; + + rw = slist_container(RegisteredBgWorker, rw_lnode, iter.cur); + + if (rw->rw_worker.bgw_restart_time == BGW_NEVER_RESTART) + { + /* + * Workers marked BGW_NEVER_RESTART shouldn't get relaunched after + * the crash, so forget about them. (If we wait until after the + * crash to forget about them, and they are parallel workers, + * parallel_terminate_count will get incremented after we've + * already zeroed parallel_register_count, which would be bad.) + */ + ForgetBackgroundWorker(&iter); + } + else + { + /* + * The accounting which we do via parallel_register_count and + * parallel_terminate_count would get messed up if a worker marked + * parallel could survive a crash and restart cycle. All such + * workers should be marked BGW_NEVER_RESTART, and thus control + * should never reach this branch. + */ + Assert((rw->rw_worker.bgw_flags & BGWORKER_CLASS_PARALLEL) == 0); + + /* + * Allow this worker to be restarted immediately after we finish + * resetting. + */ + rw->rw_crashed_at = 0; + + /* + * If there was anyone waiting for it, they're history. 
+ */ + rw->rw_worker.bgw_notify_pid = 0; + } + } +} + +#ifdef EXEC_BACKEND +/* + * In EXEC_BACKEND mode, workers use this to retrieve their details from + * shared memory. + * + * The result points at a function-local static copy of slot->worker, which + * is overwritten by each call. + */ +BackgroundWorker * +BackgroundWorkerEntry(int slotno) +{ + static BackgroundWorker myEntry; + BackgroundWorkerSlot *slot; + + Assert(slotno < BackgroundWorkerData->total_slots); + slot = &BackgroundWorkerData->slot[slotno]; + Assert(slot->in_use); + + /* must copy this in case we don't intend to retain shmem access */ + memcpy(&myEntry, &slot->worker, sizeof myEntry); + return &myEntry; +} +#endif + +/* + * Complain about the BackgroundWorker definition using error level elevel. + * Return true if it looks ok, false if not (unless elevel >= ERROR, in + * which case we won't return at all in the not-OK case). + * + * As a side effect, an empty bgw_type is replaced by bgw_name once every + * check has passed. + */ +static bool +SanityCheckBackgroundWorker(BackgroundWorker *worker, int elevel) +{ + /* sanity check for flags */ + + /* + * We used to support workers not connected to shared memory, but don't + * anymore. Thus this is a required flag now. We're not removing the flag + * for compatibility reasons and because the flag still provides some + * signal when reading code. + */ + if (!(worker->bgw_flags & BGWORKER_SHMEM_ACCESS)) + { + ereport(elevel, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("background worker \"%s\": background workers without shared memory access are not supported", + worker->bgw_name))); + return false; + } + + if (worker->bgw_flags & BGWORKER_BACKEND_DATABASE_CONNECTION) + { + if (worker->bgw_start_time == BgWorkerStart_PostmasterStart) + { + ereport(elevel, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("background worker \"%s\": cannot request database access if starting at postmaster start", + worker->bgw_name))); + return false; + } + + /* XXX other checks? 
*/ + } + /* + * Reject negative restart intervals other than BGW_NEVER_RESTART, and + * anything larger than USECS_PER_DAY / 1000. + */ + if ((worker->bgw_restart_time < 0 && + worker->bgw_restart_time != BGW_NEVER_RESTART) || + (worker->bgw_restart_time > USECS_PER_DAY / 1000)) + { + ereport(elevel, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("background worker \"%s\": invalid restart interval", + worker->bgw_name))); + return false; + } + + /* + * Parallel workers may not be configured for restart, because the + * parallel_register_count/parallel_terminate_count accounting can't + * handle parallel workers lasting through a crash-and-restart cycle. + */ + if (worker->bgw_restart_time != BGW_NEVER_RESTART && + (worker->bgw_flags & BGWORKER_CLASS_PARALLEL) != 0) + { + ereport(elevel, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("background worker \"%s\": parallel workers may not be configured for restart", + worker->bgw_name))); + return false; + } + + /* + * If bgw_type is not filled in, use bgw_name. The unbounded strcpy + * assumes bgw_type is at least as large as bgw_name; the struct is + * declared elsewhere, so confirm there. + */ + if (strcmp(worker->bgw_type, "") == 0) + strcpy(worker->bgw_type, worker->bgw_name); + + return true; +} + +/* + * Standard SIGTERM handler for background workers + * + * Sets the signal mask to BlockSig and then raises a FATAL error that names + * the bgw_type of the worker. + */ +static void +bgworker_die(SIGNAL_ARGS) +{ + sigprocmask(SIG_SETMASK, &BlockSig, NULL); + + ereport(FATAL, + (errcode(ERRCODE_ADMIN_SHUTDOWN), + errmsg("terminating background worker \"%s\" due to administrator command", + MyBgworkerEntry->bgw_type))); +} + +/* + * Start a new background worker + * + * This is the main entry point for a background worker, to be called from + * postmaster. + */ +void +StartBackgroundWorker(void) +{ + sigjmp_buf local_sigjmp_buf; + BackgroundWorker *worker = MyBgworkerEntry; + bgworker_main_type entrypt; + + if (worker == NULL) + elog(FATAL, "unable to find bgworker entry"); + + IsBackgroundWorker = true; + + MyBackendType = B_BG_WORKER; + init_ps_display(worker->bgw_name); + + SetProcessingMode(InitProcessing); + + /* Apply PostAuthDelay */ + if (PostAuthDelay > 0) + pg_usleep(PostAuthDelay * 1000000L); + + /* + * Set up signal handlers. 
+ */ + if (worker->bgw_flags & BGWORKER_BACKEND_DATABASE_CONNECTION) + { + /* + * SIGINT is used to signal canceling the current action. Workers + * without a database connection ignore SIGINT, SIGUSR1 and SIGFPE + * instead (see the else branch). + */ + pqsignal(SIGINT, StatementCancelHandler); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGFPE, FloatExceptionHandler); + + /* XXX Any other handlers needed here? */ + } + else + { + pqsignal(SIGINT, SIG_IGN); + pqsignal(SIGUSR1, SIG_IGN); + pqsignal(SIGFPE, SIG_IGN); + } + pqsignal(SIGTERM, bgworker_die); + /* SIGQUIT handler was already set up by InitPostmasterChild */ + pqsignal(SIGHUP, SIG_IGN); + + InitializeTimeouts(); /* establishes SIGALRM handler */ + + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR2, SIG_IGN); + pqsignal(SIGCHLD, SIG_DFL); + + /* + * If an exception is encountered, processing resumes here. + * + * We just need to clean up, report the error, and go away. + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* Since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevent interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* + * sigsetjmp will have blocked all signals, but we may need to accept + * signals while communicating with our parallel leader. Once we've + * done HOLD_INTERRUPTS() it should be safe to unblock signals. + */ + BackgroundWorkerUnblockSignals(); + + /* Report the error to the parallel leader and the server log */ + EmitErrorReport(); + + /* + * Do we need more cleanup here? For shmem-connected bgworkers, we + * will call InitProcess below, which will install ProcKill as exit + * callback. That will take care of releasing locks, etc. + */ + + /* and go away, with exit code 1 */ + proc_exit(1); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + /* + * Create a per-backend PGPROC struct in shared memory, except in the + * EXEC_BACKEND case where this was done in SubPostmasterMain. 
We must do + * this before we can use LWLocks (and in the EXEC_BACKEND case we already + * had to do some stuff with LWLocks). + */ +#ifndef EXEC_BACKEND + InitProcess(); +#endif + + /* + * Early initialization. + */ + BaseInit(); + + /* + * Look up the entry point function, loading its library if necessary. + */ + entrypt = LookupBackgroundWorkerFunction(worker->bgw_library_name, + worker->bgw_function_name); + + /* + * Note that in normal processes, we would call InitPostgres here. For a + * worker, however, we don't know what database to connect to, yet; so we + * need to wait until the user code does it via + * BackgroundWorkerInitializeConnection(). + */ + + /* + * Now invoke the user-defined worker code + */ + entrypt(worker->bgw_main_arg); + + /* ... and if it returns, we're done */ + proc_exit(0); +} + +/* + * Register a new static background worker. + * + * This can only be called directly from postmaster or in the _PG_init + * function of a module library that's loaded by shared_preload_libraries; + * otherwise it will have no effect. + */ +void +RegisterBackgroundWorker(BackgroundWorker *worker) +{ + RegisteredBgWorker *rw; + static int numworkers = 0; + + if (!IsUnderPostmaster) + ereport(DEBUG1, + (errmsg_internal("registering background worker \"%s\"", worker->bgw_name))); + + if (!process_shared_preload_libraries_in_progress && + strcmp(worker->bgw_library_name, "postgres") != 0) + { + if (!IsUnderPostmaster) + ereport(LOG, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("background worker \"%s\": must be registered in shared_preload_libraries", + worker->bgw_name))); + return; + } + + if (!SanityCheckBackgroundWorker(worker, LOG)) + return; + + if (worker->bgw_notify_pid != 0) + { + ereport(LOG, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("background worker \"%s\": only dynamic background workers can request notification", + worker->bgw_name))); + return; + } + + /* + * Enforce maximum number of workers. 
Every registration that gets this far + * counts against max_worker_processes, even one that is then rejected + * here or fails the malloc below. (An earlier note here suggested + * exempting non-shmem-connected workers, but SanityCheckBackgroundWorker + * now rejects such workers, so that no longer applies.) + */ + if (++numworkers > max_worker_processes) + { + ereport(LOG, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("too many background workers"), + errdetail_plural("Up to %d background worker can be registered with the current settings.", + "Up to %d background workers can be registered with the current settings.", + max_worker_processes, + max_worker_processes), + errhint("Consider increasing the configuration parameter \"max_worker_processes\"."))); + return; + } + + /* + * Copy the registration data into the registered workers list. + */ + rw = malloc(sizeof(RegisteredBgWorker)); + if (rw == NULL) + { + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + return; + } + + rw->rw_worker = *worker; + rw->rw_backend = NULL; + rw->rw_pid = 0; + rw->rw_child_slot = 0; + rw->rw_crashed_at = 0; + rw->rw_terminate = false; + + slist_push_head(&BackgroundWorkerList, &rw->rw_lnode); +} + +/* + * Register a new background worker from a regular backend. + * + * Returns true on success and false on failure. Failure typically indicates + * that no background worker slots are currently available. + * + * If handle != NULL, we'll set *handle to a pointer that can subsequently + * be used as an argument to GetBackgroundWorkerPid(). The caller can + * free this pointer using pfree(), if desired. + */ +bool +RegisterDynamicBackgroundWorker(BackgroundWorker *worker, + BackgroundWorkerHandle **handle) +{ + int slotno; + bool success = false; + bool parallel; + uint64 generation = 0; + + /* + * We can't register dynamic background workers from the postmaster. If + * this is a standalone backend, we're the only process and can't start + * any more. 
In a multi-process environment, it might be theoretically + * possible, but we don't currently support it due to locking + * considerations; see comments on the BackgroundWorkerSlot data + * structure. + */ + if (!IsUnderPostmaster) + return false; + + if (!SanityCheckBackgroundWorker(worker, ERROR)) + return false; + + parallel = (worker->bgw_flags & BGWORKER_CLASS_PARALLEL) != 0; + + LWLockAcquire(BackgroundWorkerLock, LW_EXCLUSIVE); + + /* + * If this is a parallel worker, check whether there are already too many + * parallel workers; if so, don't register another one. Our view of + * parallel_terminate_count may be slightly stale, but that doesn't really + * matter: we would have gotten the same result if we'd arrived here + * slightly earlier anyway. There's no help for it, either, since the + * postmaster must not take locks; a memory barrier wouldn't guarantee + * anything useful. + */ + if (parallel && (BackgroundWorkerData->parallel_register_count - + BackgroundWorkerData->parallel_terminate_count) >= + max_parallel_workers) + { + Assert(BackgroundWorkerData->parallel_register_count - + BackgroundWorkerData->parallel_terminate_count <= + MAX_PARALLEL_WORKER_LIMIT); + LWLockRelease(BackgroundWorkerLock); + return false; + } + + /* + * Look for an unused slot. If we find one, grab it. Bumping the slot + * generation on each reuse lets handle holders detect that the slot has + * been given to a different worker (the handle-based functions compare + * generations). + */ + for (slotno = 0; slotno < BackgroundWorkerData->total_slots; ++slotno) + { + BackgroundWorkerSlot *slot = &BackgroundWorkerData->slot[slotno]; + + if (!slot->in_use) + { + memcpy(&slot->worker, worker, sizeof(BackgroundWorker)); + slot->pid = InvalidPid; /* indicates not started yet */ + slot->generation++; + slot->terminate = false; + generation = slot->generation; + if (parallel) + BackgroundWorkerData->parallel_register_count++; + + /* + * Make sure postmaster doesn't see the slot as in use before it + * sees the new contents. 
+ */ + pg_write_barrier(); + + slot->in_use = true; + success = true; + break; + } + } + + LWLockRelease(BackgroundWorkerLock); + + /* If we found a slot, tell the postmaster to notice the change. */ + if (success) + SendPostmasterSignal(PMSIGNAL_BACKGROUND_WORKER_CHANGE); + + /* + * If we found a slot and the user has provided a handle, initialize it. + */ + if (success && handle) + { + *handle = palloc(sizeof(BackgroundWorkerHandle)); + (*handle)->slot = slotno; + (*handle)->generation = generation; + } + + return success; +} + +/* + * Get the PID of a dynamically-registered background worker. + * + * If the worker is determined to be running, the return value will be + * BGWH_STARTED and *pidp will get the PID of the worker process. If the + * postmaster has not yet attempted to start the worker, the return value will + * be BGWH_NOT_YET_STARTED. Otherwise, the return value is BGWH_STOPPED. + * + * BGWH_STOPPED can indicate either that the worker is temporarily stopped + * (because it is configured for automatic restart and exited non-zero), + * or that the worker is permanently stopped (because it exited with exit + * code 0, or was not configured for automatic restart), or even that the + * worker was unregistered without ever starting (either because startup + * failed and the worker is not configured for automatic restart, or because + * TerminateBackgroundWorker was used before the worker was successfully + * started). + * + * Note that *pidp is written only when BGWH_STARTED is returned. + */ +BgwHandleStatus +GetBackgroundWorkerPid(BackgroundWorkerHandle *handle, pid_t *pidp) +{ + BackgroundWorkerSlot *slot; + pid_t pid; + + Assert(handle->slot < max_worker_processes); + slot = &BackgroundWorkerData->slot[handle->slot]; + + /* + * We could probably arrange to synchronize access to data using memory + * barriers only, but for now, let's just keep it simple and grab the + * lock. It seems unlikely that there will be enough traffic here to + * result in meaningful contention. 
+ */ + LWLockAcquire(BackgroundWorkerLock, LW_SHARED); + + /* + * The generation number can't be concurrently changed while we hold the + * lock. The pid, which is updated by the postmaster, can change at any + * time, but we assume such changes are atomic. So the value we read + * won't be garbage, but it might be out of date by the time the caller + * examines it (but that's unavoidable anyway). + * + * The in_use flag could be in the process of changing from true to false, + * but if it is already false then it can't change further. + */ + if (handle->generation != slot->generation || !slot->in_use) + pid = 0; + else + pid = slot->pid; + + /* All done. */ + LWLockRelease(BackgroundWorkerLock); + + if (pid == 0) + return BGWH_STOPPED; + else if (pid == InvalidPid) + return BGWH_NOT_YET_STARTED; + *pidp = pid; + return BGWH_STARTED; +} + +/* + * Wait for a background worker to start up. + * + * This is like GetBackgroundWorkerPid(), except that if the worker has not + * yet started, we wait for it to do so; thus, BGWH_NOT_YET_STARTED is never + * returned. However, if the postmaster has died, we give up and return + * BGWH_POSTMASTER_DIED, since in that case we know that startup will not + * take place. As with GetBackgroundWorkerPid(), *pidp is set only when + * the result is BGWH_STARTED. + * + * The caller *must* have set our PID as the worker's bgw_notify_pid, + * else we will not be awoken promptly when the worker's state changes. + */ +BgwHandleStatus +WaitForBackgroundWorkerStartup(BackgroundWorkerHandle *handle, pid_t *pidp) +{ + BgwHandleStatus status; + int rc; + + for (;;) + { + pid_t pid; + + CHECK_FOR_INTERRUPTS(); + + status = GetBackgroundWorkerPid(handle, &pid); + if (status == BGWH_STARTED) + *pidp = pid; + if (status != BGWH_NOT_YET_STARTED) + break; + + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_POSTMASTER_DEATH, 0, + WAIT_EVENT_BGWORKER_STARTUP); + + if (rc & WL_POSTMASTER_DEATH) + { + status = BGWH_POSTMASTER_DIED; + break; + } + + ResetLatch(MyLatch); + } + + return status; +} + +/* + * Wait for a background worker to stop. 
+ * + * If the worker hasn't yet started, or is running, we wait for it to stop + * and then return BGWH_STOPPED. However, if the postmaster has died, we give + * up and return BGWH_POSTMASTER_DIED, because it's the postmaster that + * notifies us when a worker's state changes. + * + * The caller *must* have set our PID as the worker's bgw_notify_pid, + * else we will not be awoken promptly when the worker's state changes. + */ +BgwHandleStatus +WaitForBackgroundWorkerShutdown(BackgroundWorkerHandle *handle) +{ + BgwHandleStatus status; + int rc; + + for (;;) + { + pid_t pid; + + CHECK_FOR_INTERRUPTS(); + + status = GetBackgroundWorkerPid(handle, &pid); + if (status == BGWH_STOPPED) + break; + + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_POSTMASTER_DEATH, 0, + WAIT_EVENT_BGWORKER_SHUTDOWN); + + if (rc & WL_POSTMASTER_DEATH) + { + status = BGWH_POSTMASTER_DIED; + break; + } + + ResetLatch(MyLatch); + } + + return status; +} + +/* + * Instruct the postmaster to terminate a background worker. + * + * Note that it's safe to do this without regard to whether the worker is + * still running, or even if the worker may already have exited and been + * unregistered. + */ +void +TerminateBackgroundWorker(BackgroundWorkerHandle *handle) +{ + BackgroundWorkerSlot *slot; + bool signal_postmaster = false; + + Assert(handle->slot < max_worker_processes); + slot = &BackgroundWorkerData->slot[handle->slot]; + + /* + * Set terminate flag in shared memory, unless slot has been reused. A + * generation mismatch between handle and slot is what indicates reuse; + * in that case nothing is changed and the postmaster is not signaled. + */ + LWLockAcquire(BackgroundWorkerLock, LW_EXCLUSIVE); + if (handle->generation == slot->generation) + { + slot->terminate = true; + signal_postmaster = true; + } + LWLockRelease(BackgroundWorkerLock); + + /* Make sure the postmaster notices the change to shared memory. */ + if (signal_postmaster) + SendPostmasterSignal(PMSIGNAL_BACKGROUND_WORKER_CHANGE); +} + +/* + * Look up (and possibly load) a bgworker entry point function. 
+ * + * For functions contained in the core code, we use library name "postgres" + * and consult the InternalBGWorkers array. External functions are + * looked up, and loaded if necessary, using load_external_function(). + * + * The point of this is to pass function names as strings across process + * boundaries. We can't pass actual function addresses because of the + * possibility that the function has been loaded at a different address + * in a different process. This is obviously a hazard for functions in + * loadable libraries, but it can happen even for functions in the core code + * on platforms using EXEC_BACKEND (e.g., Windows). + * + * At some point it might be worthwhile to get rid of InternalBGWorkers[] + * in favor of applying load_external_function() for core functions too; + * but that raises portability issues that are not worth addressing now. + */ +static bgworker_main_type +LookupBackgroundWorkerFunction(const char *libraryname, const char *funcname) +{ + /* + * If the function is to be loaded from postgres itself, search the + * InternalBGWorkers array. + */ + if (strcmp(libraryname, "postgres") == 0) + { + int i; + + for (i = 0; i < lengthof(InternalBGWorkers); i++) + { + if (strcmp(InternalBGWorkers[i].fn_name, funcname) == 0) + return InternalBGWorkers[i].fn_addr; + } + + /* + * We can only reach this by programming error. Note that elog(ERROR) + * is not expected to return, so control does not fall through to the + * external lookup below. + */ + elog(ERROR, "internal function \"%s\" not found", funcname); + } + + /* Otherwise load from external library. */ + return (bgworker_main_type) + load_external_function(libraryname, funcname, true, NULL); +} + +/* + * Given a PID, get the bgw_type of the background worker. Returns NULL if + * not a valid background worker. + * + * The return value is in static memory belonging to this function, so it has + * to be used before calling this function again. This is so that the caller + * doesn't have to worry about the background worker locking protocol. 
+ */ +const char * +GetBackgroundWorkerTypeByPid(pid_t pid) +{ + int slotno; + bool found = false; + static char result[BGW_MAXLEN]; + /* + * Scan the slots under the shared lock and copy the matching bgw_type + * into the static buffer, so the result stays usable after the lock + * is released. The pid > 0 test presumably skips slots that have no + * running process (InvalidPid is defined elsewhere; confirm its sign). + */ + LWLockAcquire(BackgroundWorkerLock, LW_SHARED); + + for (slotno = 0; slotno < BackgroundWorkerData->total_slots; slotno++) + { + BackgroundWorkerSlot *slot = &BackgroundWorkerData->slot[slotno]; + + if (slot->pid > 0 && slot->pid == pid) + { + strcpy(result, slot->worker.bgw_type); + found = true; + break; + } + } + + LWLockRelease(BackgroundWorkerLock); + + if (!found) + return NULL; + + return result; +} diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c new file mode 100644 index 0000000..f2e4f23 --- /dev/null +++ b/src/backend/postmaster/bgwriter.c @@ -0,0 +1,346 @@ +/*------------------------------------------------------------------------- + * + * bgwriter.c + * + * The background writer (bgwriter) is new as of Postgres 8.0. It attempts + * to keep regular backends from having to write out dirty shared buffers + * (which they would only do when needing to free a shared buffer to read in + * another page). In the best scenario all writes from shared buffers will + * be issued by the background writer process. However, regular backends are + * still empowered to issue writes if the bgwriter fails to maintain enough + * clean shared buffers. + * + * As of Postgres 9.2 the bgwriter no longer handles checkpoints. + * + * Normal termination is by SIGTERM, which instructs the bgwriter to exit(0). + * Emergency termination is by SIGQUIT; like any backend, the bgwriter will + * simply abort and exit on SIGQUIT. + * + * If the bgwriter exits unexpectedly, the postmaster treats that the same + * as a backend crash: shared memory may be corrupted, so remaining backends + * should be killed by SIGQUIT and then a recovery cycle started. 
+ * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/postmaster/bgwriter.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgwriter.h" +#include "postmaster/interrupt.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "storage/condition_variable.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" +#include "storage/smgr.h" +#include "storage/spin.h" +#include "storage/standby.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/resowner.h" +#include "utils/timestamp.h" + +/* + * GUC parameters + */ +int BgWriterDelay = 200; + +/* + * Multiplier to apply to BgWriterDelay when we decide to hibernate. + * (Perhaps this needs to be configurable?) With the initial BgWriterDelay + * of 200 this makes the hibernation wait 10000 ms. + */ +#define HIBERNATE_FACTOR 50 + +/* + * Interval in which standby snapshots are logged into the WAL stream, in + * milliseconds. + */ +#define LOG_SNAPSHOT_INTERVAL_MS 15000 + +/* + * LSN and timestamp at which we last issued a LogStandbySnapshot(), to avoid + * doing so too often or repeatedly if there has been no other write activity + * in the system. + */ +static TimestampTz last_snapshot_ts; +static XLogRecPtr last_snapshot_lsn = InvalidXLogRecPtr; + + +/* + * Main entry point for bgwriter process + * + * This is invoked from AuxiliaryProcessMain, which has already created the + * basic execution environment, but not enabled signals yet. + * + * After installing signal handlers, a private memory context and an + * error-recovery point, this loops forever: one BgBufferSync() cycle, + * statistics reporting, an occasional standby snapshot, and then a latch + * wait of BgWriterDelay ms that may be followed by a longer hibernation + * wait. + */ +void +BackgroundWriterMain(void) +{ + sigjmp_buf local_sigjmp_buf; + MemoryContext bgwriter_context; + bool prev_hibernate; + WritebackContext wb_context; + + /* + * Properly accept or ignore signals that might be sent to us. 
+ */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGINT, SIG_IGN); + pqsignal(SIGTERM, SignalHandlerForShutdownRequest); + /* SIGQUIT handler was already set up by InitPostmasterChild */ + pqsignal(SIGALRM, SIG_IGN); + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, SIG_IGN); + + /* + * Reset some signals that are accepted by postmaster but not here + */ + pqsignal(SIGCHLD, SIG_DFL); + + /* + * We just started, assume there has been either a shutdown or + * end-of-recovery snapshot. + */ + last_snapshot_ts = GetCurrentTimestamp(); + + /* + * Create a memory context that we will do all our work in. We do this so + * that we can reset the context during error recovery and thereby avoid + * possible memory leaks. Formerly this code just ran in + * TopMemoryContext, but resetting that would be a really bad idea. + */ + bgwriter_context = AllocSetContextCreate(TopMemoryContext, + "Background Writer", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(bgwriter_context); + + WritebackContextInit(&wb_context, &bgwriter_flush_after); + + /* + * If an exception is encountered, processing resumes here. + * + * You might wonder why this isn't coded as an infinite loop around a + * PG_TRY construct. The reason is that this is the bottom of the + * exception stack, and so with PG_TRY there would be no exception handler + * in force at all during the CATCH part. By leaving the outermost setjmp + * always active, we have at least some chance of recovering from an error + * during error recovery. (If we get into an infinite loop thereby, it + * will soon be stopped by overflow of elog.c's internal state stack.) + * + * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask + * (to wit, BlockSig) will be restored when longjmp'ing to here. Thus, + * signals other than SIGQUIT will be blocked until we complete error + * recovery. 
It might seem that this policy makes the HOLD_INTERRUPTS() + * call redundant, but it is not since InterruptPending might be set + * already. + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* Since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevent interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* Report the error to the server log */ + EmitErrorReport(); + + /* + * These operations are really just a minimal subset of + * AbortTransaction(). We don't have very many resources to worry + * about in bgwriter, but we do have LWLocks, buffers, and temp files. + */ + LWLockReleaseAll(); + ConditionVariableCancelSleep(); + UnlockBuffers(); + ReleaseAuxProcessResources(false); + AtEOXact_Buffers(false); + AtEOXact_SMgr(); + AtEOXact_Files(false); + AtEOXact_HashTables(false); + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. + */ + MemoryContextSwitchTo(bgwriter_context); + FlushErrorState(); + + /* Flush any leaked data in the top-level context */ + MemoryContextResetAndDeleteChildren(bgwriter_context); + + /* re-initialize to avoid repeated errors causing problems */ + WritebackContextInit(&wb_context, &bgwriter_flush_after); + + /* Now we can allow interrupts again */ + RESUME_INTERRUPTS(); + + /* + * Sleep at least 1 second after any error. A write error is likely + * to be repeated, and we don't want to be filling the error logs as + * fast as we can. + */ + pg_usleep(1000000L); + + /* + * Close all open files after any error. This is helpful on Windows, + * where holding deleted files open causes various strange errors. + * It's not clear we need it elsewhere, but shouldn't hurt. 
+ */ + smgrcloseall(); + + /* Report wait end here, when there is no further possibility of wait */ + pgstat_report_wait_end(); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + /* + * Unblock signals (they were blocked when the postmaster forked us) + */ + sigprocmask(SIG_SETMASK, &UnBlockSig, NULL); + + /* + * Reset hibernation state after any error. + */ + prev_hibernate = false; + + /* + * Loop forever + */ + for (;;) + { + bool can_hibernate; + int rc; + + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + HandleMainLoopInterrupts(); + + /* + * Do one cycle of dirty-buffer writing. + */ + can_hibernate = BgBufferSync(&wb_context); + + /* Report pending statistics to the cumulative stats system */ + pgstat_report_bgwriter(); + pgstat_report_wal(true); + + if (FirstCallSinceLastCheckpoint()) + { + /* + * After any checkpoint, close all smgr files. This is so we + * won't hang onto smgr references to deleted files indefinitely. + */ + smgrcloseall(); + } + + /* + * Log a new xl_running_xacts every now and then so replication can + * get into a consistent state faster (think of suboverflowed + * snapshots) and clean up resources (locks, KnownXids*) more + * frequently. The costs of this are relatively low, so doing it 4 + * times (LOG_SNAPSHOT_INTERVAL_MS) a minute seems fine. + * + * We assume the interval for writing xl_running_xacts is + * significantly bigger than BgWriterDelay, so we don't complicate the + * overall timeout handling but just assume we're going to get called + * often enough even if hibernation mode is active. It's not that + * important that LOG_SNAPSHOT_INTERVAL_MS is met strictly. To make + * sure we're not waking the disk up unnecessarily on an idle system + * we check whether there has been any WAL inserted since the last + * time we logged a running-xacts record. 
+ * + * We do this logging in the bgwriter as it is the only process that + * is run regularly and returns to its mainloop all the time. E.g. + * Checkpointer, when active, is barely ever in its mainloop and thus + * makes it hard to log regularly. + */ + if (XLogStandbyInfoActive() && !RecoveryInProgress()) + { + TimestampTz timeout = 0; + TimestampTz now = GetCurrentTimestamp(); + + timeout = TimestampTzPlusMilliseconds(last_snapshot_ts, + LOG_SNAPSHOT_INTERVAL_MS); + + /* + * Only log if enough time has passed and interesting records have + * been inserted since the last snapshot. Have to compare with <= + * instead of < because GetLastImportantRecPtr() points at the + * start of a record, whereas last_snapshot_lsn points just past + * the end of the record. + */ + if (now >= timeout && + last_snapshot_lsn <= GetLastImportantRecPtr()) + { + last_snapshot_lsn = LogStandbySnapshot(); + last_snapshot_ts = now; + } + } + + /* + * Sleep until we are signaled or BgWriterDelay has elapsed. + * + * Note: the feedback control loop in BgBufferSync() expects that we + * will call it every BgWriterDelay msec. While it's not critical for + * correctness that that be exact, the feedback loop might misbehave + * if we stray too far from that. Hence, avoid loading this process + * down with latch events that are likely to happen frequently during + * normal operation. + */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + BgWriterDelay /* ms */ , WAIT_EVENT_BGWRITER_MAIN); + + /* + * If no latch event and BgBufferSync says nothing's happening, extend + * the sleep in "hibernation" mode, where we sleep for much longer + * than bgwriter_delay says. Fewer wakeups save electricity. When a + * backend starts using buffers again, it will wake us up by setting + * our latch. 
Because the extra sleep will persist only as long as no + * buffer allocations happen, this should not distort the behavior of + * BgBufferSync's control loop too badly; essentially, it will think + * that the system-wide idle interval didn't exist. + * + * There is a race condition here, in that a backend might allocate a + * buffer between the time BgBufferSync saw the alloc count as zero + * and the time we call StrategyNotifyBgWriter. While it's not + * critical that we not hibernate anyway, we try to reduce the odds of + * that by only hibernating when BgBufferSync says nothing's happening + * for two consecutive cycles. Also, we mitigate any possible + * consequences of a missed wakeup by not hibernating forever. + */ + if (rc == WL_TIMEOUT && can_hibernate && prev_hibernate) + { + /* Ask for notification at next buffer allocation */ + StrategyNotifyBgWriter(MyProc->pgprocno); + /* Sleep ... */ + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + BgWriterDelay * HIBERNATE_FACTOR, + WAIT_EVENT_BGWRITER_HIBERNATE); + /* Reset the notification request in case we timed out */ + StrategyNotifyBgWriter(-1); + } + + prev_hibernate = can_hibernate; + } +} diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c new file mode 100644 index 0000000..ace9893 --- /dev/null +++ b/src/backend/postmaster/checkpointer.c @@ -0,0 +1,1353 @@ +/*------------------------------------------------------------------------- + * + * checkpointer.c + * + * The checkpointer is new as of Postgres 9.2. It handles all checkpoints. + * Checkpoints are automatically dispatched after a certain amount of time has + * elapsed since the last one, and it can be signaled to perform requested + * checkpoints as well. (The GUC parameter that mandates a checkpoint every + * so many WAL segments is implemented by having backends signal when they + * fill WAL segments; the checkpointer itself doesn't watch for the + * condition.) 
+ * + * Normal termination is by SIGUSR2, which instructs the checkpointer to + * execute a shutdown checkpoint and then exit(0). (All backends must be + * stopped before SIGUSR2 is issued!) Emergency termination is by SIGQUIT; + * like any backend, the checkpointer will simply abort and exit on SIGQUIT. + * + * If the checkpointer exits unexpectedly, the postmaster treats that the same + * as a backend crash: shared memory may be corrupted, so remaining backends + * should be killed by SIGQUIT and then a recovery cycle started. (Even if + * shared memory isn't corrupted, we have lost information about which + * files need to be fsync'd for the next checkpoint, and so a system + * restart needs to be forced.) + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/postmaster/checkpointer.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <sys/time.h> +#include <time.h> + +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogrecovery.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgwriter.h" +#include "postmaster/interrupt.h" +#include "replication/syncrep.h" +#include "storage/bufmgr.h" +#include "storage/condition_variable.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" +#include "storage/smgr.h" +#include "storage/spin.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/resowner.h" + + +/*---------- + * Shared memory area for communication between checkpointer and backends + * + * The ckpt counters allow backends to watch for completion of a checkpoint + * request they send. 
Here's how it works: + * * At start of a checkpoint, checkpointer reads (and clears) the request + * flags and increments ckpt_started, while holding ckpt_lck. + * * On completion of a checkpoint, checkpointer sets ckpt_done to + * equal ckpt_started. + * * On failure of a checkpoint, checkpointer increments ckpt_failed + * and sets ckpt_done to equal ckpt_started. + * + * The algorithm for backends is: + * 1. Record current values of ckpt_failed and ckpt_started, and + * set request flags, while holding ckpt_lck. + * 2. Send signal to request checkpoint. + * 3. Sleep until ckpt_started changes. Now you know a checkpoint has + * begun since you started this algorithm (although *not* that it was + * specifically initiated by your signal), and that it is using your flags. + * 4. Record new value of ckpt_started. + * 5. Sleep until ckpt_done >= saved value of ckpt_started. (Use modulo + * arithmetic here in case counters wrap around.) Now you know a + * checkpoint has started and completed, but not whether it was + * successful. + * 6. If ckpt_failed is different from the originally saved value, + * assume request failed; otherwise it was definitely successful. + * + * ckpt_flags holds the OR of the checkpoint request flags sent by all + * requesting backends since the last checkpoint start. The flags are + * chosen so that OR'ing is the correct way to combine multiple requests. + * + * num_backend_writes is used to count the number of buffer writes performed + * by user backend processes. This counter should be wide enough that it + * can't overflow during a single processing cycle. num_backend_fsync + * counts the subset of those writes that also had to do their own fsync, + * because the checkpointer failed to absorb their request. + * + * The requests array holds fsync requests sent by backends and not yet + * absorbed by the checkpointer. 
+ * + * Unlike the checkpoint fields, num_backend_writes, num_backend_fsync, and + * the requests fields are protected by CheckpointerCommLock. + *---------- + */ +typedef struct +{ + SyncRequestType type; /* request type */ + FileTag ftag; /* file identifier */ +} CheckpointerRequest; + +typedef struct +{ + pid_t checkpointer_pid; /* PID (0 if not started) */ + + slock_t ckpt_lck; /* protects all the ckpt_* fields */ + + int ckpt_started; /* advances when checkpoint starts */ + int ckpt_done; /* advances when checkpoint done */ + int ckpt_failed; /* advances when checkpoint fails */ + + int ckpt_flags; /* checkpoint flags, as defined in xlog.h */ + + ConditionVariable start_cv; /* signaled when ckpt_started advances */ + ConditionVariable done_cv; /* signaled when ckpt_done advances */ + + uint32 num_backend_writes; /* counts user backend buffer writes */ + uint32 num_backend_fsync; /* counts user backend fsync calls */ + + int num_requests; /* current # of requests */ + int max_requests; /* allocated array size */ + CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER]; +} CheckpointerShmemStruct; + +static CheckpointerShmemStruct *CheckpointerShmem; + +/* interval for calling AbsorbSyncRequests in CheckpointWriteDelay */ +#define WRITES_PER_ABSORB 1000 + +/* + * GUC parameters + */ +int CheckPointTimeout = 300; +int CheckPointWarning = 30; +double CheckPointCompletionTarget = 0.9; + +/* + * Private state + */ +static bool ckpt_active = false; + +/* these values are valid when ckpt_active is true: */ +static pg_time_t ckpt_start_time; +static XLogRecPtr ckpt_start_recptr; +static double ckpt_cached_elapsed; + +static pg_time_t last_checkpoint_time; +static pg_time_t last_xlog_switch_time; + +/* Prototypes for private functions */ + +static void HandleCheckpointerInterrupts(void); +static void CheckArchiveTimeout(void); +static bool IsCheckpointOnSchedule(double progress); +static bool ImmediateCheckpointRequested(void); +static bool 
CompactCheckpointerRequestQueue(void); +static void UpdateSharedMemoryConfig(void); + +/* Signal handlers */ +static void ReqCheckpointHandler(SIGNAL_ARGS); + + +/* + * Main entry point for checkpointer process + * + * This is invoked from AuxiliaryProcessMain, which has already created the + * basic execution environment, but not enabled signals yet. + */ +void +CheckpointerMain(void) +{ + sigjmp_buf local_sigjmp_buf; + MemoryContext checkpointer_context; + + CheckpointerShmem->checkpointer_pid = MyProcPid; + + /* + * Properly accept or ignore signals the postmaster might send us + * + * Note: we deliberately ignore SIGTERM, because during a standard Unix + * system shutdown cycle, init will SIGTERM all processes at once. We + * want to wait for the backends to exit, whereupon the postmaster will + * tell us it's okay to shut down (via SIGUSR2). + */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGINT, ReqCheckpointHandler); /* request checkpoint */ + pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */ + /* SIGQUIT handler was already set up by InitPostmasterChild */ + pqsignal(SIGALRM, SIG_IGN); + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, SignalHandlerForShutdownRequest); + + /* + * Reset some signals that are accepted by postmaster but not here + */ + pqsignal(SIGCHLD, SIG_DFL); + + /* + * Initialize so that first time-driven event happens at the correct time. + */ + last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); + + /* + * Write out stats after shutdown. This needs to be called by exactly one + * process during a normal shutdown, and since checkpointer is shut down + * very late... + * + * Walsenders are shut down after the checkpointer, but currently don't + * report stats. If that changes, we need a more complicated solution. + */ + before_shmem_exit(pgstat_before_server_shutdown, 0); + + /* + * Create a memory context that we will do all our work in. 
We do this so + * that we can reset the context during error recovery and thereby avoid + * possible memory leaks. Formerly this code just ran in + * TopMemoryContext, but resetting that would be a really bad idea. + */ + checkpointer_context = AllocSetContextCreate(TopMemoryContext, + "Checkpointer", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(checkpointer_context); + + /* + * If an exception is encountered, processing resumes here. + * + * You might wonder why this isn't coded as an infinite loop around a + * PG_TRY construct. The reason is that this is the bottom of the + * exception stack, and so with PG_TRY there would be no exception handler + * in force at all during the CATCH part. By leaving the outermost setjmp + * always active, we have at least some chance of recovering from an error + * during error recovery. (If we get into an infinite loop thereby, it + * will soon be stopped by overflow of elog.c's internal state stack.) + * + * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask + * (to wit, BlockSig) will be restored when longjmp'ing to here. Thus, + * signals other than SIGQUIT will be blocked until we complete error + * recovery. It might seem that this policy makes the HOLD_INTERRUPTS() + * call redundant, but it is not since InterruptPending might be set + * already. + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* Since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevent interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* Report the error to the server log */ + EmitErrorReport(); + + /* + * These operations are really just a minimal subset of + * AbortTransaction(). We don't have very many resources to worry + * about in checkpointer, but we do have LWLocks, buffers, and temp + * files. 
+ */ + LWLockReleaseAll(); + ConditionVariableCancelSleep(); + pgstat_report_wait_end(); + UnlockBuffers(); + ReleaseAuxProcessResources(false); + AtEOXact_Buffers(false); + AtEOXact_SMgr(); + AtEOXact_Files(false); + AtEOXact_HashTables(false); + + /* Warn any waiting backends that the checkpoint failed. */ + if (ckpt_active) + { + SpinLockAcquire(&CheckpointerShmem->ckpt_lck); + CheckpointerShmem->ckpt_failed++; + CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started; + SpinLockRelease(&CheckpointerShmem->ckpt_lck); + + ConditionVariableBroadcast(&CheckpointerShmem->done_cv); + + ckpt_active = false; + } + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. + */ + MemoryContextSwitchTo(checkpointer_context); + FlushErrorState(); + + /* Flush any leaked data in the top-level context */ + MemoryContextResetAndDeleteChildren(checkpointer_context); + + /* Now we can allow interrupts again */ + RESUME_INTERRUPTS(); + + /* + * Sleep at least 1 second after any error. A write error is likely + * to be repeated, and we don't want to be filling the error logs as + * fast as we can. + */ + pg_usleep(1000000L); + + /* + * Close all open files after any error. This is helpful on Windows, + * where holding deleted files open causes various strange errors. + * It's not clear we need it elsewhere, but shouldn't hurt. + */ + smgrcloseall(); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + /* + * Unblock signals (they were blocked when the postmaster forked us) + */ + sigprocmask(SIG_SETMASK, &UnBlockSig, NULL); + + /* + * Ensure all shared memory values are set correctly for the config. Doing + * this here ensures no race conditions from other concurrent updaters. + */ + UpdateSharedMemoryConfig(); + + /* + * Advertise our latch that backends can use to wake us up while we're + * sleeping. 
+ */ + ProcGlobal->checkpointerLatch = &MyProc->procLatch; + + /* + * Loop forever + */ + for (;;) + { + bool do_checkpoint = false; + int flags = 0; + pg_time_t now; + int elapsed_secs; + int cur_timeout; + + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + /* + * Process any requests or signals received recently. + */ + AbsorbSyncRequests(); + HandleCheckpointerInterrupts(); + + /* + * Detect a pending checkpoint request by checking whether the flags + * word in shared memory is nonzero. We shouldn't need to acquire the + * ckpt_lck for this. + */ + if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags) + { + do_checkpoint = true; + PendingCheckpointerStats.requested_checkpoints++; + } + + /* + * Force a checkpoint if too much time has elapsed since the last one. + * Note that we count a timed checkpoint in stats only when this + * occurs without an external request, but we set the CAUSE_TIME flag + * bit even if there is also an external request. + */ + now = (pg_time_t) time(NULL); + elapsed_secs = now - last_checkpoint_time; + if (elapsed_secs >= CheckPointTimeout) + { + if (!do_checkpoint) + PendingCheckpointerStats.timed_checkpoints++; + do_checkpoint = true; + flags |= CHECKPOINT_CAUSE_TIME; + } + + /* + * Do a checkpoint if requested. + */ + if (do_checkpoint) + { + bool ckpt_performed = false; + bool do_restartpoint; + + /* Check if we should perform a checkpoint or a restartpoint. */ + do_restartpoint = RecoveryInProgress(); + + /* + * Atomically fetch the request flags to figure out what kind of a + * checkpoint we should perform, and increase the started-counter + * to acknowledge that we've started a new checkpoint. 
+ */ + SpinLockAcquire(&CheckpointerShmem->ckpt_lck); + flags |= CheckpointerShmem->ckpt_flags; + CheckpointerShmem->ckpt_flags = 0; + CheckpointerShmem->ckpt_started++; + SpinLockRelease(&CheckpointerShmem->ckpt_lck); + + ConditionVariableBroadcast(&CheckpointerShmem->start_cv); + + /* + * The end-of-recovery checkpoint is a real checkpoint that's + * performed while we're still in recovery. + */ + if (flags & CHECKPOINT_END_OF_RECOVERY) + do_restartpoint = false; + + /* + * We will warn if (a) too soon since last checkpoint (whatever + * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag + * since the last checkpoint start. Note in particular that this + * implementation will not generate warnings caused by + * CheckPointTimeout < CheckPointWarning. + */ + if (!do_restartpoint && + (flags & CHECKPOINT_CAUSE_XLOG) && + elapsed_secs < CheckPointWarning) + ereport(LOG, + (errmsg_plural("checkpoints are occurring too frequently (%d second apart)", + "checkpoints are occurring too frequently (%d seconds apart)", + elapsed_secs, + elapsed_secs), + errhint("Consider increasing the configuration parameter \"max_wal_size\"."))); + + /* + * Initialize checkpointer-private variables used during + * checkpoint. + */ + ckpt_active = true; + if (do_restartpoint) + ckpt_start_recptr = GetXLogReplayRecPtr(NULL); + else + ckpt_start_recptr = GetInsertRecPtr(); + ckpt_start_time = now; + ckpt_cached_elapsed = 0; + + /* + * Do the checkpoint. + */ + if (!do_restartpoint) + { + CreateCheckPoint(flags); + ckpt_performed = true; + } + else + ckpt_performed = CreateRestartPoint(flags); + + /* + * After any checkpoint, close all smgr files. This is so we + * won't hang onto smgr references to deleted files indefinitely. + */ + smgrcloseall(); + + /* + * Indicate checkpoint completion to any waiting backends. 
+ */ + SpinLockAcquire(&CheckpointerShmem->ckpt_lck); + CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started; + SpinLockRelease(&CheckpointerShmem->ckpt_lck); + + ConditionVariableBroadcast(&CheckpointerShmem->done_cv); + + if (ckpt_performed) + { + /* + * Note we record the checkpoint start time not end time as + * last_checkpoint_time. This is so that time-driven + * checkpoints happen at a predictable spacing. + */ + last_checkpoint_time = now; + } + else + { + /* + * We were not able to perform the restartpoint (checkpoints + * throw an ERROR in case of error). Most likely because we + * have not received any new checkpoint WAL records since the + * last restartpoint. Try again in 15 s. + */ + last_checkpoint_time = now - CheckPointTimeout + 15; + } + + ckpt_active = false; + + /* We may have received an interrupt during the checkpoint. */ + HandleCheckpointerInterrupts(); + } + + /* Check for archive_timeout and switch xlog files if necessary. */ + CheckArchiveTimeout(); + + /* Report pending statistics to the cumulative stats system */ + pgstat_report_checkpointer(); + pgstat_report_wal(true); + + /* + * If any checkpoint flags have been set, redo the loop to handle the + * checkpoint without sleeping. + */ + if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags) + continue; + + /* + * Sleep until we are signaled or it's time for another checkpoint or + * xlog file switch. + */ + now = (pg_time_t) time(NULL); + elapsed_secs = now - last_checkpoint_time; + if (elapsed_secs >= CheckPointTimeout) + continue; /* no sleep for us ... */ + cur_timeout = CheckPointTimeout - elapsed_secs; + if (XLogArchiveTimeout > 0 && !RecoveryInProgress()) + { + elapsed_secs = now - last_xlog_switch_time; + if (elapsed_secs >= XLogArchiveTimeout) + continue; /* no sleep for us ... 
*/ + cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs); + } + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + cur_timeout * 1000L /* convert to ms */ , + WAIT_EVENT_CHECKPOINTER_MAIN); + } +} + +/* + * Process any new interrupts. + */ +static void +HandleCheckpointerInterrupts(void) +{ + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + + /* + * Checkpointer is the last process to shut down, so we ask it to hold + * the keys for a range of other tasks required most of which have + * nothing to do with checkpointing at all. + * + * For various reasons, some config values can change dynamically so + * the primary copy of them is held in shared memory to make sure all + * backends see the same value. We make Checkpointer responsible for + * updating the shared memory copy if the parameter setting changes + * because of SIGHUP. + */ + UpdateSharedMemoryConfig(); + } + if (ShutdownRequestPending) + { + /* + * From here on, elog(ERROR) should end with exit(1), not send control + * back to the sigsetjmp block above + */ + ExitOnAnyError = true; + + /* + * Close down the database. + * + * Since ShutdownXLOG() creates restartpoint or checkpoint, and + * updates the statistics, increment the checkpoint request and flush + * out pending statistic. + */ + PendingCheckpointerStats.requested_checkpoints++; + ShutdownXLOG(0, 0); + pgstat_report_checkpointer(); + pgstat_report_wal(true); + + /* Normal exit from the checkpointer is here */ + proc_exit(0); /* done */ + } + + /* Perform logging of memory contexts of this process */ + if (LogMemoryContextPending) + ProcessLogMemoryContextInterrupt(); +} + +/* + * CheckArchiveTimeout -- check for archive_timeout and switch xlog files + * + * This will switch to a new WAL file and force an archive file write if + * meaningful activity is recorded in the current WAL file. 
This includes most + * writes, including just a single checkpoint record, but excludes WAL records + * that were inserted with the XLOG_MARK_UNIMPORTANT flag being set (like + * snapshots of running transactions). Such records, depending on + * configuration, occur on regular intervals and don't contain important + * information. This avoids generating archives with a few unimportant + * records. + */ +static void +CheckArchiveTimeout(void) +{ + pg_time_t now; + pg_time_t last_time; + XLogRecPtr last_switch_lsn; + + if (XLogArchiveTimeout <= 0 || RecoveryInProgress()) + return; + + now = (pg_time_t) time(NULL); + + /* First we do a quick check using possibly-stale local state. */ + if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout) + return; + + /* + * Update local state ... note that last_xlog_switch_time is the last time + * a switch was performed *or requested*. + */ + last_time = GetLastSegSwitchData(&last_switch_lsn); + + last_xlog_switch_time = Max(last_xlog_switch_time, last_time); + + /* Now we can do the real checks */ + if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout) + { + /* + * Switch segment only when "important" WAL has been logged since the + * last segment switch (last_switch_lsn points to end of segment + * switch occurred in). + */ + if (GetLastImportantRecPtr() > last_switch_lsn) + { + XLogRecPtr switchpoint; + + /* mark switch as unimportant, avoids triggering checkpoints */ + switchpoint = RequestXLogSwitch(true); + + /* + * If the returned pointer points exactly to a segment boundary, + * assume nothing happened. + */ + if (XLogSegmentOffset(switchpoint, wal_segment_size) != 0) + elog(DEBUG1, "write-ahead log switch forced (archive_timeout=%d)", + XLogArchiveTimeout); + } + + /* + * Update state in any case, so we don't retry constantly when the + * system is idle. + */ + last_xlog_switch_time = now; + } +} + +/* + * Returns true if an immediate checkpoint request is pending. 
(Note that + * this does not check the *current* checkpoint's IMMEDIATE flag, but whether + * there is one pending behind it.) + */ +static bool +ImmediateCheckpointRequested(void) +{ + volatile CheckpointerShmemStruct *cps = CheckpointerShmem; + + /* + * We don't need to acquire the ckpt_lck in this case because we're only + * looking at a single flag bit. + */ + if (cps->ckpt_flags & CHECKPOINT_IMMEDIATE) + return true; + return false; +} + +/* + * CheckpointWriteDelay -- control rate of checkpoint + * + * This function is called after each page write performed by BufferSync(). + * It is responsible for throttling BufferSync()'s write rate to hit + * checkpoint_completion_target. + * + * The checkpoint request flags should be passed in; currently the only one + * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes. + * + * 'progress' is an estimate of how much of the work has been done, as a + * fraction between 0.0 meaning none, and 1.0 meaning all done. + */ +void +CheckpointWriteDelay(int flags, double progress) +{ + static int absorb_counter = WRITES_PER_ABSORB; + + /* Do nothing if checkpoint is being executed by non-checkpointer process */ + if (!AmCheckpointerProcess()) + return; + + /* + * Perform the usual duties and take a nap, unless we're behind schedule, + * in which case we just try to catch up as quickly as possible. + */ + if (!(flags & CHECKPOINT_IMMEDIATE) && + !ShutdownRequestPending && + !ImmediateCheckpointRequested() && + IsCheckpointOnSchedule(progress)) + { + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + /* update shmem copies of config variables */ + UpdateSharedMemoryConfig(); + } + + AbsorbSyncRequests(); + absorb_counter = WRITES_PER_ABSORB; + + CheckArchiveTimeout(); + + /* Report interim statistics to the cumulative stats system */ + pgstat_report_checkpointer(); + + /* + * This sleep used to be connected to bgwriter_delay, typically 200ms. 
+ * That resulted in more frequent wakeups if not much work to do. + * Checkpointer and bgwriter are no longer related so take the Big + * Sleep. + */ + WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, + 100, + WAIT_EVENT_CHECKPOINT_WRITE_DELAY); + ResetLatch(MyLatch); + } + else if (--absorb_counter <= 0) + { + /* + * Absorb pending fsync requests after each WRITES_PER_ABSORB write + * operations even when we don't sleep, to prevent overflow of the + * fsync request queue. + */ + AbsorbSyncRequests(); + absorb_counter = WRITES_PER_ABSORB; + } + + /* Check for barrier events. */ + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); +} + +/* + * IsCheckpointOnSchedule -- are we on schedule to finish this checkpoint + * (or restartpoint) in time? + * + * Compares the current progress against the time/segments elapsed since last + * checkpoint, and returns true if the progress we've made this far is greater + * than the elapsed time/segments. + */ +static bool +IsCheckpointOnSchedule(double progress) +{ + XLogRecPtr recptr; + struct timeval now; + double elapsed_xlogs, + elapsed_time; + + Assert(ckpt_active); + + /* Scale progress according to checkpoint_completion_target. */ + progress *= CheckPointCompletionTarget; + + /* + * Check against the cached value first. Only do the more expensive + * calculations once we reach the target previously calculated. Since + * neither time or WAL insert pointer moves backwards, a freshly + * calculated value can only be greater than or equal to the cached value. + */ + if (progress < ckpt_cached_elapsed) + return false; + + /* + * Check progress against WAL segments written and CheckPointSegments. + * + * We compare the current WAL insert location against the location + * computed before calling CreateCheckPoint. The code in XLogInsert that + * actually triggers a checkpoint when CheckPointSegments is exceeded + * compares against RedoRecPtr, so this is not completely accurate. 
+ * However, it's good enough for our purposes, we're only calculating an + * estimate anyway. + * + * During recovery, we compare last replayed WAL record's location with + * the location computed before calling CreateRestartPoint. That maintains + * the same pacing as we have during checkpoints in normal operation, but + * we might exceed max_wal_size by a fair amount. That's because there can + * be a large gap between a checkpoint's redo-pointer and the checkpoint + * record itself, and we only start the restartpoint after we've seen the + * checkpoint record. (The gap is typically up to CheckPointSegments * + * checkpoint_completion_target where checkpoint_completion_target is the + * value that was in effect when the WAL was generated). + */ + if (RecoveryInProgress()) + recptr = GetXLogReplayRecPtr(NULL); + else + recptr = GetInsertRecPtr(); + elapsed_xlogs = (((double) (recptr - ckpt_start_recptr)) / + wal_segment_size) / CheckPointSegments; + + if (progress < elapsed_xlogs) + { + ckpt_cached_elapsed = elapsed_xlogs; + return false; + } + + /* + * Check progress against time elapsed and checkpoint_timeout. + */ + gettimeofday(&now, NULL); + elapsed_time = ((double) ((pg_time_t) now.tv_sec - ckpt_start_time) + + now.tv_usec / 1000000.0) / CheckPointTimeout; + + if (progress < elapsed_time) + { + ckpt_cached_elapsed = elapsed_time; + return false; + } + + /* It looks like we're on schedule. */ + return true; +} + + +/* -------------------------------- + * signal handler routines + * -------------------------------- + */ + +/* SIGINT: set flag to run a normal checkpoint right away */ +static void +ReqCheckpointHandler(SIGNAL_ARGS) +{ + int save_errno = errno; + + /* + * The signaling process should have set ckpt_flags nonzero, so all we + * need do is ensure that our main loop gets kicked out of any wait. 
+ */ + SetLatch(MyLatch); + + errno = save_errno; +} + + +/* -------------------------------- + * communication with backends + * -------------------------------- + */ + +/* + * CheckpointerShmemSize + * Compute space needed for checkpointer-related shared memory + */ +Size +CheckpointerShmemSize(void) +{ + Size size; + + /* + * Currently, the size of the requests[] array is arbitrarily set equal to + * NBuffers. This may prove too large or small ... + */ + size = offsetof(CheckpointerShmemStruct, requests); + size = add_size(size, mul_size(NBuffers, sizeof(CheckpointerRequest))); + + return size; +} + +/* + * CheckpointerShmemInit + * Allocate and initialize checkpointer-related shared memory + */ +void +CheckpointerShmemInit(void) +{ + Size size = CheckpointerShmemSize(); + bool found; + + CheckpointerShmem = (CheckpointerShmemStruct *) + ShmemInitStruct("Checkpointer Data", + size, + &found); + + if (!found) + { + /* + * First time through, so initialize. Note that we zero the whole + * requests array; this is so that CompactCheckpointerRequestQueue can + * assume that any pad bytes in the request structs are zeroes. + */ + MemSet(CheckpointerShmem, 0, size); + SpinLockInit(&CheckpointerShmem->ckpt_lck); + CheckpointerShmem->max_requests = NBuffers; + ConditionVariableInit(&CheckpointerShmem->start_cv); + ConditionVariableInit(&CheckpointerShmem->done_cv); + } +} + +/* + * RequestCheckpoint + * Called in backend processes to request a checkpoint + * + * flags is a bitwise OR of the following: + * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. + * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. + * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, + * ignoring checkpoint_completion_target parameter. + * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred + * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or + * CHECKPOINT_END_OF_RECOVERY). 
 *	CHECKPOINT_WAIT: wait for completion before returning (otherwise,
 *		just signal checkpointer to do it, and return).
 *	CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling.
 *		(This affects logging, and in particular enables CheckPointWarning.)
 *
 * With CHECKPOINT_WAIT, an ERROR is raised if the checkpointer cannot be
 * signaled after all retries, or if the failure counter advanced while we
 * were waiting.  Without it, a failure to signal is only LOGged.
 */
void
RequestCheckpoint(int flags)
{
	int			ntries;
	int			old_failed,
				old_started;

	/*
	 * If in a standalone backend, just do it ourselves.
	 */
	if (!IsPostmasterEnvironment)
	{
		/*
		 * There's no point in doing slow checkpoints in a standalone backend,
		 * because there's no other backends the checkpoint could disrupt.
		 */
		CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE);

		/*
		 * After any checkpoint, close all smgr files.  This is so we won't
		 * hang onto smgr references to deleted files indefinitely.
		 */
		smgrcloseall();

		return;
	}

	/*
	 * Atomically set the request flags, and take a snapshot of the counters.
	 * When we see ckpt_started > old_started, we know the flags we set here
	 * have been seen by checkpointer.
	 *
	 * Note that we OR the flags with any existing flags, to avoid overriding
	 * a "stronger" request by another backend.  The flag senses must be
	 * chosen to make this work!
	 */
	SpinLockAcquire(&CheckpointerShmem->ckpt_lck);

	old_failed = CheckpointerShmem->ckpt_failed;
	old_started = CheckpointerShmem->ckpt_started;
	CheckpointerShmem->ckpt_flags |= (flags | CHECKPOINT_REQUESTED);

	SpinLockRelease(&CheckpointerShmem->ckpt_lck);

	/*
	 * Send signal to request checkpoint.  It's possible that the checkpointer
	 * hasn't started yet, or is in process of restarting, so we will retry a
	 * few times if needed.  (Actually, more than a few times, since on slow
	 * or overloaded buildfarm machines, it's been observed that the
	 * checkpointer can take several seconds to start.)  However, if not told
	 * to wait for the checkpoint to occur, we consider failure to send the
	 * signal to be nonfatal and merely LOG it.  The checkpointer should see
	 * the request when it does start, with or without getting a signal.
	 */
#define MAX_SIGNAL_TRIES 600	/* max wait 60.0 sec */
	for (ntries = 0;; ntries++)
	{
		if (CheckpointerShmem->checkpointer_pid == 0)
		{
			/* no checkpointer yet: give up now unless caller wants to wait */
			if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT))
			{
				elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
					 "could not signal for checkpoint: checkpointer is not running");
				break;
			}
		}
		else if (kill(CheckpointerShmem->checkpointer_pid, SIGINT) != 0)
		{
			/* kill() failed: same give-up policy as above */
			if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT))
			{
				elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
					 "could not signal for checkpoint: %m");
				break;
			}
		}
		else
			break;				/* signal sent successfully */

		CHECK_FOR_INTERRUPTS();
		pg_usleep(100000L);		/* wait 0.1 sec, then retry */
	}

	/*
	 * If requested, wait for completion.  We detect completion according to
	 * the algorithm given above.
	 */
	if (flags & CHECKPOINT_WAIT)
	{
		int			new_started,
					new_failed;

		/* Wait for a new checkpoint to start. */
		ConditionVariablePrepareToSleep(&CheckpointerShmem->start_cv);
		for (;;)
		{
			SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
			new_started = CheckpointerShmem->ckpt_started;
			SpinLockRelease(&CheckpointerShmem->ckpt_lck);

			/* any change from our snapshot means a checkpoint began */
			if (new_started != old_started)
				break;

			ConditionVariableSleep(&CheckpointerShmem->start_cv,
								   WAIT_EVENT_CHECKPOINT_START);
		}
		ConditionVariableCancelSleep();

		/*
		 * We are waiting for ckpt_done >= new_started, in a modulo sense.
		 */
		ConditionVariablePrepareToSleep(&CheckpointerShmem->done_cv);
		for (;;)
		{
			int			new_done;

			SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
			new_done = CheckpointerShmem->ckpt_done;
			new_failed = CheckpointerShmem->ckpt_failed;
			SpinLockRelease(&CheckpointerShmem->ckpt_lck);

			/* compare via the difference, per the "modulo" note above */
			if (new_done - new_started >= 0)
				break;

			ConditionVariableSleep(&CheckpointerShmem->done_cv,
								   WAIT_EVENT_CHECKPOINT_DONE);
		}
		ConditionVariableCancelSleep();

		/* the failure counter moved since our snapshot: report it */
		if (new_failed != old_failed)
			ereport(ERROR,
					(errmsg("checkpoint request failed"),
					 errhint("Consult recent messages in the server log for details.")));
	}
}

/*
 * ForwardSyncRequest
 *		Forward a file-fsync request from a backend to the checkpointer
 *
 * Whenever a backend is compelled to write directly to a relation
 * (which should be seldom, if the background writer is getting its job done),
 * the backend calls this routine to pass over knowledge that the relation
 * is dirty and must be fsync'd before next checkpoint.  We also use this
 * opportunity to count such writes for statistical purposes.
 *
 * To avoid holding the lock for longer than necessary, we normally write
 * to the requests[] queue without checking for duplicates.  The checkpointer
 * will have to eliminate dups internally anyway.  However, if we discover
 * that the queue is full, we make a pass over the entire queue to compact
 * it.  This is somewhat expensive, but the alternative is for the backend
 * to perform its own fsync, which is far more expensive in practice.  It
 * is theoretically possible a backend fsync might still be necessary, if
 * the queue is full and contains no duplicate entries.  In that case, we
 * let the backend know by returning false.
+ */ +bool +ForwardSyncRequest(const FileTag *ftag, SyncRequestType type) +{ + CheckpointerRequest *request; + bool too_full; + + if (!IsUnderPostmaster) + return false; /* probably shouldn't even get here */ + + if (AmCheckpointerProcess()) + elog(ERROR, "ForwardSyncRequest must not be called in checkpointer"); + + LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); + + /* Count all backend writes regardless of if they fit in the queue */ + if (!AmBackgroundWriterProcess()) + CheckpointerShmem->num_backend_writes++; + + /* + * If the checkpointer isn't running or the request queue is full, the + * backend will have to perform its own fsync request. But before forcing + * that to happen, we can try to compact the request queue. + */ + if (CheckpointerShmem->checkpointer_pid == 0 || + (CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests && + !CompactCheckpointerRequestQueue())) + { + /* + * Count the subset of writes where backends have to do their own + * fsync + */ + if (!AmBackgroundWriterProcess()) + CheckpointerShmem->num_backend_fsync++; + LWLockRelease(CheckpointerCommLock); + return false; + } + + /* OK, insert request */ + request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++]; + request->ftag = *ftag; + request->type = type; + + /* If queue is more than half full, nudge the checkpointer to empty it */ + too_full = (CheckpointerShmem->num_requests >= + CheckpointerShmem->max_requests / 2); + + LWLockRelease(CheckpointerCommLock); + + /* ... but not till after we release the lock */ + if (too_full && ProcGlobal->checkpointerLatch) + SetLatch(ProcGlobal->checkpointerLatch); + + return true; +} + +/* + * CompactCheckpointerRequestQueue + * Remove duplicates from the request queue to avoid backend fsyncs. + * Returns "true" if any entries were removed. + * + * Although a full fsync request queue is not common, it can lead to severe + * performance problems when it does happen. 
 * So far, this situation has
 * only been observed to occur when the system is under heavy write load,
 * and especially during the "sync" phase of a checkpoint.  Without this
 * logic, each backend begins doing an fsync for every block written, which
 * gets very expensive and can slow down the whole system.
 *
 * Trying to do this every time the queue is full could lose if there
 * aren't any removable entries.  But that should be vanishingly rare in
 * practice: there's one queue entry per shared buffer.
 */
static bool
CompactCheckpointerRequestQueue(void)
{
	/* hash entry: the request is the key, slot is its latest queue index */
	struct CheckpointerSlotMapping
	{
		CheckpointerRequest request;
		int			slot;
	};

	int			n,
				preserve_count;
	int			num_skipped = 0;
	HASHCTL		ctl;
	HTAB	   *htab;
	bool	   *skip_slot;		/* skip_slot[i] true => drop queue entry i */

	/* must hold CheckpointerCommLock in exclusive mode */
	Assert(LWLockHeldByMe(CheckpointerCommLock));

	/* Initialize skip_slot array */
	skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests);

	/* Initialize temporary hash table */
	ctl.keysize = sizeof(CheckpointerRequest);
	ctl.entrysize = sizeof(struct CheckpointerSlotMapping);
	ctl.hcxt = CurrentMemoryContext;

	htab = hash_create("CompactCheckpointerRequestQueue",
					   CheckpointerShmem->num_requests,
					   &ctl,
					   HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

	/*
	 * The basic idea here is that a request can be skipped if it's followed
	 * by a later, identical request.  It might seem more sensible to work
	 * backwards from the end of the queue and check whether a request is
	 * *preceded* by an earlier, identical request, in the hopes of doing less
	 * copying.  But that might change the semantics, if there's an
	 * intervening SYNC_FORGET_REQUEST or SYNC_FILTER_REQUEST, so we do it
	 * this way.  It would be possible to be even smarter if we made the code
	 * below understand the specific semantics of such requests (it could blow
	 * away preceding entries that would end up being canceled anyhow), but
	 * it's not clear that the extra complexity would buy us anything.
	 */
	for (n = 0; n < CheckpointerShmem->num_requests; n++)
	{
		CheckpointerRequest *request;
		struct CheckpointerSlotMapping *slotmap;
		bool		found;

		/*
		 * We use the request struct directly as a hashtable key.  This
		 * assumes that any padding bytes in the structs are consistently the
		 * same, which should be okay because we zeroed them in
		 * CheckpointerShmemInit.  Note also that RelFileLocator had better
		 * contain no pad bytes.
		 */
		request = &CheckpointerShmem->requests[n];
		slotmap = hash_search(htab, request, HASH_ENTER, &found);
		if (found)
		{
			/* Duplicate, so mark the previous occurrence as skippable */
			skip_slot[slotmap->slot] = true;
			num_skipped++;
		}
		/* Remember slot containing latest occurrence of this request value */
		slotmap->slot = n;
	}

	/* Done with the hash table. */
	hash_destroy(htab);

	/* If no duplicates, we're out of luck. */
	if (!num_skipped)
	{
		pfree(skip_slot);
		return false;
	}

	/* We found some duplicates; remove them. */
	preserve_count = 0;
	for (n = 0; n < CheckpointerShmem->num_requests; n++)
	{
		if (skip_slot[n])
			continue;
		/* slide surviving entries down, keeping their relative order */
		CheckpointerShmem->requests[preserve_count++] = CheckpointerShmem->requests[n];
	}
	ereport(DEBUG1,
			(errmsg_internal("compacted fsync request queue from %d entries to %d entries",
							 CheckpointerShmem->num_requests, preserve_count)));
	CheckpointerShmem->num_requests = preserve_count;

	/* Cleanup. */
	pfree(skip_slot);
	return true;
}

/*
 * AbsorbSyncRequests
 *		Retrieve queued sync requests and pass them to sync mechanism.
 *
 * This is exported because it must be called during CreateCheckPoint;
 * we have to be sure we have accepted all pending requests just before
 * we start fsync'ing.  Since CreateCheckPoint sometimes runs in
 * non-checkpointer processes, do nothing if not checkpointer.
 */
void
AbsorbSyncRequests(void)
{
	CheckpointerRequest *requests = NULL;	/* private copy of the queue */
	CheckpointerRequest *request;
	int			n;

	if (!AmCheckpointerProcess())
		return;

	LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);

	/* Transfer stats counts into pending pgstats message */
	PendingCheckpointerStats.buf_written_backend
		+= CheckpointerShmem->num_backend_writes;
	PendingCheckpointerStats.buf_fsync_backend
		+= CheckpointerShmem->num_backend_fsync;

	CheckpointerShmem->num_backend_writes = 0;
	CheckpointerShmem->num_backend_fsync = 0;

	/*
	 * We try to avoid holding the lock for a long time by copying the request
	 * array, and processing the requests after releasing the lock.
	 *
	 * Once we have cleared the requests from shared memory, we have to PANIC
	 * if we then fail to absorb them (eg, because our hashtable runs out of
	 * memory).  This is because the system cannot run safely if we are unable
	 * to fsync what we have been told to fsync.  Fortunately, the hashtable
	 * is so small that the problem is quite unlikely to arise in practice.
	 */
	n = CheckpointerShmem->num_requests;
	if (n > 0)
	{
		/* the copy is made before the critical section is entered */
		requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest));
		memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest));
	}

	START_CRIT_SECTION();

	/* from here on the shared queue is empty; only our copy holds the data */
	CheckpointerShmem->num_requests = 0;

	LWLockRelease(CheckpointerCommLock);

	for (request = requests; n > 0; request++, n--)
		RememberSyncRequest(&request->ftag, request->type);

	END_CRIT_SECTION();

	if (requests)
		pfree(requests);
}

/*
 * Update any shared memory configurations based on config parameters
 */
static void
UpdateSharedMemoryConfig(void)
{
	/* update global shmem state for sync rep */
	SyncRepUpdateSyncStandbysDefined();

	/*
	 * If full_page_writes has been changed by SIGHUP, we update it in shared
	 * memory and write an XLOG_FPW_CHANGE record.
	 */
	UpdateFullPageWrites();

	elog(DEBUG2, "checkpointer updated shared memory configuration values");
}

/*
 * FirstCallSinceLastCheckpoint allows a process to take an action once
 * per checkpoint cycle by asynchronously checking for checkpoint completion.
 *
 * Returns true when the shared ckpt_done counter differs from the value
 * this process saw on its previous call; the remembered value is kept in a
 * function-local static.
 */
bool
FirstCallSinceLastCheckpoint(void)
{
	static int	ckpt_done = 0;	/* counter value seen on the previous call */
	int			new_done;
	bool		FirstCall = false;

	SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
	new_done = CheckpointerShmem->ckpt_done;
	SpinLockRelease(&CheckpointerShmem->ckpt_lck);

	if (new_done != ckpt_done)
		FirstCall = true;

	ckpt_done = new_done;

	return FirstCall;
}
diff --git a/src/backend/postmaster/fork_process.c b/src/backend/postmaster/fork_process.c
new file mode 100644
index 0000000..6f9c276
--- /dev/null
+++ b/src/backend/postmaster/fork_process.c
@@ -0,0 +1,126 @@
/*
 * fork_process.c
 *	 A simple wrapper on top of fork(). This does not handle the
 *	 EXEC_BACKEND case; it might be extended to do so, but it would be
 *	 considerably more complex.
+ * + * Copyright (c) 1996-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/postmaster/fork_process.c + */ +#include "postgres.h" + +#include <fcntl.h> +#include <signal.h> +#include <time.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <unistd.h> + +#include "libpq/pqsignal.h" +#include "postmaster/fork_process.h" + +#ifndef WIN32 +/* + * Wrapper for fork(). Return values are the same as those for fork(): + * -1 if the fork failed, 0 in the child process, and the PID of the + * child in the parent process. Signals are blocked while forking, so + * the child must unblock. + */ +pid_t +fork_process(void) +{ + pid_t result; + const char *oomfilename; + sigset_t save_mask; + +#ifdef LINUX_PROFILE + struct itimerval prof_itimer; +#endif + + /* + * Flush stdio channels just before fork, to avoid double-output problems. + */ + fflush(NULL); + +#ifdef LINUX_PROFILE + + /* + * Linux's fork() resets the profiling timer in the child process. If we + * want to profile child processes then we need to save and restore the + * timer setting. This is a waste of time if not profiling, however, so + * only do it if commanded by specific -DLINUX_PROFILE switch. + */ + getitimer(ITIMER_PROF, &prof_itimer); +#endif + + /* + * We start postmaster children with signals blocked. This allows them to + * install their own handlers before unblocking, to avoid races where they + * might run the postmaster's handler and miss an important control + * signal. With more analysis this could potentially be relaxed. + */ + sigprocmask(SIG_SETMASK, &BlockSig, &save_mask); + result = fork(); + if (result == 0) + { + /* fork succeeded, in child */ +#ifdef LINUX_PROFILE + setitimer(ITIMER_PROF, &prof_itimer, NULL); +#endif + + /* + * By default, Linux tends to kill the postmaster in out-of-memory + * situations, because it blames the postmaster for the sum of child + * process sizes *including shared memory*. 
(This is unbelievably + * stupid, but the kernel hackers seem uninterested in improving it.) + * Therefore it's often a good idea to protect the postmaster by + * setting its OOM score adjustment negative (which has to be done in + * a root-owned startup script). Since the adjustment is inherited by + * child processes, this would ordinarily mean that all the + * postmaster's children are equally protected against OOM kill, which + * is not such a good idea. So we provide this code to allow the + * children to change their OOM score adjustments again. Both the + * file name to write to and the value to write are controlled by + * environment variables, which can be set by the same startup script + * that did the original adjustment. + */ + oomfilename = getenv("PG_OOM_ADJUST_FILE"); + + if (oomfilename != NULL) + { + /* + * Use open() not stdio, to ensure we control the open flags. Some + * Linux security environments reject anything but O_WRONLY. + */ + int fd = open(oomfilename, O_WRONLY, 0); + + /* We ignore all errors */ + if (fd >= 0) + { + const char *oomvalue = getenv("PG_OOM_ADJUST_VALUE"); + int rc; + + if (oomvalue == NULL) /* supply a useful default */ + oomvalue = "0"; + + rc = write(fd, oomvalue, strlen(oomvalue)); + (void) rc; + close(fd); + } + } + + /* do post-fork initialization for random number generation */ + pg_strong_random_init(); + } + else + { + /* in parent, restore signal mask */ + sigprocmask(SIG_SETMASK, &save_mask, NULL); + } + + return result; +} + +#endif /* ! WIN32 */ diff --git a/src/backend/postmaster/interrupt.c b/src/backend/postmaster/interrupt.c new file mode 100644 index 0000000..6d4bd76 --- /dev/null +++ b/src/backend/postmaster/interrupt.c @@ -0,0 +1,117 @@ +/*------------------------------------------------------------------------- + * + * interrupt.c + * Interrupt handling routines. 
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/postmaster/interrupt.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <unistd.h>

#include "miscadmin.h"
#include "postmaster/interrupt.h"
#include "storage/ipc.h"
#include "storage/latch.h"
#include "storage/procsignal.h"
#include "utils/guc.h"
#include "utils/memutils.h"

/* Flags set by the signal handlers below and consumed by main loops. */
volatile sig_atomic_t ConfigReloadPending = false;
volatile sig_atomic_t ShutdownRequestPending = false;

/*
 * Simple interrupt handler for main loops of background processes.
 *
 * Services, in this order: a pending ProcSignal barrier, a pending config
 * reload, a pending shutdown request (exits via proc_exit(0) and so does
 * not return), and a pending request to log memory contexts.
 */
void
HandleMainLoopInterrupts(void)
{
	if (ProcSignalBarrierPending)
		ProcessProcSignalBarrier();

	if (ConfigReloadPending)
	{
		/* clear the flag before re-reading the configuration */
		ConfigReloadPending = false;
		ProcessConfigFile(PGC_SIGHUP);
	}

	if (ShutdownRequestPending)
		proc_exit(0);

	/* Perform logging of memory contexts of this process */
	if (LogMemoryContextPending)
		ProcessLogMemoryContextInterrupt();
}

/*
 * Simple signal handler for triggering a configuration reload.
 *
 * Normally, this handler would be used for SIGHUP. The idea is that code
 * which uses it would arrange to check the ConfigReloadPending flag at
 * convenient places inside main loops, or else call HandleMainLoopInterrupts.
 */
void
SignalHandlerForConfigReload(SIGNAL_ARGS)
{
	/* errno is saved here and restored on exit from the handler */
	int			save_errno = errno;

	ConfigReloadPending = true;
	SetLatch(MyLatch);

	errno = save_errno;
}

/*
 * Simple signal handler for exiting quickly as if due to a crash.
 *
 * Normally, this would be used for handling SIGQUIT.
 */
void
SignalHandlerForCrashExit(SIGNAL_ARGS)
{
	/*
	 * We DO NOT want to run proc_exit() or atexit() callbacks -- we're here
	 * because shared memory may be corrupted, so we don't want to try to
	 * clean up our transaction.  Just nail the windows shut and get out of
	 * town.  The callbacks wouldn't be safe to run from a signal handler,
	 * anyway.
	 *
	 * Note we do _exit(2) not _exit(0).  This is to force the postmaster into
	 * a system reset cycle if someone sends a manual SIGQUIT to a random
	 * backend.  This is necessary precisely because we don't clean up our
	 * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
	 * should ensure the postmaster sees this as a crash, too, but no harm in
	 * being doubly sure.)
	 */
	_exit(2);
}

/*
 * Simple signal handler for triggering a long-running background process to
 * shut down and exit.
 *
 * Typically, this handler would be used for SIGTERM, but some processes use
 * other signals. In particular, the checkpointer exits on SIGUSR2, and the WAL
 * writer and the logical replication parallel apply worker exit on either
 * SIGINT or SIGTERM.
 *
 * ShutdownRequestPending should be checked at a convenient place within the
 * main loop, or else the main loop should call HandleMainLoopInterrupts.
 */
void
SignalHandlerForShutdownRequest(SIGNAL_ARGS)
{
	/* errno is saved here and restored on exit from the handler */
	int			save_errno = errno;

	/* only record the request and set our latch; the main loop acts on it */
	ShutdownRequestPending = true;
	SetLatch(MyLatch);

	errno = save_errno;
}
diff --git a/src/backend/postmaster/meson.build b/src/backend/postmaster/meson.build
new file mode 100644
index 0000000..cda921f
--- /dev/null
+++ b/src/backend/postmaster/meson.build
@@ -0,0 +1,16 @@
# Copyright (c) 2022-2023, PostgreSQL Global Development Group

backend_sources += files(
  'autovacuum.c',
  'auxprocess.c',
  'bgworker.c',
  'bgwriter.c',
  'checkpointer.c',
  'fork_process.c',
  'interrupt.c',
  'pgarch.c',
  'postmaster.c',
  'startup.c',
  'syslogger.c',
  'walwriter.c',
)
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
new file mode 100644
index 0000000..46af349
--- /dev/null
+++ b/src/backend/postmaster/pgarch.c
@@ -0,0 +1,869 @@
/*-------------------------------------------------------------------------
 *
 * pgarch.c
 *
 *	PostgreSQL WAL archiver
 *
 *	All functions relating to archiver are included here
 *
 *	- All functions executed by archiver process
 *
 *	- archiver is forked from postmaster, and the two
 *	processes then communicate using signals. All functions
 *	executed by postmaster are included in this file.
 *
 *	Initial author: Simon Riggs		simon@2ndquadrant.com
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/postmaster/pgarch.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <time.h>
#include <sys/stat.h>
#include <unistd.h>

#include "access/xlog.h"
#include "access/xlog_internal.h"
#include "archive/archive_module.h"
#include "archive/shell_archive.h"
#include "lib/binaryheap.h"
#include "libpq/pqsignal.h"
#include "pgstat.h"
#include "postmaster/interrupt.h"
#include "postmaster/pgarch.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/latch.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
#include "storage/procsignal.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/guc.h"
#include "utils/memutils.h"
#include "utils/ps_status.h"


/* ----------
 * Timer definitions.
 * ----------
 */
#define PGARCH_AUTOWAKE_INTERVAL 60 /* How often to force a poll of the
									 * archive status directory; in seconds. */
#define PGARCH_RESTART_INTERVAL 10	/* How often to attempt to restart a
									 * failed archiver; in seconds. */

/*
 * Maximum number of retries allowed when attempting to archive a WAL
 * file.
 */
#define NUM_ARCHIVE_RETRIES 3

/*
 * Maximum number of retries allowed when attempting to remove an
 * orphan archive status file.
 */
#define NUM_ORPHAN_CLEANUP_RETRIES 3

/*
 * Maximum number of .ready files to gather per directory scan.
 */
#define NUM_FILES_PER_DIRECTORY_SCAN 64

/* Shared memory area for archiver process */
typedef struct PgArchData
{
	int			pgprocno;		/* pgprocno of archiver process */

	/*
	 * Forces a directory scan in pgarch_readyXlog().  Protected by arch_lck.
	 */
	bool		force_dir_scan;

	slock_t		arch_lck;		/* spinlock protecting force_dir_scan */
} PgArchData;

/* Name of the archive library setting; empty string by default. */
char	   *XLogArchiveLibrary = "";


/* ----------
 * Local data
 * ----------
 */
static time_t last_sigterm_time = 0;	/* when SIGTERM was first noticed; 0 =
										 * not yet */
static PgArchData *PgArch = NULL;	/* set by PgArchShmemInit() */
static const ArchiveModuleCallbacks *ArchiveCallbacks;
static ArchiveModuleState *archive_module_state;


/*
 * Stuff for tracking multiple files to archive from each scan of
 * archive_status.  Minimizing the number of directory scans when there are
 * many files to archive can significantly improve archival rate.
 *
 * arch_heap is a max-heap that is used during the directory scan to track
 * the highest-priority files to archive.  After the directory scan
 * completes, the file names are stored in ascending order of priority in
 * arch_files.  pgarch_readyXlog() returns files from arch_files until it
 * is empty, at which point another directory scan must be performed.
 *
 * We only need this data in the archiver process, so make it a palloc'd
 * struct rather than a bunch of static arrays.
 */
struct arch_files_state
{
	binaryheap *arch_heap;
	int			arch_files_size;	/* number of live entries in arch_files[] */
	char	   *arch_files[NUM_FILES_PER_DIRECTORY_SCAN];
	/* buffers underlying heap, and later arch_files[], entries: */
	char		arch_filenames[NUM_FILES_PER_DIRECTORY_SCAN][MAX_XFN_CHARS + 1];
};

static struct arch_files_state *arch_files = NULL;

/*
 * Flags set by interrupt handlers for later service in the main loop.
 */
static volatile sig_atomic_t ready_to_stop = false;

/* ----------
 * Local function forward declarations
 * ----------
 */
static void pgarch_waken_stop(SIGNAL_ARGS);
static void pgarch_MainLoop(void);
static void pgarch_ArchiverCopyLoop(void);
static bool pgarch_archiveXlog(char *xlog);
static bool pgarch_readyXlog(char *xlog);
static void pgarch_archiveDone(char *xlog);
static void pgarch_die(int code, Datum arg);
static void HandlePgArchInterrupts(void);
static int	ready_file_comparator(Datum a, Datum b, void *arg);
static void LoadArchiveLibrary(void);
static void pgarch_call_module_shutdown_cb(int code, Datum arg);

/* Report shared memory space needed by PgArchShmemInit */
Size
PgArchShmemSize(void)
{
	Size		size = 0;

	size = add_size(size, sizeof(PgArchData));

	return size;
}

/* Allocate and initialize archiver-related shared memory */
void
PgArchShmemInit(void)
{
	bool		found;

	PgArch = (PgArchData *)
		ShmemInitStruct("Archiver Data", PgArchShmemSize(), &found);

	if (!found)
	{
		/* First time through, so initialize */
		MemSet(PgArch, 0, PgArchShmemSize());
		/* no archiver has advertised itself yet */
		PgArch->pgprocno = INVALID_PGPROCNO;
		SpinLockInit(&PgArch->arch_lck);
	}
}

/*
 * PgArchCanRestart
 *
 * Return true, allowing the archiver to be restarted, if at least
 * PGARCH_RESTART_INTERVAL seconds have passed since the last time this
 * function returned true.  Otherwise return false.
 *
 * This is a safety valve to protect against continuous respawn attempts if the
 * archiver is dying immediately at launch. Note that since we will retry to
 * launch the archiver from the postmaster main loop, we will get another
 * chance later.
 */
bool
PgArchCanRestart(void)
{
	static time_t last_pgarch_start_time = 0;
	time_t		curtime = time(NULL);

	/*
	 * Return false and don't restart archiver if too soon since last archiver
	 * start.
	 */
	if ((unsigned int) (curtime - last_pgarch_start_time) <
		(unsigned int) PGARCH_RESTART_INTERVAL)
		return false;

	last_pgarch_start_time = curtime;
	return true;
}


/* Main entry point for archiver process */
void
PgArchiverMain(void)
{
	/*
	 * Ignore all signals usually bound to some action in the postmaster,
	 * except for SIGHUP, SIGTERM, SIGUSR1, SIGUSR2, and SIGQUIT.
	 */
	pqsignal(SIGHUP, SignalHandlerForConfigReload);
	pqsignal(SIGINT, SIG_IGN);
	pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
	/* SIGQUIT handler was already set up by InitPostmasterChild */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
	pqsignal(SIGUSR2, pgarch_waken_stop);

	/* Reset some signals that are accepted by postmaster but not here */
	pqsignal(SIGCHLD, SIG_DFL);

	/* Unblock signals (they were blocked when the postmaster forked us) */
	sigprocmask(SIG_SETMASK, &UnBlockSig, NULL);

	/* We shouldn't be launched unnecessarily. */
	Assert(XLogArchivingActive());

	/* Arrange to clean up at archiver exit */
	on_shmem_exit(pgarch_die, 0);

	/*
	 * Advertise our pgprocno so that backends can use our latch to wake us up
	 * while we're sleeping.
	 */
	PgArch->pgprocno = MyProc->pgprocno;

	/* Create workspace for pgarch_readyXlog() */
	arch_files = palloc(sizeof(struct arch_files_state));
	arch_files->arch_files_size = 0;

	/* Initialize our max-heap for prioritizing files to archive. */
	arch_files->arch_heap = binaryheap_allocate(NUM_FILES_PER_DIRECTORY_SCAN,
												ready_file_comparator, NULL);

	/* Load the archive_library. */
	LoadArchiveLibrary();

	pgarch_MainLoop();

	proc_exit(0);
}

/*
 * Wake up the archiver
 */
void
PgArchWakeup(void)
{
	/* read the advertised pgprocno once, then test and use that copy */
	int			arch_pgprocno = PgArch->pgprocno;

	/*
	 * We don't acquire ProcArrayLock here.  It's actually fine because
	 * procLatch isn't ever freed, so we just can potentially set the wrong
	 * process' (or no process') latch.  Even in that case the archiver will
	 * be relaunched shortly and will start archiving.
	 */
	if (arch_pgprocno != INVALID_PGPROCNO)
		SetLatch(&ProcGlobal->allProcs[arch_pgprocno].procLatch);
}


/* SIGUSR2 signal handler for archiver process */
static void
pgarch_waken_stop(SIGNAL_ARGS)
{
	int			save_errno = errno;

	/* set flag to do a final cycle and shut down afterwards */
	ready_to_stop = true;
	SetLatch(MyLatch);

	errno = save_errno;
}

/*
 * pgarch_MainLoop
 *
 * Main loop for archiver
 */
static void
pgarch_MainLoop(void)
{
	bool		time_to_stop;

	/*
	 * There shouldn't be anything for the archiver to do except to wait for a
	 * signal ... however, the archiver exists to protect our data, so it
	 * wakes up occasionally to allow itself to be proactive.
	 */
	do
	{
		ResetLatch(MyLatch);

		/* When we get SIGUSR2, we do one more archive cycle, then exit */
		time_to_stop = ready_to_stop;

		/* Check for barrier events and config update */
		HandlePgArchInterrupts();

		/*
		 * If we've gotten SIGTERM, we normally just sit and do nothing until
		 * SIGUSR2 arrives.  However, that means a random SIGTERM would
		 * disable archiving indefinitely, which doesn't seem like a good
		 * idea.  If more than 60 seconds pass since SIGTERM, exit anyway, so
		 * that the postmaster can start a new archiver if needed.
		 */
		if (ShutdownRequestPending)
		{
			time_t		curtime = time(NULL);

			/* first time we notice the request: just record when */
			if (last_sigterm_time == 0)
				last_sigterm_time = curtime;
			else if ((unsigned int) (curtime - last_sigterm_time) >=
					 (unsigned int) 60)
				break;
		}

		/* Do what we're here for */
		pgarch_ArchiverCopyLoop();

		/*
		 * Sleep until a signal is received, or until a poll is forced by
		 * PGARCH_AUTOWAKE_INTERVAL, or until postmaster dies.
		 */
		if (!time_to_stop)		/* Don't wait during last iteration */
		{
			int			rc;

			rc = WaitLatch(MyLatch,
						   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
						   PGARCH_AUTOWAKE_INTERVAL * 1000L,
						   WAIT_EVENT_ARCHIVER_MAIN);
			if (rc & WL_POSTMASTER_DEATH)
				time_to_stop = true;
		}

		/*
		 * The archiver quits either when the postmaster dies (not expected)
		 * or after completing one more archiving cycle after receiving
		 * SIGUSR2.
		 */
	} while (!time_to_stop);
}

/*
 * pgarch_ArchiverCopyLoop
 *
 * Archives all outstanding xlogs then returns
 *
 * Returns early, leaving work for a later call, on a pending shutdown
 * request, postmaster death, a failed check_configured_cb, or when either
 * retry limit (archiving or orphan cleanup) is reached.
 */
static void
pgarch_ArchiverCopyLoop(void)
{
	char		xlog[MAX_XFN_CHARS + 1];

	/* force directory scan in the first call to pgarch_readyXlog() */
	arch_files->arch_files_size = 0;

	/*
	 * loop through all xlogs with archive_status of .ready and archive
	 * them...mostly we expect this to be a single file, though it is possible
	 * some backend will add files onto the list of those that need archiving
	 * while we are still copying earlier archives
	 */
	while (pgarch_readyXlog(xlog))
	{
		/* retry counters are per file */
		int			failures = 0;
		int			failures_orphan = 0;

		for (;;)
		{
			struct stat stat_buf;
			char		pathname[MAXPGPATH];

			/*
			 * Do not initiate any more archive commands after receiving
			 * SIGTERM, nor after the postmaster has died unexpectedly. The
			 * first condition is to try to keep from having init SIGKILL the
			 * command, and the second is to avoid conflicts with another
			 * archiver spawned by a newer postmaster.
			 */
			if (ShutdownRequestPending || !PostmasterIsAlive())
				return;

			/*
			 * Check for barrier events and config update.  This is so that
			 * we'll adopt a new setting for archive_command as soon as
			 * possible, even if there is a backlog of files to be archived.
			 */
			HandlePgArchInterrupts();

			/* can't do anything if not configured ... */
			if (ArchiveCallbacks->check_configured_cb != NULL &&
				!ArchiveCallbacks->check_configured_cb(archive_module_state))
			{
				ereport(WARNING,
						(errmsg("archive_mode enabled, yet archiving is not configured")));
				return;
			}

			/*
			 * Since archive status files are not removed in a durable manner,
			 * a system crash could leave behind .ready files for WAL segments
			 * that have already been recycled or removed.  In this case,
			 * simply remove the orphan status file and move on.  unlink() is
			 * used here as even on subsequent crashes the same orphan files
			 * would get removed, so there is no need to worry about
			 * durability.
			 */
			snprintf(pathname, MAXPGPATH, XLOGDIR "/%s", xlog);
			if (stat(pathname, &stat_buf) != 0 && errno == ENOENT)
			{
				char		xlogready[MAXPGPATH];

				StatusFilePath(xlogready, xlog, ".ready");
				if (unlink(xlogready) == 0)
				{
					ereport(WARNING,
							(errmsg("removed orphan archive status file \"%s\"",
									xlogready)));

					/* leave loop and move to the next status file */
					break;
				}

				if (++failures_orphan >= NUM_ORPHAN_CLEANUP_RETRIES)
				{
					ereport(WARNING,
							(errmsg("removal of orphan archive status file \"%s\" failed too many times, will try again later",
									xlogready)));

					/* give up cleanup of orphan status files */
					return;
				}

				/* wait a bit before retrying */
				pg_usleep(1000000L);
				continue;
			}

			if (pgarch_archiveXlog(xlog))
			{
				/* successful */
				pgarch_archiveDone(xlog);

				/*
				 * Tell the cumulative stats system about the WAL file that we
				 * successfully archived
				 */
				pgstat_report_archiver(xlog, false);

				break;			/* out of inner retry loop */
			}
			else
			{
				/*
				 * Tell the cumulative stats system about the WAL file that we
				 * failed to archive
				 */
				pgstat_report_archiver(xlog, true);

				if (++failures >= NUM_ARCHIVE_RETRIES)
				{
					ereport(WARNING,
							(errmsg("archiving write-ahead log file \"%s\" failed too many times, will try again later",
									xlog)));
					return;		/* give up archiving for now */
				}
				pg_usleep(1000000L);	/* wait a bit before retrying */
			}
		}
	}
}

/*
 * pgarch_archiveXlog
 *
 * Invokes archive_file_cb to copy one archive file to wherever it should go
 *
 * The PS display is updated before the callback ("archiving ...") and again
 * afterwards with the outcome ("last was ..." or "failed on ...").
 *
 * Returns true if successful
 */
static bool
pgarch_archiveXlog(char *xlog)
{
	char		pathname[MAXPGPATH];
	char		activitymsg[MAXFNAMELEN + 16];
	bool		ret;

	/* path of the WAL file handed to the archive callback */
	snprintf(pathname, MAXPGPATH, XLOGDIR "/%s", xlog);

	/* Report archive activity in PS display */
	snprintf(activitymsg, sizeof(activitymsg), "archiving %s", xlog);
	set_ps_display(activitymsg);

	ret = ArchiveCallbacks->archive_file_cb(archive_module_state, xlog, pathname);
	if (ret)
		snprintf(activitymsg, sizeof(activitymsg), "last was %s", xlog);
	else
		snprintf(activitymsg, sizeof(activitymsg), "failed on %s", xlog);
	set_ps_display(activitymsg);

	return ret;
}

/*
 * pgarch_readyXlog
 *
 * Return name of the oldest xlog file that has not yet been archived.
 * No notification is set that file archiving is now in progress, so
 * this would need to be extended if multiple concurrent archival
 * tasks were created. If a failure occurs, we will completely
 * re-copy the file at the next available opportunity.
 *
 * It is important that we return the oldest, so that we archive xlogs
 * in order that they were written, for two reasons:
 * 1) to maintain the sequential chain of xlogs required for recovery
 * 2) because the oldest ones will sooner become candidates for
 * recycling at time of checkpoint
 *
 * NOTE: the "oldest" comparison will consider any .history file to be older
 * than any other file except another .history file.  Segments on a timeline
 * with a smaller ID will be older than all segments on a timeline with a
 * larger ID; the net result being that past timelines are given higher
 * priority for archiving.  This seems okay, or at least not obviously worth
 * changing.
+ */ +static bool +pgarch_readyXlog(char *xlog) +{ + char XLogArchiveStatusDir[MAXPGPATH]; + DIR *rldir; + struct dirent *rlde; + bool force_dir_scan; + + /* + * If a directory scan was requested, clear the stored file names and + * proceed. + */ + SpinLockAcquire(&PgArch->arch_lck); + force_dir_scan = PgArch->force_dir_scan; + PgArch->force_dir_scan = false; + SpinLockRelease(&PgArch->arch_lck); + + if (force_dir_scan) + arch_files->arch_files_size = 0; + + /* + * If we still have stored file names from the previous directory scan, + * try to return one of those. We check to make sure the status file is + * still present, as the archive_command for a previous file may have + * already marked it done. + */ + while (arch_files->arch_files_size > 0) + { + struct stat st; + char status_file[MAXPGPATH]; + char *arch_file; + + arch_files->arch_files_size--; + arch_file = arch_files->arch_files[arch_files->arch_files_size]; + StatusFilePath(status_file, arch_file, ".ready"); + + if (stat(status_file, &st) == 0) + { + strcpy(xlog, arch_file); + return true; + } + else if (errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", status_file))); + } + + /* arch_heap is probably empty, but let's make sure */ + binaryheap_reset(arch_files->arch_heap); + + /* + * Open the archive status directory and read through the list of files + * with the .ready suffix, looking for the earliest files. 
+ */ + snprintf(XLogArchiveStatusDir, MAXPGPATH, XLOGDIR "/archive_status"); + rldir = AllocateDir(XLogArchiveStatusDir); + + while ((rlde = ReadDir(rldir, XLogArchiveStatusDir)) != NULL) + { + int basenamelen = (int) strlen(rlde->d_name) - 6; + char basename[MAX_XFN_CHARS + 1]; + char *arch_file; + + /* Ignore entries with unexpected number of characters */ + if (basenamelen < MIN_XFN_CHARS || + basenamelen > MAX_XFN_CHARS) + continue; + + /* Ignore entries with unexpected characters */ + if (strspn(rlde->d_name, VALID_XFN_CHARS) < basenamelen) + continue; + + /* Ignore anything not suffixed with .ready */ + if (strcmp(rlde->d_name + basenamelen, ".ready") != 0) + continue; + + /* Truncate off the .ready */ + memcpy(basename, rlde->d_name, basenamelen); + basename[basenamelen] = '\0'; + + /* + * Store the file in our max-heap if it has a high enough priority. + */ + if (arch_files->arch_heap->bh_size < NUM_FILES_PER_DIRECTORY_SCAN) + { + /* If the heap isn't full yet, quickly add it. */ + arch_file = arch_files->arch_filenames[arch_files->arch_heap->bh_size]; + strcpy(arch_file, basename); + binaryheap_add_unordered(arch_files->arch_heap, CStringGetDatum(arch_file)); + + /* If we just filled the heap, make it a valid one. */ + if (arch_files->arch_heap->bh_size == NUM_FILES_PER_DIRECTORY_SCAN) + binaryheap_build(arch_files->arch_heap); + } + else if (ready_file_comparator(binaryheap_first(arch_files->arch_heap), + CStringGetDatum(basename), NULL) > 0) + { + /* + * Remove the lowest priority file and add the current one to the + * heap. + */ + arch_file = DatumGetCString(binaryheap_remove_first(arch_files->arch_heap)); + strcpy(arch_file, basename); + binaryheap_add(arch_files->arch_heap, CStringGetDatum(arch_file)); + } + } + FreeDir(rldir); + + /* If no files were found, simply return. */ + if (arch_files->arch_heap->bh_size == 0) + return false; + + /* + * If we didn't fill the heap, we didn't make it a valid one. Do that + * now. 
+ */ + if (arch_files->arch_heap->bh_size < NUM_FILES_PER_DIRECTORY_SCAN) + binaryheap_build(arch_files->arch_heap); + + /* + * Fill arch_files array with the files to archive in ascending order of + * priority. + */ + arch_files->arch_files_size = arch_files->arch_heap->bh_size; + for (int i = 0; i < arch_files->arch_files_size; i++) + arch_files->arch_files[i] = DatumGetCString(binaryheap_remove_first(arch_files->arch_heap)); + + /* Return the highest priority file. */ + arch_files->arch_files_size--; + strcpy(xlog, arch_files->arch_files[arch_files->arch_files_size]); + + return true; +} + +/* + * ready_file_comparator + * + * Compares the archival priority of the given files to archive. If "a" + * has a higher priority than "b", a negative value will be returned. If + * "b" has a higher priority than "a", a positive value will be returned. + * If "a" and "b" have equivalent values, 0 will be returned. + */ +static int +ready_file_comparator(Datum a, Datum b, void *arg) +{ + char *a_str = DatumGetCString(a); + char *b_str = DatumGetCString(b); + bool a_history = IsTLHistoryFileName(a_str); + bool b_history = IsTLHistoryFileName(b_str); + + /* Timeline history files always have the highest priority. */ + if (a_history != b_history) + return a_history ? -1 : 1; + + /* Priority is given to older files. */ + return strcmp(a_str, b_str); +} + +/* + * PgArchForceDirScan + * + * When called, the next call to pgarch_readyXlog() will perform a + * directory scan. This is useful for ensuring that important files such + * as timeline history files are archived as quickly as possible. + */ +void +PgArchForceDirScan(void) +{ + SpinLockAcquire(&PgArch->arch_lck); + PgArch->force_dir_scan = true; + SpinLockRelease(&PgArch->arch_lck); +} + +/* + * pgarch_archiveDone + * + * Emit notification that an xlog file has been successfully archived. + * We do this by renaming the status file from NNN.ready to NNN.done. 
+ * Eventually, a checkpoint process will notice this and delete both the + * NNN.done file and the xlog file itself. + */ +static void +pgarch_archiveDone(char *xlog) +{ + char rlogready[MAXPGPATH]; + char rlogdone[MAXPGPATH]; + + StatusFilePath(rlogready, xlog, ".ready"); + StatusFilePath(rlogdone, xlog, ".done"); + + /* + * To avoid extra overhead, we don't durably rename the .ready file to + * .done. Archive commands and libraries must gracefully handle attempts + * to re-archive files (e.g., if the server crashes just before this + * function is called), so it should be okay if the .ready file reappears + * after a crash. + */ + if (rename(rlogready, rlogdone) < 0) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + rlogready, rlogdone))); +} + + +/* + * pgarch_die + * + * Exit-time cleanup handler + */ +static void +pgarch_die(int code, Datum arg) +{ + PgArch->pgprocno = INVALID_PGPROCNO; +} + +/* + * Interrupt handler for WAL archiver process. + * + * This is called in the loops pgarch_MainLoop and pgarch_ArchiverCopyLoop. + * It checks for barrier events, config update and request for logging of + * memory contexts, but not shutdown request because how to handle + * shutdown request is different between those loops. 
+ */ +static void +HandlePgArchInterrupts(void) +{ + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); + + /* Perform logging of memory contexts of this process */ + if (LogMemoryContextPending) + ProcessLogMemoryContextInterrupt(); + + if (ConfigReloadPending) + { + char *archiveLib = pstrdup(XLogArchiveLibrary); + bool archiveLibChanged; + + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + + if (XLogArchiveLibrary[0] != '\0' && XLogArchiveCommand[0] != '\0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("both archive_command and archive_library set"), + errdetail("Only one of archive_command, archive_library may be set."))); + + archiveLibChanged = strcmp(XLogArchiveLibrary, archiveLib) != 0; + pfree(archiveLib); + + if (archiveLibChanged) + { + /* + * Ideally, we would simply unload the previous archive module and + * load the new one, but there is presently no mechanism for + * unloading a library (see the comment above + * internal_load_library()). To deal with this, we simply restart + * the archiver. The new archive module will be loaded when the + * new archiver process starts up. Note that this triggers the + * module's shutdown callback, if defined. + */ + ereport(LOG, + (errmsg("restarting archiver process because value of " + "\"archive_library\" was changed"))); + + proc_exit(0); + } + } +} + +/* + * LoadArchiveLibrary + * + * Loads the archiving callbacks into our local ArchiveCallbacks. + */ +static void +LoadArchiveLibrary(void) +{ + ArchiveModuleInit archive_init; + + if (XLogArchiveLibrary[0] != '\0' && XLogArchiveCommand[0] != '\0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("both archive_command and archive_library set"), + errdetail("Only one of archive_command, archive_library may be set."))); + + /* + * If shell archiving is enabled, use our special initialization function. + * Otherwise, load the library and call its _PG_archive_module_init(). 
+ */ + if (XLogArchiveLibrary[0] == '\0') + archive_init = shell_archive_init; + else + archive_init = (ArchiveModuleInit) + load_external_function(XLogArchiveLibrary, + "_PG_archive_module_init", false, NULL); + + if (archive_init == NULL) + ereport(ERROR, + (errmsg("archive modules have to define the symbol %s", "_PG_archive_module_init"))); + + ArchiveCallbacks = (*archive_init) (); + + if (ArchiveCallbacks->archive_file_cb == NULL) + ereport(ERROR, + (errmsg("archive modules must register an archive callback"))); + + archive_module_state = (ArchiveModuleState *) palloc0(sizeof(ArchiveModuleState)); + if (ArchiveCallbacks->startup_cb != NULL) + ArchiveCallbacks->startup_cb(archive_module_state); + + before_shmem_exit(pgarch_call_module_shutdown_cb, 0); +} + +/* + * Call the shutdown callback of the loaded archive module, if defined. + */ +static void +pgarch_call_module_shutdown_cb(int code, Datum arg) +{ + if (ArchiveCallbacks->shutdown_cb != NULL) + ArchiveCallbacks->shutdown_cb(archive_module_state); +} diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c new file mode 100644 index 0000000..b42aae4 --- /dev/null +++ b/src/backend/postmaster/postmaster.c @@ -0,0 +1,6549 @@ +/*------------------------------------------------------------------------- + * + * postmaster.c + * This program acts as a clearing house for requests to the + * POSTGRES system. Frontend programs send a startup message + * to the Postmaster and the postmaster uses the info in the + * message to setup a backend process. + * + * The postmaster also manages system-wide operations such as + * startup and shutdown. The postmaster itself doesn't do those + * operations, mind you --- it just forks off a subprocess to do them + * at the right times. It also takes care of resetting the system + * if a backend crashes. + * + * The postmaster process creates the shared memory and semaphore + * pools during startup, but as a rule does not touch them itself. 
+ * In particular, it is not a member of the PGPROC array of backends + * and so it cannot participate in lock-manager operations. Keeping + * the postmaster away from shared memory operations makes it simpler + * and more reliable. The postmaster is almost always able to recover + * from crashes of individual backends by resetting shared memory; + * if it did much with shared memory then it would be prone to crashing + * along with the backends. + * + * When a request message is received, we now fork() immediately. + * The child process performs authentication of the request, and + * then becomes a backend if successful. This allows the auth code + * to be written in a simple single-threaded style (as opposed to the + * crufty "poor man's multitasking" code that used to be needed). + * More importantly, it ensures that blockages in non-multithreaded + * libraries like SSL or PAM cannot cause denial of service to other + * clients. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/postmaster/postmaster.c + * + * NOTES + * + * Initialization: + * The Postmaster sets up shared memory data structures + * for the backends. + * + * Synchronization: + * The Postmaster shares memory with the backends but should avoid + * touching shared memory, so as not to become stuck if a crashing + * backend screws up locks or shared memory. Likewise, the Postmaster + * should never block on messages from frontend clients. + * + * Garbage Collection: + * The Postmaster cleans up after backends if they have an emergency + * exit and/or core dump. + * + * Error Reporting: + * Use write_stderr() only for reporting "interactive" errors + * (essentially, bogus arguments on the command line). Once the + * postmaster is launched, use ereport(). 
+ * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <unistd.h> +#include <signal.h> +#include <time.h> +#include <sys/wait.h> +#include <ctype.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <fcntl.h> +#include <sys/param.h> +#include <netdb.h> +#include <limits.h> + +#ifdef USE_BONJOUR +#include <dns_sd.h> +#endif + +#ifdef USE_SYSTEMD +#include <systemd/sd-daemon.h> +#endif + +#ifdef HAVE_PTHREAD_IS_THREADED_NP +#include <pthread.h> +#endif + +#include "access/transam.h" +#include "access/xlog.h" +#include "access/xlogrecovery.h" +#include "catalog/pg_control.h" +#include "common/file_perm.h" +#include "common/ip.h" +#include "common/pg_prng.h" +#include "common/string.h" +#include "lib/ilist.h" +#include "libpq/auth.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "libpq/pqsignal.h" +#include "nodes/queryjumble.h" +#include "pg_getopt.h" +#include "pgstat.h" +#include "port/pg_bswap.h" +#include "postmaster/autovacuum.h" +#include "postmaster/auxprocess.h" +#include "postmaster/bgworker_internals.h" +#include "postmaster/fork_process.h" +#include "postmaster/interrupt.h" +#include "postmaster/pgarch.h" +#include "postmaster/postmaster.h" +#include "postmaster/syslogger.h" +#include "replication/logicallauncher.h" +#include "replication/walsender.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/pg_shmem.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/datetime.h" +#include "utils/memutils.h" +#include "utils/pidfile.h" +#include "utils/ps_status.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" +#include "utils/varlena.h" + +#ifdef EXEC_BACKEND +#include "storage/spin.h" +#endif + + +/* + * Possible types of a backend. 
Beyond being the possible bkend_type values in + * struct bkend, these are OR-able request flag bits for SignalSomeChildren() + * and CountChildren(). + */ +#define BACKEND_TYPE_NORMAL 0x0001 /* normal backend; also used for dead_end children and for walsenders until they are relabeled */ +#define BACKEND_TYPE_AUTOVAC 0x0002 /* autovacuum worker process */ +#define BACKEND_TYPE_WALSND 0x0004 /* walsender process */ +#define BACKEND_TYPE_BGWORKER 0x0008 /* bgworker process */ +#define BACKEND_TYPE_ALL 0x000F /* OR of all the above, i.e. a mask selecting every backend type */ + +/* + * List of active backends (or child processes anyway; we don't actually + * know whether a given child has become a backend or is still in the + * authorization phase). This is used mainly to keep track of how many + * children we have and send them appropriate signals when necessary. + * + * As shown in the above set of backend types, this list includes not only + * "normal" client sessions, but also autovacuum workers, walsenders, and + * background workers. (Note that at the time of launch, walsenders are + * labeled BACKEND_TYPE_NORMAL; we relabel them to BACKEND_TYPE_WALSND + * upon noticing they've changed their PMChildFlags entry. Hence that check + * must be done before any operation that needs to distinguish walsenders + * from normal backends.) + * + * Also, "dead_end" children are in it: these are children launched just for + * the purpose of sending a friendly rejection message to a would-be client. + * We must track them because they are attached to shared memory, but we know + * they will never become live backends. dead_end children are not assigned a + * PMChildSlot. dead_end children have bkend_type NORMAL. + * + * "Special" children such as the startup, bgwriter and autovacuum launcher + * tasks are not in this list. They are tracked via StartupPID and other + * pid_t variables below. (Thus, there can't be more than one of any given + * "special" child process type. We use BackendList entries for any child + * process there can be more than one of.) 
+ */ +typedef struct bkend +{ + pid_t pid; /* process id of backend */ + int32 cancel_key; /* cancel key for cancels for this backend */ + int child_slot; /* PMChildSlot for this backend, if any (dead_end children are not assigned one) */ + int bkend_type; /* child process flavor: one of the BACKEND_TYPE_* values, see above */ + bool dead_end; /* is it going to send an error and quit? */ + bool bgworker_notify; /* gets bgworker start/stop notifications */ + dlist_node elem; /* list link in BackendList */ +} Backend; + +/* Head of the list of Backend entries; entries are linked through their elem field */ +static dlist_head BackendList = DLIST_STATIC_INIT(BackendList); + +#ifdef EXEC_BACKEND +/* Only present in EXEC_BACKEND builds */ +static Backend *ShmemBackendArray; +#endif + +BackgroundWorker *MyBgworkerEntry = NULL; + + + +/* The socket number we are listening for connections on */ +int PostPortNumber = DEF_PGPORT; + +/* The directory names for Unix socket(s) */ +char *Unix_socket_directories; + +/* The TCP listen address(es) */ +char *ListenAddresses; + +/* + * SuperuserReservedConnections is the number of backends reserved for + * superuser use, and ReservedConnections is the number of backends reserved + * for use by roles with privileges of the pg_use_reserved_connections + * predefined role. These are taken out of the pool of MaxConnections backend + * slots, so the number of backend slots available for roles that are neither + * superuser nor have privileges of pg_use_reserved_connections is + * (MaxConnections - SuperuserReservedConnections - ReservedConnections). + * + * If the number of remaining slots is less than or equal to + * SuperuserReservedConnections, only superusers can make new connections. If + * the number of remaining slots is greater than SuperuserReservedConnections + * but less than or equal to + * (SuperuserReservedConnections + ReservedConnections), only superusers and + * roles with privileges of pg_use_reserved_connections can make new + * connections. Note that pre-existing superuser and + * pg_use_reserved_connections connections don't count against the limits. 
+ */ +int SuperuserReservedConnections; +int ReservedConnections; + +/* The socket(s) we're listening to; the array holds at most MAXLISTEN of them. */ +#define MAXLISTEN 64 +static pgsocket ListenSocket[MAXLISTEN]; + +/* still more option variables */ +bool EnableSSL = false; + +int PreAuthDelay = 0; +int AuthenticationTimeout = 60; + +bool log_hostname; /* for ps display and logging */ +bool Log_connections = false; +bool Db_user_namespace = false; + +bool enable_bonjour = false; +char *bonjour_name; +bool restart_after_crash = true; +bool remove_temp_files_after_crash = true; +bool send_abort_for_crash = false; +bool send_abort_for_kill = false; + +/* PIDs of special child processes; 0 when not running */ +static pid_t StartupPID = 0, + BgWriterPID = 0, + CheckpointerPID = 0, + WalWriterPID = 0, + WalReceiverPID = 0, + AutoVacPID = 0, + PgArchPID = 0, + SysLoggerPID = 0; + +/* Startup process's status */ +typedef enum +{ + STARTUP_NOT_RUNNING, + STARTUP_RUNNING, + STARTUP_SIGNALED, /* we sent it a SIGQUIT or SIGKILL */ + STARTUP_CRASHED +} StartupStatusEnum; + +/* Current status of the startup process; initially STARTUP_NOT_RUNNING */ +static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING; + +/* Startup/shutdown state: the values stored in the Shutdown variable below */ +#define NoShutdown 0 +#define SmartShutdown 1 +#define FastShutdown 2 +#define ImmediateShutdown 3 + +static int Shutdown = NoShutdown; + +static bool FatalError = false; /* T if recovering from backend crash */ + +/* + * We use a simple state machine to control startup, shutdown, and + * crash recovery (which is rather like shutdown followed by startup). + * + * After doing all the postmaster initialization work, we enter PM_STARTUP + * state and the startup process is launched. The startup process begins by + * reading the control file and other preliminary initialization steps. + * In a normal startup, or after crash recovery, the startup process exits + * with exit code 0 and we switch to PM_RUN state. However, archive recovery + * is handled specially since it takes much longer and we would like to support + * hot standby during archive recovery. 
+ * + * When the startup process is ready to start archive recovery, it signals the + * postmaster, and we switch to PM_RECOVERY state. The background writer and + * checkpointer are launched, while the startup process continues applying WAL. + * If Hot Standby is enabled, then, after reaching a consistent point in WAL + * redo, startup process signals us again, and we switch to PM_HOT_STANDBY + * state and begin accepting connections to perform read-only queries. When + * archive recovery is finished, the startup process exits with exit code 0 + * and we switch to PM_RUN state. + * + * Normal child backends can only be launched when we are in PM_RUN or + * PM_HOT_STANDBY state. (connsAllowed can also restrict launching.) + * In other states we handle connection requests by launching "dead_end" + * child processes, which will simply send the client an error message and + * quit. (We track these in the BackendList so that we can know when they + * are all gone; this is important because they're still connected to shared + * memory, and would interfere with an attempt to destroy the shmem segment, + * possibly leading to SHMALL failure when we try to make a new one.) + * In PM_WAIT_DEAD_END state we are waiting for all the dead_end children + * to drain out of the system, and therefore stop accepting connection + * requests at all until the last existing child has quit (which hopefully + * will not be very long). + * + * Notice that this state variable does not distinguish *why* we entered + * states later than PM_RUN --- Shutdown and FatalError must be consulted + * to find that out. FatalError is never true in PM_RECOVERY, PM_HOT_STANDBY, + * or PM_RUN states, nor in PM_SHUTDOWN states (because we don't enter those + * states when trying to recover from a crash). It can be true in PM_STARTUP + * state, because we don't clear it until we've successfully started WAL redo. 
+ */ +typedef enum +{ + PM_INIT, /* postmaster starting */ + PM_STARTUP, /* waiting for startup subprocess */ + PM_RECOVERY, /* in archive recovery mode */ + PM_HOT_STANDBY, /* in hot standby mode */ + PM_RUN, /* normal "database is alive" state */ + PM_STOP_BACKENDS, /* need to stop remaining backends */ + PM_WAIT_BACKENDS, /* waiting for live backends to exit */ + PM_SHUTDOWN, /* waiting for checkpointer to do shutdown + * ckpt */ + PM_SHUTDOWN_2, /* waiting for archiver and walsenders to + * finish */ + PM_WAIT_DEAD_END, /* waiting for dead_end children to exit */ + PM_NO_CHILDREN /* all important children have exited */ +} PMState; + +/* Current state of the postmaster state machine; starts out as PM_INIT */ +static PMState pmState = PM_INIT; + +/* + * While performing a "smart shutdown", we restrict new connections but stay + * in PM_RUN or PM_HOT_STANDBY state until all the client backends are gone. + * connsAllowed is a sub-state indicator showing the active restriction. + * It is of no interest unless pmState is PM_RUN or PM_HOT_STANDBY. + */ +static bool connsAllowed = true; + +/* Start time of SIGKILL timeout during immediate shutdown or child crash */ +/* Zero means timeout is not running */ +static time_t AbortStartTime = 0; + +/* Length of said timeout, in seconds */ +#define SIGKILL_CHILDREN_AFTER_SECS 5 + +static bool ReachedNormalRunning = false; /* T if we've reached PM_RUN */ + +bool ClientAuthInProgress = false; /* T during new-client + * authentication */ + +bool redirection_done = false; /* stderr redirected for syslogger? 
*/ + +/* received START_AUTOVAC_LAUNCHER signal */ +static bool start_autovac_launcher = false; + +/* the launcher needs to be signaled to communicate some condition */ +static bool avlauncher_needs_signal = false; + +/* received START_WALRECEIVER signal */ +static bool WalReceiverRequested = false; + +/* set when there's a worker that needs to be started up */ +static bool StartWorkerNeeded = true; +static bool HaveCrashedWorker = false; + +/* set when signals arrive (NOTE(review): presumably written by the handle_pm_*_signal handlers and consumed by the process_pm_* functions declared below -- confirm) */ +static volatile sig_atomic_t pending_pm_pmsignal; +static volatile sig_atomic_t pending_pm_child_exit; +static volatile sig_atomic_t pending_pm_reload_request; +static volatile sig_atomic_t pending_pm_shutdown_request; +static volatile sig_atomic_t pending_pm_fast_shutdown_request; +static volatile sig_atomic_t pending_pm_immediate_shutdown_request; + +/* event multiplexing object */ +static WaitEventSet *pm_wait_set; + +#ifdef USE_SSL +/* Set when and if SSL has been initialized properly */ +static bool LoadedSSL = false; +#endif + +#ifdef USE_BONJOUR +static DNSServiceRef bonjour_sdref = NULL; +#endif + +/* + * postmaster.c - function prototypes + */ +static void CloseServerPorts(int status, Datum arg); +static void unlink_external_pid_file(int status, Datum arg); +static void getInstallationPaths(const char *argv0); +static void checkControlFile(void); +static Port *ConnCreate(int serverFd); +static void ConnFree(Port *port); +static void handle_pm_pmsignal_signal(SIGNAL_ARGS); +static void handle_pm_child_exit_signal(SIGNAL_ARGS); +static void handle_pm_reload_request_signal(SIGNAL_ARGS); +static void handle_pm_shutdown_request_signal(SIGNAL_ARGS); +static void process_pm_pmsignal(void); +static void process_pm_child_exit(void); +static void process_pm_reload_request(void); +static void process_pm_shutdown_request(void); +static void process_startup_packet_die(SIGNAL_ARGS); +static void dummy_handler(SIGNAL_ARGS); +static void StartupPacketTimeoutHandler(void); +static void CleanupBackend(int 
pid, int exitstatus); +static bool CleanupBackgroundWorker(int pid, int exitstatus); +static void HandleChildCrash(int pid, int exitstatus, const char *procname); +static void LogChildExit(int lev, const char *procname, + int pid, int exitstatus); +static void PostmasterStateMachine(void); +static void BackendInitialize(Port *port); +static void BackendRun(Port *port) pg_attribute_noreturn(); +static void ExitPostmaster(int status) pg_attribute_noreturn(); +static int ServerLoop(void); +static int BackendStartup(Port *port); +static int ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done); +static void SendNegotiateProtocolVersion(List *unrecognized_protocol_options); +static void processCancelRequest(Port *port, void *pkt); +static void report_fork_failure_to_client(Port *port, int errnum); +static CAC_state canAcceptConnections(int backend_type); +static bool RandomCancelKey(int32 *cancel_key); +static void signal_child(pid_t pid, int signal); +static void sigquit_child(pid_t pid); +static bool SignalSomeChildren(int signal, int target); +static void TerminateChildren(int signal); + +/* Shorthand for SignalSomeChildren() with a target of BACKEND_TYPE_ALL, i.e. every backend type */ +#define SignalChildren(sig) SignalSomeChildren(sig, BACKEND_TYPE_ALL) + +static int CountChildren(int target); +static bool assign_backendlist_entry(RegisteredBgWorker *rw); +static void maybe_start_bgworkers(void); +static bool CreateOptsFile(int argc, char *argv[], char *fullprogname); +static pid_t StartChildProcess(AuxProcType type); +static void StartAutovacuumWorker(void); +static void MaybeStartWalReceiver(void); +static void InitPostmasterDeathWatchHandle(void); + +/* + * Archiver is allowed to start up at the current postmaster state? + * + * If WAL archiving is enabled always, we are allowed to start archiver + * even during recovery. 
+ */ +#define PgArchStartupAllowed() \ + (((XLogArchivingActive() && pmState == PM_RUN) || \ + (XLogArchivingAlways() && \ + (pmState == PM_RECOVERY || pmState == PM_HOT_STANDBY))) && \ + PgArchCanRestart()) + +#ifdef EXEC_BACKEND + +#ifdef WIN32 +#define WNOHANG 0 /* ignored, so any integer value will do */ + +static pid_t waitpid(pid_t pid, int *exitstatus, int options); +static void WINAPI pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired); + +static HANDLE win32ChildQueue; + +/* NOTE(review): appears to bundle a wait handle with a child's process handle and id for pgwin32_deadchild_callback -- confirm against its definition */ +typedef struct +{ + HANDLE waitHandle; + HANDLE procHandle; + DWORD procId; +} win32_deadchild_waitinfo; +#endif /* WIN32 */ + +static pid_t backend_forkexec(Port *port); +static pid_t internal_forkexec(int argc, char *argv[], Port *port); + +/* Type for a socket that can be inherited to a client process; a plain int except on WIN32 */ +#ifdef WIN32 +typedef struct +{ + SOCKET origsocket; /* Original socket value, or PGINVALID_SOCKET + * if not a socket */ + WSAPROTOCOL_INFO wsainfo; +} InheritableSocket; +#else +typedef int InheritableSocket; +#endif + +/* + * Structure contains all variables passed to exec:ed backends. + * + * It is the parameter type of save_backend_variables() and + * restore_backend_variables(), declared below. Several members differ + * between WIN32 and other platforms, and SpinlockSemaArray exists only + * when HAVE_SPINLOCKS is not defined. + */ +typedef struct +{ + Port port; + InheritableSocket portsocket; + char DataDir[MAXPGPATH]; + pgsocket ListenSocket[MAXLISTEN]; + int32 MyCancelKey; + int MyPMChildSlot; +#ifndef WIN32 + unsigned long UsedShmemSegID; +#else + void *ShmemProtectiveRegion; + HANDLE UsedShmemSegID; +#endif + void *UsedShmemSegAddr; + slock_t *ShmemLock; + VariableCache ShmemVariableCache; + Backend *ShmemBackendArray; +#ifndef HAVE_SPINLOCKS + PGSemaphore *SpinlockSemaArray; +#endif + int NamedLWLockTrancheRequests; + NamedLWLockTranche *NamedLWLockTrancheArray; + LWLockPadded *MainLWLockArray; + slock_t *ProcStructLock; + PROC_HDR *ProcGlobal; + PGPROC *AuxiliaryProcs; + PGPROC *PreparedXactProcs; + PMSignalData *PMSignalState; + pid_t PostmasterPid; + TimestampTz PgStartTime; + TimestampTz PgReloadTime; + pg_time_t first_syslogger_file_time; + bool redirection_done; + bool IsBinaryUpgrade; + 
bool query_id_enabled; + int max_safe_fds; + int MaxBackends; +#ifdef WIN32 + HANDLE PostmasterHandle; + HANDLE initial_signal_pipe; + HANDLE syslogPipe[2]; +#else + int postmaster_alive_fds[2]; + int syslogPipe[2]; +#endif + char my_exec_path[MAXPGPATH]; + char pkglib_path[MAXPGPATH]; +} BackendParameters; + +static void read_backend_variables(char *id, Port *port); +static void restore_backend_variables(BackendParameters *param, Port *port); + +#ifndef WIN32 +static bool save_backend_variables(BackendParameters *param, Port *port); +#else +static bool save_backend_variables(BackendParameters *param, Port *port, + HANDLE childProcess, pid_t childPid); +#endif + +static void ShmemBackendArrayAdd(Backend *bn); +static void ShmemBackendArrayRemove(Backend *bn); +#endif /* EXEC_BACKEND */ + +/* Shorthands that call StartChildProcess() for one specific process type each */ +#define StartupDataBase() StartChildProcess(StartupProcess) +#define StartArchiver() StartChildProcess(ArchiverProcess) +#define StartBackgroundWriter() StartChildProcess(BgWriterProcess) +#define StartCheckpointer() StartChildProcess(CheckpointerProcess) +#define StartWalWriter() StartChildProcess(WalWriterProcess) +#define StartWalReceiver() StartChildProcess(WalReceiverProcess) + +/* Macros to check exit status of a child process: EXIT_STATUS_0 compares the whole status word with zero, the others require a normal exit with that exit code */ +#define EXIT_STATUS_0(st) ((st) == 0) +#define EXIT_STATUS_1(st) (WIFEXITED(st) && WEXITSTATUS(st) == 1) +#define EXIT_STATUS_3(st) (WIFEXITED(st) && WEXITSTATUS(st) == 3) + +#ifndef WIN32 +/* + * File descriptors for pipe used to monitor if postmaster is alive. + * First is POSTMASTER_FD_WATCH, second is POSTMASTER_FD_OWN. 
+ */ +int postmaster_alive_fds[2] = {-1, -1}; +#else +/* Process handle of postmaster used for the same purpose on Windows */ +HANDLE PostmasterHandle; +#endif + +/* + * Postmaster main entry point + */ +void +PostmasterMain(int argc, char *argv[]) +{ + int opt; + int status; + char *userDoption = NULL; + bool listen_addr_saved = false; + int i; + char *output_config_variable = NULL; + + InitProcessGlobals(); + + PostmasterPid = MyProcPid; + + IsPostmasterEnvironment = true; + + /* + * Start our win32 signal implementation + */ +#ifdef WIN32 + pgwin32_signal_initialize(); +#endif + + /* + * We should not be creating any files or directories before we check the + * data directory (see checkDataDir()), but just in case set the umask to + * the most restrictive (owner-only) permissions. + * + * checkDataDir() will reset the umask based on the data directory + * permissions. + */ + umask(PG_MODE_MASK_OWNER); + + /* + * By default, palloc() requests in the postmaster will be allocated in + * the PostmasterContext, which is space that can be recycled by backends. + * Allocated data that needs to be available to backends should be + * allocated in TopMemoryContext. + */ + PostmasterContext = AllocSetContextCreate(TopMemoryContext, + "Postmaster", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(PostmasterContext); + + /* Initialize paths to installation files */ + getInstallationPaths(argv[0]); + + /* + * Set up signal handlers for the postmaster process. + * + * CAUTION: when changing this list, check for side-effects on the signal + * handling setup of child processes. See tcop/postgres.c, + * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/walwriter.c, + * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/syslogger.c, + * postmaster/bgworker.c and postmaster/checkpointer.c. 
+ */ + pqinitmask(); + sigprocmask(SIG_SETMASK, &BlockSig, NULL); + + pqsignal(SIGHUP, handle_pm_reload_request_signal); + pqsignal(SIGINT, handle_pm_shutdown_request_signal); + pqsignal(SIGQUIT, handle_pm_shutdown_request_signal); + pqsignal(SIGTERM, handle_pm_shutdown_request_signal); + pqsignal(SIGALRM, SIG_IGN); /* ignored */ + pqsignal(SIGPIPE, SIG_IGN); /* ignored */ + pqsignal(SIGUSR1, handle_pm_pmsignal_signal); + pqsignal(SIGUSR2, dummy_handler); /* unused, reserve for children */ + pqsignal(SIGCHLD, handle_pm_child_exit_signal); + + /* This may configure SIGURG, depending on platform. */ + InitializeLatchSupport(); + InitProcessLocalLatch(); + + /* + * No other place in Postgres should touch SIGTTIN/SIGTTOU handling. We + * ignore those signals in a postmaster environment, so that there is no + * risk of a child process freezing up due to writing to stderr. But for + * a standalone backend, their default handling is reasonable. Hence, all + * child processes should just allow the inherited settings to stand. + */ +#ifdef SIGTTIN + pqsignal(SIGTTIN, SIG_IGN); /* ignored */ +#endif +#ifdef SIGTTOU + pqsignal(SIGTTOU, SIG_IGN); /* ignored */ +#endif + + /* ignore SIGXFSZ, so that ulimit violations work like disk full */ +#ifdef SIGXFSZ + pqsignal(SIGXFSZ, SIG_IGN); /* ignored */ +#endif + + /* Begin accepting signals. */ + sigprocmask(SIG_SETMASK, &UnBlockSig, NULL); + + /* + * Options setup + */ + InitializeGUCOptions(); + + opterr = 1; + + /* + * Parse command-line options. CAUTION: keep this in sync with + * tcop/postgres.c (the option sets should not conflict) and with the + * common help() function in main/main.c. 
+ */ + while ((opt = getopt(argc, argv, "B:bC:c:D:d:EeFf:h:ijk:lN:OPp:r:S:sTt:W:-:")) != -1) + { + switch (opt) + { + case 'B': + SetConfigOption("shared_buffers", optarg, PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'b': + /* Undocumented flag used for binary upgrades */ + IsBinaryUpgrade = true; + break; + + case 'C': + output_config_variable = strdup(optarg); + break; + + case 'c': + case '-': + { + char *name, + *value; + + ParseLongOption(optarg, &name, &value); + if (!value) + { + if (opt == '-') + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("--%s requires a value", + optarg))); + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("-c %s requires a value", + optarg))); + } + + SetConfigOption(name, value, PGC_POSTMASTER, PGC_S_ARGV); + pfree(name); + pfree(value); + break; + } + + case 'D': + userDoption = strdup(optarg); + break; + + case 'd': + set_debug_options(atoi(optarg), PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'E': + SetConfigOption("log_statement", "all", PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'e': + SetConfigOption("datestyle", "euro", PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'F': + SetConfigOption("fsync", "false", PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'f': + if (!set_plan_disabling_options(optarg, PGC_POSTMASTER, PGC_S_ARGV)) + { + write_stderr("%s: invalid argument for option -f: \"%s\"\n", + progname, optarg); + ExitPostmaster(1); + } + break; + + case 'h': + SetConfigOption("listen_addresses", optarg, PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'i': + SetConfigOption("listen_addresses", "*", PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'j': + /* only used by interactive backend */ + break; + + case 'k': + SetConfigOption("unix_socket_directories", optarg, PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'l': + SetConfigOption("ssl", "true", PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'N': + SetConfigOption("max_connections", optarg, PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 
'O': + SetConfigOption("allow_system_table_mods", "true", PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'P': + SetConfigOption("ignore_system_indexes", "true", PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'p': + SetConfigOption("port", optarg, PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'r': + /* only used by single-user backend */ + break; + + case 'S': + SetConfigOption("work_mem", optarg, PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 's': + SetConfigOption("log_statement_stats", "true", PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'T': + + /* + * This option used to be defined as sending SIGSTOP after a + * backend crash, but sending SIGABRT seems more useful. + */ + SetConfigOption("send_abort_for_crash", "true", PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 't': + { + const char *tmp = get_stats_option_name(optarg); + + if (tmp) + { + SetConfigOption(tmp, "true", PGC_POSTMASTER, PGC_S_ARGV); + } + else + { + write_stderr("%s: invalid argument for option -t: \"%s\"\n", + progname, optarg); + ExitPostmaster(1); + } + break; + } + + case 'W': + SetConfigOption("post_auth_delay", optarg, PGC_POSTMASTER, PGC_S_ARGV); + break; + + default: + write_stderr("Try \"%s --help\" for more information.\n", + progname); + ExitPostmaster(1); + } + } + + /* + * Postmaster accepts no non-option switch arguments. + */ + if (optind < argc) + { + write_stderr("%s: invalid argument: \"%s\"\n", + progname, argv[optind]); + write_stderr("Try \"%s --help\" for more information.\n", + progname); + ExitPostmaster(1); + } + + /* + * Locate the proper configuration files and data directory, and read + * postgresql.conf for the first time. + */ + if (!SelectConfigFiles(userDoption, progname)) + ExitPostmaster(2); + + if (output_config_variable != NULL) + { + /* + * If this is a runtime-computed GUC, it hasn't yet been initialized, + * and the present value is not useful. 
However, this is a convenient + * place to print the value for most GUCs because it is safe to run + * postmaster startup to this point even if the server is already + * running. For the handful of runtime-computed GUCs that we cannot + * provide meaningful values for yet, we wait until later in + * postmaster startup to print the value. We won't be able to use -C + * on running servers for those GUCs, but using this option now would + * lead to incorrect results for them. + */ + int flags = GetConfigOptionFlags(output_config_variable, true); + + if ((flags & GUC_RUNTIME_COMPUTED) == 0) + { + /* + * "-C guc" was specified, so print GUC's value and exit. No + * extra permission check is needed because the user is reading + * inside the data dir. + */ + const char *config_val = GetConfigOption(output_config_variable, + false, false); + + puts(config_val ? config_val : ""); + ExitPostmaster(0); + } + + /* + * A runtime-computed GUC will be printed later on. As we initialize + * a server startup sequence, silence any log messages that may show + * up in the output generated. FATAL and more severe messages are + * useful to show, even if one would only expect at least PANIC. LOG + * entries are hidden. + */ + SetConfigOption("log_min_messages", "FATAL", PGC_SUSET, + PGC_S_OVERRIDE); + } + + /* Verify that DataDir looks reasonable */ + checkDataDir(); + + /* Check that pg_control exists */ + checkControlFile(); + + /* And switch working directory into it */ + ChangeToDataDir(); + + /* + * Check for invalid combinations of GUC settings. 
+ */ + if (SuperuserReservedConnections + ReservedConnections >= MaxConnections) + { + write_stderr("%s: superuser_reserved_connections (%d) plus reserved_connections (%d) must be less than max_connections (%d)\n", + progname, + SuperuserReservedConnections, ReservedConnections, + MaxConnections); + ExitPostmaster(1); + } + if (XLogArchiveMode > ARCHIVE_MODE_OFF && wal_level == WAL_LEVEL_MINIMAL) + ereport(ERROR, + (errmsg("WAL archival cannot be enabled when wal_level is \"minimal\""))); + if (max_wal_senders > 0 && wal_level == WAL_LEVEL_MINIMAL) + ereport(ERROR, + (errmsg("WAL streaming (max_wal_senders > 0) requires wal_level \"replica\" or \"logical\""))); + + /* + * Other one-time internal sanity checks can go here, if they are fast. + * (Put any slow processing further down, after postmaster.pid creation.) + */ + if (!CheckDateTokenTables()) + { + write_stderr("%s: invalid datetoken tables, please fix\n", progname); + ExitPostmaster(1); + } + + /* + * Now that we are done processing the postmaster arguments, reset + * getopt(3) library so that it will work correctly in subprocesses. + */ + optind = 1; +#ifdef HAVE_INT_OPTRESET + optreset = 1; /* some systems need this too */ +#endif + + /* For debugging: display postmaster environment */ + { + extern char **environ; + char **p; + + ereport(DEBUG3, + (errmsg_internal("%s: PostmasterMain: initial environment dump:", + progname))); + ereport(DEBUG3, + (errmsg_internal("-----------------------------------------"))); + for (p = environ; *p; ++p) + ereport(DEBUG3, + (errmsg_internal("\t%s", *p))); + ereport(DEBUG3, + (errmsg_internal("-----------------------------------------"))); + } + + /* + * Create lockfile for data directory. + * + * We want to do this before we try to grab the input sockets, because the + * data directory interlock is more reliable than the socket-file + * interlock (thanks to whoever decided to put socket files in /tmp :-(). 
+ * For the same reason, it's best to grab the TCP socket(s) before the + * Unix socket(s). + * + * Also note that this internally sets up the on_proc_exit function that + * is responsible for removing both data directory and socket lockfiles; + * so it must happen before opening sockets so that at exit, the socket + * lockfiles go away after CloseServerPorts runs. + */ + CreateDataDirLockFile(true); + + /* + * Read the control file (for error checking and config info). + * + * Since we verify the control file's CRC, this has a useful side effect + * on machines where we need a run-time test for CRC support instructions. + * The postmaster will do the test once at startup, and then its child + * processes will inherit the correct function pointer and not need to + * repeat the test. + */ + LocalProcessControlFile(false); + + /* + * Register the apply launcher. It's probably a good idea to call this + * before any modules had a chance to take the background worker slots. + */ + ApplyLauncherRegister(); + + /* + * process any libraries that should be preloaded at postmaster start + */ + process_shared_preload_libraries(); + + /* + * Initialize SSL library, if specified. + */ +#ifdef USE_SSL + if (EnableSSL) + { + (void) secure_initialize(true); + LoadedSSL = true; + } +#endif + + /* + * Now that loadable modules have had their chance to alter any GUCs, + * calculate MaxBackends. + */ + InitializeMaxBackends(); + + /* + * Give preloaded libraries a chance to request additional shared memory. + */ + process_shmem_requests(); + + /* + * Now that loadable modules have had their chance to request additional + * shared memory, determine the value of any runtime-computed GUCs that + * depend on the amount of shared memory required. + */ + InitializeShmemGUCs(); + + /* + * Now that modules have been loaded, we can process any custom resource + * managers specified in the wal_consistency_checking GUC. 
+ */ + InitializeWalConsistencyChecking(); + + /* + * If -C was specified with a runtime-computed GUC, we held off printing + * the value earlier, as the GUC was not yet initialized. We handle -C + * for most GUCs before we lock the data directory so that the option may + * be used on a running server. However, a handful of GUCs are runtime- + * computed and do not have meaningful values until after locking the data + * directory, and we cannot safely calculate their values earlier on a + * running server. At this point, such GUCs should be properly + * initialized, and we haven't yet set up shared memory, so this is a good + * time to handle the -C option for these special GUCs. + */ + if (output_config_variable != NULL) + { + const char *config_val = GetConfigOption(output_config_variable, + false, false); + + puts(config_val ? config_val : ""); + ExitPostmaster(0); + } + + /* + * Set up shared memory and semaphores. + * + * Note: if using SysV shmem and/or semas, each postmaster startup will + * normally choose the same IPC keys. This helps ensure that we will + * clean up dead IPC objects if the postmaster crashes and is restarted. + */ + CreateSharedMemoryAndSemaphores(); + + /* + * Estimate number of openable files. This must happen after setting up + * semaphores, because on some platforms semaphores count as open files. + */ + set_max_safe_fds(); + + /* + * Set reference point for stack-depth checking. + */ + (void) set_stack_base(); + + /* + * Initialize pipe (or process handle on Windows) that allows children to + * wake up from sleep on postmaster death. + */ + InitPostmasterDeathWatchHandle(); + +#ifdef WIN32 + + /* + * Initialize I/O completion port used to deliver list of dead children. 
+ */ + win32ChildQueue = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 1); + if (win32ChildQueue == NULL) + ereport(FATAL, + (errmsg("could not create I/O completion port for child queue"))); +#endif + +#ifdef EXEC_BACKEND + /* Write out nondefault GUC settings for child processes to use */ + write_nondefault_variables(PGC_POSTMASTER); + + /* + * Clean out the temp directory used to transmit parameters to child + * processes (see internal_forkexec, below). We must do this before + * launching any child processes, else we have a race condition: we could + * remove a parameter file before the child can read it. It should be + * safe to do so now, because we verified earlier that there are no + * conflicting Postgres processes in this data directory. + */ + RemovePgTempFilesInDir(PG_TEMP_FILES_DIR, true, false); +#endif + + /* + * Forcibly remove the files signaling a standby promotion request. + * Otherwise, the existence of those files triggers a promotion too early, + * whether a user wants that or not. + * + * This removal of files is usually unnecessary because they can exist + * only during a few moments during a standby promotion. However there is + * a race condition: if pg_ctl promote is executed and creates the files + * during a promotion, the files can stay around even after the server is + * brought up to be the primary. Then, if a new standby starts by using + * the backup taken from the new primary, the files can exist at server + * startup and must be removed in order to avoid an unexpected promotion. + * + * Note that promotion signal files need to be removed before the startup + * process is invoked. Because, after that, they can be used by + * postmaster's SIGUSR1 signal handler. + */ + RemovePromoteSignalFiles(); + + /* Do the same for logrotate signal file */ + RemoveLogrotateSignalFiles(); + + /* Remove any outdated file holding the current log filenames. 
*/ + if (unlink(LOG_METAINFO_DATAFILE) < 0 && errno != ENOENT) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + LOG_METAINFO_DATAFILE))); + + /* + * Initialize input sockets. + * + * Mark them all closed, and set up an on_proc_exit function that's + * charged with closing the sockets again at postmaster shutdown. + */ + for (i = 0; i < MAXLISTEN; i++) + ListenSocket[i] = PGINVALID_SOCKET; + + on_proc_exit(CloseServerPorts, 0); + + /* + * If enabled, start up syslogger collection subprocess + */ + SysLoggerPID = SysLogger_Start(); + + /* + * Reset whereToSendOutput from DestDebug (its starting state) to + * DestNone. This stops ereport from sending log messages to stderr unless + * Log_destination permits. We don't do this until the postmaster is + * fully launched, since startup failures may as well be reported to + * stderr. + * + * If we are in fact disabling logging to stderr, first emit a log message + * saying so, to provide a breadcrumb trail for users who may not remember + * that their logging is configured to go somewhere else. + */ + if (!(Log_destination & LOG_DESTINATION_STDERR)) + ereport(LOG, + (errmsg("ending log output to stderr"), + errhint("Future log output will go to log destination \"%s\".", + Log_destination_string))); + + whereToSendOutput = DestNone; + + /* + * Report server startup in log. While we could emit this much earlier, + * it seems best to do so after starting the log collector, if we intend + * to use one. + */ + ereport(LOG, + (errmsg("starting %s", PG_VERSION_STR))); + + /* + * Establish input sockets. 
+ */ + if (ListenAddresses) + { + char *rawstring; + List *elemlist; + ListCell *l; + int success = 0; + + /* Need a modifiable copy of ListenAddresses */ + rawstring = pstrdup(ListenAddresses); + + /* Parse string into list of hostnames */ + if (!SplitGUCList(rawstring, ',', &elemlist)) + { + /* syntax error in list */ + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list syntax in parameter \"%s\"", + "listen_addresses"))); + } + + foreach(l, elemlist) + { + char *curhost = (char *) lfirst(l); + + if (strcmp(curhost, "*") == 0) + status = StreamServerPort(AF_UNSPEC, NULL, + (unsigned short) PostPortNumber, + NULL, + ListenSocket, MAXLISTEN); + else + status = StreamServerPort(AF_UNSPEC, curhost, + (unsigned short) PostPortNumber, + NULL, + ListenSocket, MAXLISTEN); + + if (status == STATUS_OK) + { + success++; + /* record the first successful host addr in lockfile */ + if (!listen_addr_saved) + { + AddToDataDirLockFile(LOCK_FILE_LINE_LISTEN_ADDR, curhost); + listen_addr_saved = true; + } + } + else + ereport(WARNING, + (errmsg("could not create listen socket for \"%s\"", + curhost))); + } + + if (!success && elemlist != NIL) + ereport(FATAL, + (errmsg("could not create any TCP/IP sockets"))); + + list_free(elemlist); + pfree(rawstring); + } + +#ifdef USE_BONJOUR + /* Register for Bonjour only if we opened TCP socket(s) */ + if (enable_bonjour && ListenSocket[0] != PGINVALID_SOCKET) + { + DNSServiceErrorType err; + + /* + * We pass 0 for interface_index, which will result in registering on + * all "applicable" interfaces. It's not entirely clear from the + * DNS-SD docs whether this would be appropriate if we have bound to + * just a subset of the available network interfaces. 
+ */ + err = DNSServiceRegister(&bonjour_sdref, + 0, + 0, + bonjour_name, + "_postgresql._tcp.", + NULL, + NULL, + pg_hton16(PostPortNumber), + 0, + NULL, + NULL, + NULL); + if (err != kDNSServiceErr_NoError) + ereport(LOG, + (errmsg("DNSServiceRegister() failed: error code %ld", + (long) err))); + + /* + * We don't bother to read the mDNS daemon's reply, and we expect that + * it will automatically terminate our registration when the socket is + * closed at postmaster termination. So there's nothing more to be + * done here. However, the bonjour_sdref is kept around so that + * forked children can close their copies of the socket. + */ + } +#endif + + if (Unix_socket_directories) + { + char *rawstring; + List *elemlist; + ListCell *l; + int success = 0; + + /* Need a modifiable copy of Unix_socket_directories */ + rawstring = pstrdup(Unix_socket_directories); + + /* Parse string into list of directories */ + if (!SplitDirectoriesString(rawstring, ',', &elemlist)) + { + /* syntax error in list */ + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list syntax in parameter \"%s\"", + "unix_socket_directories"))); + } + + foreach(l, elemlist) + { + char *socketdir = (char *) lfirst(l); + + status = StreamServerPort(AF_UNIX, NULL, + (unsigned short) PostPortNumber, + socketdir, + ListenSocket, MAXLISTEN); + + if (status == STATUS_OK) + { + success++; + /* record the first successful Unix socket in lockfile */ + if (success == 1) + AddToDataDirLockFile(LOCK_FILE_LINE_SOCKET_DIR, socketdir); + } + else + ereport(WARNING, + (errmsg("could not create Unix-domain socket in directory \"%s\"", + socketdir))); + } + + if (!success && elemlist != NIL) + ereport(FATAL, + (errmsg("could not create any Unix-domain sockets"))); + + list_free_deep(elemlist); + pfree(rawstring); + } + + /* + * check that we have some socket to listen on + */ + if (ListenSocket[0] == PGINVALID_SOCKET) + ereport(FATAL, + (errmsg("no socket created for listening"))); + + /* 
+ * If no valid TCP ports, write an empty line for listen address, + * indicating the Unix socket must be used. Note that this line is not + * added to the lock file until there is a socket backing it. + */ + if (!listen_addr_saved) + AddToDataDirLockFile(LOCK_FILE_LINE_LISTEN_ADDR, ""); + + /* + * Record postmaster options. We delay this till now to avoid recording + * bogus options (eg, unusable port number). + */ + if (!CreateOptsFile(argc, argv, my_exec_path)) + ExitPostmaster(1); + + /* + * Write the external PID file if requested + */ + if (external_pid_file) + { + FILE *fpidfile = fopen(external_pid_file, "w"); + + if (fpidfile) + { + fprintf(fpidfile, "%d\n", MyProcPid); + fclose(fpidfile); + + /* Make PID file world readable */ + if (chmod(external_pid_file, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) != 0) + write_stderr("%s: could not change permissions of external PID file \"%s\": %s\n", + progname, external_pid_file, strerror(errno)); + } + else + write_stderr("%s: could not write external PID file \"%s\": %s\n", + progname, external_pid_file, strerror(errno)); + + on_proc_exit(unlink_external_pid_file, 0); + } + + /* + * Remove old temporary files. At this point there can be no other + * Postgres processes running in this directory, so this should be safe. + */ + RemovePgTempFiles(); + + /* + * Initialize the autovacuum subsystem (again, no process start yet) + */ + autovac_init(); + + /* + * Load configuration files for client authentication. + */ + if (!load_hba()) + { + /* + * It makes no sense to continue if we fail to load the HBA file, + * since there is no way to connect to the database in this case. + */ + ereport(FATAL, + /* translator: %s is a configuration file */ + (errmsg("could not load %s", HbaFileName))); + } + if (!load_ident()) + { + /* + * We can start up without the IDENT file, although it means that you + * cannot log in using any of the authentication methods that need a + * user name mapping. 
load_ident() already logged the details of error + * to the log. + */ + } + +#ifdef HAVE_PTHREAD_IS_THREADED_NP + + /* + * On macOS, libintl replaces setlocale() with a version that calls + * CFLocaleCopyCurrent() when its second argument is "" and every relevant + * environment variable is unset or empty. CFLocaleCopyCurrent() makes + * the process multithreaded. The postmaster calls sigprocmask() and + * calls fork() without an immediate exec(), both of which have undefined + * behavior in a multithreaded program. A multithreaded postmaster is the + * normal case on Windows, which offers neither fork() nor sigprocmask(). + */ + if (pthread_is_threaded_np() != 0) + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("postmaster became multithreaded during startup"), + errhint("Set the LC_ALL environment variable to a valid locale."))); +#endif + + /* + * Remember postmaster startup time + */ + PgStartTime = GetCurrentTimestamp(); + + /* + * Report postmaster status in the postmaster.pid file, to allow pg_ctl to + * see what's happening. + */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STARTING); + + /* Start bgwriter and checkpointer so they can help with recovery */ + if (CheckpointerPID == 0) + CheckpointerPID = StartCheckpointer(); + if (BgWriterPID == 0) + BgWriterPID = StartBackgroundWriter(); + + /* + * We're ready to rock and roll... + */ + StartupPID = StartupDataBase(); + Assert(StartupPID != 0); + StartupStatus = STARTUP_RUNNING; + pmState = PM_STARTUP; + + /* Some workers may be scheduled to start now */ + maybe_start_bgworkers(); + + status = ServerLoop(); + + /* + * ServerLoop probably shouldn't ever return, but if it does, close down. + */ + ExitPostmaster(status != STATUS_OK); + + abort(); /* not reached */ +} + + +/* + * on_proc_exit callback to close server's listen sockets + */ +static void +CloseServerPorts(int status, Datum arg) +{ + int i; + + /* + * First, explicitly close all the socket FDs. 
We used to just let this + * happen implicitly at postmaster exit, but it's better to close them + * before we remove the postmaster.pid lockfile; otherwise there's a race + * condition if a new postmaster wants to re-use the TCP port number. + */ + for (i = 0; i < MAXLISTEN; i++) + { + if (ListenSocket[i] != PGINVALID_SOCKET) + { + StreamClose(ListenSocket[i]); + ListenSocket[i] = PGINVALID_SOCKET; + } + } + + /* + * Next, remove any filesystem entries for Unix sockets. To avoid race + * conditions against incoming postmasters, this must happen after closing + * the sockets and before removing lock files. + */ + RemoveSocketFiles(); + + /* + * We don't do anything about socket lock files here; those will be + * removed in a later on_proc_exit callback. + */ +} + +/* + * on_proc_exit callback to delete external_pid_file + */ +static void +unlink_external_pid_file(int status, Datum arg) +{ + if (external_pid_file) + unlink(external_pid_file); +} + + +/* + * Compute and check the directory paths to files that are part of the + * installation (as deduced from the postgres executable's own location) + */ +static void +getInstallationPaths(const char *argv0) +{ + DIR *pdir; + + /* Locate the postgres executable itself */ + if (find_my_exec(argv0, my_exec_path) < 0) + ereport(FATAL, + (errmsg("%s: could not locate my own executable path", argv0))); + +#ifdef EXEC_BACKEND + /* Locate executable backend before we change working directory */ + if (find_other_exec(argv0, "postgres", PG_BACKEND_VERSIONSTR, + postgres_exec_path) < 0) + ereport(FATAL, + (errmsg("%s: could not locate matching postgres executable", + argv0))); +#endif + + /* + * Locate the pkglib directory --- this has to be set early in case we try + * to load any modules from it in response to postgresql.conf entries. + */ + get_pkglib_path(my_exec_path, pkglib_path); + + /* + * Verify that there's a readable directory there; otherwise the Postgres + * installation is incomplete or corrupt. 
(A typical cause of this + * failure is that the postgres executable has been moved or hardlinked to + * some directory that's not a sibling of the installation lib/ + * directory.) + */ + pdir = AllocateDir(pkglib_path); + if (pdir == NULL) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open directory \"%s\": %m", + pkglib_path), + errhint("This may indicate an incomplete PostgreSQL installation, or that the file \"%s\" has been moved away from its proper location.", + my_exec_path))); + FreeDir(pdir); + + /* + * XXX is it worth similarly checking the share/ directory? If the lib/ + * directory is there, then share/ probably is too. + */ +} + +/* + * Check that pg_control exists in the correct location in the data directory. + * + * No attempt is made to validate the contents of pg_control here. This is + * just a sanity check to see if we are looking at a real data directory. + */ +static void +checkControlFile(void) +{ + char path[MAXPGPATH]; + FILE *fp; + + snprintf(path, sizeof(path), "%s/global/pg_control", DataDir); + + fp = AllocateFile(path, PG_BINARY_R); + if (fp == NULL) + { + write_stderr("%s: could not find the database system\n" + "Expected to find it in the directory \"%s\",\n" + "but could not open file \"%s\": %s\n", + progname, DataDir, path, strerror(errno)); + ExitPostmaster(2); + } + FreeFile(fp); +} + +/* + * Determine how long should we let ServerLoop sleep, in milliseconds. + * + * In normal conditions we wait at most one minute, to ensure that the other + * background tasks handled by ServerLoop get done even when no requests are + * arriving. However, if there are background workers waiting to be started, + * we don't actually sleep so that they are quickly serviced. Other exception + * cases are as shown in the code. 
+ */ +static int +DetermineSleepTime(void) +{ + TimestampTz next_wakeup = 0; + + /* + * Normal case: either there are no background workers at all, or we're in + * a shutdown sequence (during which we ignore bgworkers altogether). + */ + if (Shutdown > NoShutdown || + (!StartWorkerNeeded && !HaveCrashedWorker)) + { + if (AbortStartTime != 0) + { + int seconds; + + /* time left to abort; clamp to 0 in case it already expired */ + seconds = SIGKILL_CHILDREN_AFTER_SECS - + (time(NULL) - AbortStartTime); + + return Max(seconds * 1000, 0); + } + else + return 60 * 1000; + } + + if (StartWorkerNeeded) + return 0; + + if (HaveCrashedWorker) + { + slist_mutable_iter siter; + + /* + * When there are crashed bgworkers, we sleep just long enough that + * they are restarted when they request to be. Scan the list to + * determine the minimum of all wakeup times according to most recent + * crash time and requested restart interval. + */ + slist_foreach_modify(siter, &BackgroundWorkerList) + { + RegisteredBgWorker *rw; + TimestampTz this_wakeup; + + rw = slist_container(RegisteredBgWorker, rw_lnode, siter.cur); + + if (rw->rw_crashed_at == 0) + continue; + + if (rw->rw_worker.bgw_restart_time == BGW_NEVER_RESTART + || rw->rw_terminate) + { + ForgetBackgroundWorker(&siter); + continue; + } + + this_wakeup = TimestampTzPlusMilliseconds(rw->rw_crashed_at, + 1000L * rw->rw_worker.bgw_restart_time); + if (next_wakeup == 0 || this_wakeup < next_wakeup) + next_wakeup = this_wakeup; + } + } + + if (next_wakeup != 0) + { + int ms; + + /* result of TimestampDifferenceMilliseconds is in [0, INT_MAX] */ + ms = (int) TimestampDifferenceMilliseconds(GetCurrentTimestamp(), + next_wakeup); + return Min(60 * 1000, ms); + } + + return 60 * 1000; +} + +/* + * Activate or deactivate notifications of server socket events. Since we + * don't currently have a way to remove events from an existing WaitEventSet, + * we'll just destroy and recreate the whole thing. 
This is called during + * shutdown so we can wait for backends to exit without accepting new + * connections, and during crash reinitialization when we need to start + * listening for new connections again. The WaitEventSet will be freed in fork + * children by ClosePostmasterPorts(). + */ +static void +ConfigurePostmasterWaitSet(bool accept_connections) +{ + int nsockets; + + if (pm_wait_set) + FreeWaitEventSet(pm_wait_set); + pm_wait_set = NULL; + + /* How many server sockets do we need to wait for? */ + nsockets = 0; + if (accept_connections) + { + while (nsockets < MAXLISTEN && + ListenSocket[nsockets] != PGINVALID_SOCKET) + ++nsockets; + } + + pm_wait_set = CreateWaitEventSet(CurrentMemoryContext, 1 + nsockets); + AddWaitEventToSet(pm_wait_set, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, + NULL); + + if (accept_connections) + { + for (int i = 0; i < nsockets; i++) + AddWaitEventToSet(pm_wait_set, WL_SOCKET_ACCEPT, ListenSocket[i], + NULL, NULL); + } +} + +/* + * Main idle loop of postmaster + */ +static int +ServerLoop(void) +{ + time_t last_lockfile_recheck_time, + last_touch_time; + WaitEvent events[MAXLISTEN]; + int nevents; + + ConfigurePostmasterWaitSet(true); + last_lockfile_recheck_time = last_touch_time = time(NULL); + + for (;;) + { + time_t now; + + nevents = WaitEventSetWait(pm_wait_set, + DetermineSleepTime(), + events, + lengthof(events), + 0 /* postmaster posts no wait_events */ ); + + /* + * Latch set by signal handler, or new connection pending on any of + * our sockets? If the latter, fork a child process to deal with it. + */ + for (int i = 0; i < nevents; i++) + { + if (events[i].events & WL_LATCH_SET) + ResetLatch(MyLatch); + + /* + * The following requests are handled unconditionally, even if we + * didn't see WL_LATCH_SET. This gives high priority to shutdown + * and reload requests where the latch happens to appear later in + * events[] or will be reported by a later call to + * WaitEventSetWait(). 
+ */ + if (pending_pm_shutdown_request) + process_pm_shutdown_request(); + if (pending_pm_reload_request) + process_pm_reload_request(); + if (pending_pm_child_exit) + process_pm_child_exit(); + if (pending_pm_pmsignal) + process_pm_pmsignal(); + + if (events[i].events & WL_SOCKET_ACCEPT) + { + Port *port; + + port = ConnCreate(events[i].fd); + if (port) + { + BackendStartup(port); + + /* + * We no longer need the open socket or port structure in + * this process + */ + StreamClose(port->sock); + ConnFree(port); + } + } + } + + /* If we have lost the log collector, try to start a new one */ + if (SysLoggerPID == 0 && Logging_collector) + SysLoggerPID = SysLogger_Start(); + + /* + * If no background writer process is running, and we are not in a + * state that prevents it, start one. It doesn't matter if this + * fails, we'll just try again later. Likewise for the checkpointer. + */ + if (pmState == PM_RUN || pmState == PM_RECOVERY || + pmState == PM_HOT_STANDBY || pmState == PM_STARTUP) + { + if (CheckpointerPID == 0) + CheckpointerPID = StartCheckpointer(); + if (BgWriterPID == 0) + BgWriterPID = StartBackgroundWriter(); + } + + /* + * Likewise, if we have lost the walwriter process, try to start a new + * one. But this is needed only in normal operation (else we cannot + * be writing any new WAL). + */ + if (WalWriterPID == 0 && pmState == PM_RUN) + WalWriterPID = StartWalWriter(); + + /* + * If we have lost the autovacuum launcher, try to start a new one. We + * don't want autovacuum to run in binary upgrade mode because + * autovacuum might update relfrozenxid for empty tables before the + * physical files are put in place. + */ + if (!IsBinaryUpgrade && AutoVacPID == 0 && + (AutoVacuumingActive() || start_autovac_launcher) && + pmState == PM_RUN) + { + AutoVacPID = StartAutoVacLauncher(); + if (AutoVacPID != 0) + start_autovac_launcher = false; /* signal processed */ + } + + /* If we have lost the archiver, try to start a new one. 
*/ + if (PgArchPID == 0 && PgArchStartupAllowed()) + PgArchPID = StartArchiver(); + + /* If we need to signal the autovacuum launcher, do so now */ + if (avlauncher_needs_signal) + { + avlauncher_needs_signal = false; + if (AutoVacPID != 0) + kill(AutoVacPID, SIGUSR2); + } + + /* If we need to start a WAL receiver, try to do that now */ + if (WalReceiverRequested) + MaybeStartWalReceiver(); + + /* Get other worker processes running, if needed */ + if (StartWorkerNeeded || HaveCrashedWorker) + maybe_start_bgworkers(); + +#ifdef HAVE_PTHREAD_IS_THREADED_NP + + /* + * With assertions enabled, check regularly for appearance of + * additional threads. All builds check at start and exit. + */ + Assert(pthread_is_threaded_np() == 0); +#endif + + /* + * Lastly, check to see if it's time to do some things that we don't + * want to do every single time through the loop, because they're a + * bit expensive. Note that there's up to a minute of slop in when + * these tasks will be performed, since DetermineSleepTime() will let + * us sleep at most that long; except for SIGKILL timeout which has + * special-case logic there. + */ + now = time(NULL); + + /* + * If we already sent SIGQUIT to children and they are slow to shut + * down, it's time to send them SIGKILL (or SIGABRT if requested). + * This doesn't happen normally, but under certain conditions backends + * can get stuck while shutting down. This is a last measure to get + * them unwedged. + * + * Note we also do this during recovery from a process crash. + */ + if ((Shutdown >= ImmediateShutdown || FatalError) && + AbortStartTime != 0 && + (now - AbortStartTime) >= SIGKILL_CHILDREN_AFTER_SECS) + { + /* We were gentle with them before. Not anymore */ + ereport(LOG, + /* translator: %s is SIGKILL or SIGABRT */ + (errmsg("issuing %s to recalcitrant children", + send_abort_for_kill ? "SIGABRT" : "SIGKILL"))); + TerminateChildren(send_abort_for_kill ? 
SIGABRT : SIGKILL); + /* reset flag so we don't SIGKILL again */ + AbortStartTime = 0; + } + + /* + * Once a minute, verify that postmaster.pid hasn't been removed or + * overwritten. If it has, we force a shutdown. This avoids having + * postmasters and child processes hanging around after their database + * is gone, and maybe causing problems if a new database cluster is + * created in the same place. It also provides some protection + * against a DBA foolishly removing postmaster.pid and manually + * starting a new postmaster. Data corruption is likely to ensue from + * that anyway, but we can minimize the damage by aborting ASAP. + */ + if (now - last_lockfile_recheck_time >= 1 * SECS_PER_MINUTE) + { + if (!RecheckDataDirLockFile()) + { + ereport(LOG, + (errmsg("performing immediate shutdown because data directory lock file is invalid"))); + kill(MyProcPid, SIGQUIT); + } + last_lockfile_recheck_time = now; + } + + /* + * Touch Unix socket and lock files every 58 minutes, to ensure that + * they are not removed by overzealous /tmp-cleaning tasks. We assume + * no one runs cleaners with cutoff times of less than an hour ... + */ + if (now - last_touch_time >= 58 * SECS_PER_MINUTE) + { + TouchSocketFiles(); + TouchSocketLockFiles(); + last_touch_time = now; + } + } +} + +/* + * Read a client's startup packet and do something according to it. + * + * Returns STATUS_OK or STATUS_ERROR, or might call ereport(FATAL) and + * not return at all. + * + * (Note that ereport(FATAL) stuff is sent to the client, so only use it + * if that's what you want. Return STATUS_ERROR if you don't want to + * send anything to the client, which would typically be appropriate + * if we detect a communications failure.) + * + * Set ssl_done and/or gss_done when negotiation of an encrypted layer + * (currently, TLS or GSSAPI) is completed. 
A successful negotiation of either + * encryption layer sets both flags, but a rejected negotiation sets only the + * flag for that layer, since the client may wish to try the other one. We + * should make no assumption here about the order in which the client may make + * requests. + */ +static int +ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) +{ + int32 len; + char *buf; + ProtocolVersion proto; + MemoryContext oldcontext; + + pq_startmsgread(); + + /* + * Grab the first byte of the length word separately, so that we can tell + * whether we have no data at all or an incomplete packet. (This might + * sound inefficient, but it's not really, because of buffering in + * pqcomm.c.) + */ + if (pq_getbytes((char *) &len, 1) == EOF) + { + /* + * If we get no data at all, don't clutter the log with a complaint; + * such cases often occur for legitimate reasons. An example is that + * we might be here after responding to NEGOTIATE_SSL_CODE, and if the + * client didn't like our response, it'll probably just drop the + * connection. Service-monitoring software also often just opens and + * closes a connection without sending anything. (So do port + * scanners, which may be less benign, but it's not really our job to + * notice those.) + */ + return STATUS_ERROR; + } + + if (pq_getbytes(((char *) &len) + 1, 3) == EOF) + { + /* Got a partial length word, so bleat about that */ + if (!ssl_done && !gss_done) + ereport(COMMERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("incomplete startup packet"))); + return STATUS_ERROR; + } + + len = pg_ntoh32(len); + len -= 4; + + if (len < (int32) sizeof(ProtocolVersion) || + len > MAX_STARTUP_PACKET_LENGTH) + { + ereport(COMMERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid length of startup packet"))); + return STATUS_ERROR; + } + + /* + * Allocate space to hold the startup packet, plus one extra byte that's + * initialized to be zero. 
This ensures we will have null termination of + * all strings inside the packet. + */ + buf = palloc(len + 1); + buf[len] = '\0'; + + if (pq_getbytes(buf, len) == EOF) + { + ereport(COMMERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("incomplete startup packet"))); + return STATUS_ERROR; + } + pq_endmsgread(); + + /* + * The first field is either a protocol version number or a special + * request code. + */ + port->proto = proto = pg_ntoh32(*((ProtocolVersion *) buf)); + + if (proto == CANCEL_REQUEST_CODE) + { + if (len != sizeof(CancelRequestPacket)) + { + ereport(COMMERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid length of startup packet"))); + return STATUS_ERROR; + } + processCancelRequest(port, buf); + /* Not really an error, but we don't want to proceed further */ + return STATUS_ERROR; + } + + if (proto == NEGOTIATE_SSL_CODE && !ssl_done) + { + char SSLok; + +#ifdef USE_SSL + /* No SSL when disabled or on Unix sockets */ + if (!LoadedSSL || port->laddr.addr.ss_family == AF_UNIX) + SSLok = 'N'; + else + SSLok = 'S'; /* Support for SSL */ +#else + SSLok = 'N'; /* No support for SSL */ +#endif + +retry1: + if (send(port->sock, &SSLok, 1, 0) != 1) + { + if (errno == EINTR) + goto retry1; /* if interrupted, just retry */ + ereport(COMMERROR, + (errcode_for_socket_access(), + errmsg("failed to send SSL negotiation response: %m"))); + return STATUS_ERROR; /* close the connection */ + } + +#ifdef USE_SSL + if (SSLok == 'S' && secure_open_server(port) == -1) + return STATUS_ERROR; +#endif + + /* + * At this point we should have no data already buffered. If we do, + * it was received before we performed the SSL handshake, so it wasn't + * encrypted and indeed may have been injected by a man-in-the-middle. + * We report this case to the client. 
+ */ + if (pq_buffer_has_data()) + ereport(FATAL, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("received unencrypted data after SSL request"), + errdetail("This could be either a client-software bug or evidence of an attempted man-in-the-middle attack."))); + + /* + * regular startup packet, cancel, etc packet should follow, but not + * another SSL negotiation request, and a GSS request should only + * follow if SSL was rejected (client may negotiate in either order) + */ + return ProcessStartupPacket(port, true, SSLok == 'S'); + } + else if (proto == NEGOTIATE_GSS_CODE && !gss_done) + { + char GSSok = 'N'; + +#ifdef ENABLE_GSS + /* No GSSAPI encryption when on Unix socket */ + if (port->laddr.addr.ss_family != AF_UNIX) + GSSok = 'G'; +#endif + + while (send(port->sock, &GSSok, 1, 0) != 1) + { + if (errno == EINTR) + continue; + ereport(COMMERROR, + (errcode_for_socket_access(), + errmsg("failed to send GSSAPI negotiation response: %m"))); + return STATUS_ERROR; /* close the connection */ + } + +#ifdef ENABLE_GSS + if (GSSok == 'G' && secure_open_gssapi(port) == -1) + return STATUS_ERROR; +#endif + + /* + * At this point we should have no data already buffered. If we do, + * it was received before we performed the GSS handshake, so it wasn't + * encrypted and indeed may have been injected by a man-in-the-middle. + * We report this case to the client. 
+ */ + if (pq_buffer_has_data()) + ereport(FATAL, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("received unencrypted data after GSSAPI encryption request"), + errdetail("This could be either a client-software bug or evidence of an attempted man-in-the-middle attack."))); + + /* + * regular startup packet, cancel, etc packet should follow, but not + * another GSS negotiation request, and an SSL request should only + * follow if GSS was rejected (client may negotiate in either order) + */ + return ProcessStartupPacket(port, GSSok == 'G', true); + } + + /* Could add additional special packet types here */ + + /* + * Set FrontendProtocol now so that ereport() knows what format to send if + * we fail during startup. + */ + FrontendProtocol = proto; + + /* Check that the major protocol version is in range. */ + if (PG_PROTOCOL_MAJOR(proto) < PG_PROTOCOL_MAJOR(PG_PROTOCOL_EARLIEST) || + PG_PROTOCOL_MAJOR(proto) > PG_PROTOCOL_MAJOR(PG_PROTOCOL_LATEST)) + ereport(FATAL, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("unsupported frontend protocol %u.%u: server supports %u.0 to %u.%u", + PG_PROTOCOL_MAJOR(proto), PG_PROTOCOL_MINOR(proto), + PG_PROTOCOL_MAJOR(PG_PROTOCOL_EARLIEST), + PG_PROTOCOL_MAJOR(PG_PROTOCOL_LATEST), + PG_PROTOCOL_MINOR(PG_PROTOCOL_LATEST)))); + + /* + * Now fetch parameters out of startup packet and save them into the Port + * structure. All data structures attached to the Port struct must be + * allocated in TopMemoryContext so that they will remain available in a + * running backend (even after PostmasterContext is destroyed). We need + * not worry about leaking this storage on failure, since we aren't in the + * postmaster process anymore. + */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + /* Handle protocol version 3 startup packet */ + { + int32 offset = sizeof(ProtocolVersion); + List *unrecognized_protocol_options = NIL; + + /* + * Scan packet body for name/option pairs. 
We can assume any string + * beginning within the packet body is null-terminated, thanks to + * zeroing extra byte above. + */ + port->guc_options = NIL; + + while (offset < len) + { + char *nameptr = buf + offset; + int32 valoffset; + char *valptr; + + if (*nameptr == '\0') + break; /* found packet terminator */ + valoffset = offset + strlen(nameptr) + 1; + if (valoffset >= len) + break; /* missing value, will complain below */ + valptr = buf + valoffset; + + if (strcmp(nameptr, "database") == 0) + port->database_name = pstrdup(valptr); + else if (strcmp(nameptr, "user") == 0) + port->user_name = pstrdup(valptr); + else if (strcmp(nameptr, "options") == 0) + port->cmdline_options = pstrdup(valptr); + else if (strcmp(nameptr, "replication") == 0) + { + /* + * Due to backward compatibility concerns the replication + * parameter is a hybrid beast which allows the value to be + * either boolean or the string 'database'. The latter + * connects to a specific database which is e.g. required for + * logical decoding while. + */ + if (strcmp(valptr, "database") == 0) + { + am_walsender = true; + am_db_walsender = true; + } + else if (!parse_bool(valptr, &am_walsender)) + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": \"%s\"", + "replication", + valptr), + errhint("Valid values are: \"false\", 0, \"true\", 1, \"database\"."))); + } + else if (strncmp(nameptr, "_pq_.", 5) == 0) + { + /* + * Any option beginning with _pq_. is reserved for use as a + * protocol-level option, but at present no such options are + * defined. + */ + unrecognized_protocol_options = + lappend(unrecognized_protocol_options, pstrdup(nameptr)); + } + else + { + /* Assume it's a generic GUC option */ + port->guc_options = lappend(port->guc_options, + pstrdup(nameptr)); + port->guc_options = lappend(port->guc_options, + pstrdup(valptr)); + + /* + * Copy application_name to port if we come across it. 
This + * is done so we can log the application_name in the + * connection authorization message. Note that the GUC would + * be used but we haven't gone through GUC setup yet. + */ + if (strcmp(nameptr, "application_name") == 0) + { + port->application_name = pg_clean_ascii(valptr, 0); + } + } + offset = valoffset + strlen(valptr) + 1; + } + + /* + * If we didn't find a packet terminator exactly at the end of the + * given packet length, complain. + */ + if (offset != len - 1) + ereport(FATAL, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid startup packet layout: expected terminator as last byte"))); + + /* + * If the client requested a newer protocol version or if the client + * requested any protocol options we didn't recognize, let them know + * the newest minor protocol version we do support and the names of + * any unrecognized options. + */ + if (PG_PROTOCOL_MINOR(proto) > PG_PROTOCOL_MINOR(PG_PROTOCOL_LATEST) || + unrecognized_protocol_options != NIL) + SendNegotiateProtocolVersion(unrecognized_protocol_options); + } + + /* Check a user name was given. */ + if (port->user_name == NULL || port->user_name[0] == '\0') + ereport(FATAL, + (errcode(ERRCODE_INVALID_AUTHORIZATION_SPECIFICATION), + errmsg("no PostgreSQL user name specified in startup packet"))); + + /* The database defaults to the user name. */ + if (port->database_name == NULL || port->database_name[0] == '\0') + port->database_name = pstrdup(port->user_name); + + if (Db_user_namespace) + { + /* + * If user@, it is a global user, remove '@'. We only want to do this + * if there is an '@' at the end and no earlier in the user string or + * they may fake as a local user of another database attaching to this + * database. 
+ */ + if (strchr(port->user_name, '@') == + port->user_name + strlen(port->user_name) - 1) + *strchr(port->user_name, '@') = '\0'; + else + { + /* Append '@' and dbname */ + port->user_name = psprintf("%s@%s", port->user_name, port->database_name); + } + } + + /* + * Truncate given database and user names to length of a Postgres name. + * This avoids lookup failures when overlength names are given. + */ + if (strlen(port->database_name) >= NAMEDATALEN) + port->database_name[NAMEDATALEN - 1] = '\0'; + if (strlen(port->user_name) >= NAMEDATALEN) + port->user_name[NAMEDATALEN - 1] = '\0'; + + if (am_walsender) + MyBackendType = B_WAL_SENDER; + else + MyBackendType = B_BACKEND; + + /* + * Normal walsender backends, e.g. for streaming replication, are not + * connected to a particular database. But walsenders used for logical + * replication need to connect to a specific database. We allow streaming + * replication commands to be issued even if connected to a database as it + * can make sense to first make a basebackup and then stream changes + * starting from that. + */ + if (am_walsender && !am_db_walsender) + port->database_name[0] = '\0'; + + /* + * Done putting stuff in TopMemoryContext. + */ + MemoryContextSwitchTo(oldcontext); + + /* + * If we're going to reject the connection due to database state, say so + * now instead of wasting cycles on an authentication exchange. (This also + * allows a pg_ping utility to be written.) 
+ */ + switch (port->canAcceptConnections) + { + case CAC_STARTUP: + ereport(FATAL, + (errcode(ERRCODE_CANNOT_CONNECT_NOW), + errmsg("the database system is starting up"))); + break; + case CAC_NOTCONSISTENT: + if (EnableHotStandby) + ereport(FATAL, + (errcode(ERRCODE_CANNOT_CONNECT_NOW), + errmsg("the database system is not yet accepting connections"), + errdetail("Consistent recovery state has not been yet reached."))); + else + ereport(FATAL, + (errcode(ERRCODE_CANNOT_CONNECT_NOW), + errmsg("the database system is not accepting connections"), + errdetail("Hot standby mode is disabled."))); + break; + case CAC_SHUTDOWN: + ereport(FATAL, + (errcode(ERRCODE_CANNOT_CONNECT_NOW), + errmsg("the database system is shutting down"))); + break; + case CAC_RECOVERY: + ereport(FATAL, + (errcode(ERRCODE_CANNOT_CONNECT_NOW), + errmsg("the database system is in recovery mode"))); + break; + case CAC_TOOMANY: + ereport(FATAL, + (errcode(ERRCODE_TOO_MANY_CONNECTIONS), + errmsg("sorry, too many clients already"))); + break; + case CAC_OK: + break; + } + + return STATUS_OK; +} + +/* + * Send a NegotiateProtocolVersion to the client. This lets the client know + * that they have requested a newer minor protocol version than we are able + * to speak. We'll speak the highest version we know about; the client can, + * of course, abandon the connection if that's a problem. + * + * We also include in the response a list of protocol options we didn't + * understand. This allows clients to include optional parameters that might + * be present either in newer protocol versions or third-party protocol + * extensions without fear of having to reconnect if those options are not + * understood, while at the same time making certain that the client is aware + * of which options were actually accepted. 
+ */ +static void +SendNegotiateProtocolVersion(List *unrecognized_protocol_options) +{ + StringInfoData buf; + ListCell *lc; + + pq_beginmessage(&buf, 'v'); /* NegotiateProtocolVersion */ + pq_sendint32(&buf, PG_PROTOCOL_LATEST); + pq_sendint32(&buf, list_length(unrecognized_protocol_options)); + foreach(lc, unrecognized_protocol_options) + pq_sendstring(&buf, lfirst(lc)); + pq_endmessage(&buf); + + /* no need to flush, some other message will follow */ +} + +/* + * The client has sent a cancel request packet, not a normal + * start-a-new-connection packet. Perform the necessary processing. + * Nothing is sent back to the client. + */ +static void +processCancelRequest(Port *port, void *pkt) +{ + CancelRequestPacket *canc = (CancelRequestPacket *) pkt; + int backendPID; + int32 cancelAuthCode; + Backend *bp; + +#ifndef EXEC_BACKEND + dlist_iter iter; +#else + int i; +#endif + + backendPID = (int) pg_ntoh32(canc->backendPID); + cancelAuthCode = (int32) pg_ntoh32(canc->cancelAuthCode); + + /* + * See if we have a matching backend. In the EXEC_BACKEND case, we can no + * longer access the postmaster's own backend list, and must rely on the + * duplicate array in shared memory. 
+ */ +#ifndef EXEC_BACKEND + dlist_foreach(iter, &BackendList) + { + bp = dlist_container(Backend, elem, iter.cur); +#else + for (i = MaxLivePostmasterChildren() - 1; i >= 0; i--) + { + bp = (Backend *) &ShmemBackendArray[i]; +#endif + if (bp->pid == backendPID) + { + if (bp->cancel_key == cancelAuthCode) + { + /* Found a match; signal that backend to cancel current op */ + ereport(DEBUG2, + (errmsg_internal("processing cancel request: sending SIGINT to process %d", + backendPID))); + signal_child(bp->pid, SIGINT); + } + else + /* Right PID, wrong key: no way, Jose */ + ereport(LOG, + (errmsg("wrong key in cancel request for process %d", + backendPID))); + return; + } +#ifndef EXEC_BACKEND /* make GNU Emacs 26.1 see brace balance */ + } +#else + } +#endif + + /* No matching backend */ + ereport(LOG, + (errmsg("PID %d in cancel request did not match any process", + backendPID))); +} + +/* + * canAcceptConnections --- check to see if database state allows connections + * of the specified type. backend_type can be BACKEND_TYPE_NORMAL, + * BACKEND_TYPE_AUTOVAC, or BACKEND_TYPE_BGWORKER. (Note that we don't yet + * know whether a NORMAL connection might turn into a walsender.) + */ +static CAC_state +canAcceptConnections(int backend_type) +{ + CAC_state result = CAC_OK; + + /* + * Can't start backends when in startup/shutdown/inconsistent recovery + * state. We treat autovac workers the same as user backends for this + * purpose. However, bgworkers are excluded from this test; we expect + * bgworker_should_start_now() decided whether the DB state allows them. 
+ */ + if (pmState != PM_RUN && pmState != PM_HOT_STANDBY && + backend_type != BACKEND_TYPE_BGWORKER) + { + if (Shutdown > NoShutdown) + return CAC_SHUTDOWN; /* shutdown is pending */ + else if (!FatalError && pmState == PM_STARTUP) + return CAC_STARTUP; /* normal startup */ + else if (!FatalError && pmState == PM_RECOVERY) + return CAC_NOTCONSISTENT; /* not yet at consistent recovery + * state */ + else + return CAC_RECOVERY; /* else must be crash recovery */ + } + + /* + * "Smart shutdown" restrictions are applied only to normal connections, + * not to autovac workers or bgworkers. + */ + if (!connsAllowed && backend_type == BACKEND_TYPE_NORMAL) + return CAC_SHUTDOWN; /* shutdown is pending */ + + /* + * Don't start too many children. + * + * We allow more connections here than we can have backends because some + * might still be authenticating; they might fail auth, or some existing + * backend might exit before the auth cycle is completed. The exact + * MaxBackends limit is enforced when a new backend tries to join the + * shared-inval backend array. + * + * The limit here must match the sizes of the per-child-process arrays; + * see comments for MaxLivePostmasterChildren(). + */ + if (CountChildren(BACKEND_TYPE_ALL) >= MaxLivePostmasterChildren()) + result = CAC_TOOMANY; + + return result; +} + + +/* + * ConnCreate -- create a local connection data structure + * + * Returns NULL on failure, other than out-of-memory which is fatal. 
+ */ +static Port * +ConnCreate(int serverFd) +{ + Port *port; + + if (!(port = (Port *) calloc(1, sizeof(Port)))) + { + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + ExitPostmaster(1); + } + + if (StreamConnection(serverFd, port) != STATUS_OK) + { + if (port->sock != PGINVALID_SOCKET) + StreamClose(port->sock); + ConnFree(port); + return NULL; + } + + return port; +} + + +/* + * ConnFree -- free a local connection data structure + * + * Caller has already closed the socket if any, so there's not much + * to do here. + */ +static void +ConnFree(Port *port) +{ + free(port); +} + + +/* + * ClosePostmasterPorts -- close all the postmaster's open sockets + * + * This is called during child process startup to release file descriptors + * that are not needed by that child process. The postmaster still has + * them open, of course. + * + * Note: we pass am_syslogger as a boolean because we don't want to set + * the global variable yet when this is called. + */ +void +ClosePostmasterPorts(bool am_syslogger) +{ + int i; + + /* Release resources held by the postmaster's WaitEventSet. */ + if (pm_wait_set) + { + FreeWaitEventSetAfterFork(pm_wait_set); + pm_wait_set = NULL; + } + +#ifndef WIN32 + + /* + * Close the write end of postmaster death watch pipe. It's important to + * do this as early as possible, so that if postmaster dies, others won't + * think that it's still running because we're holding the pipe open. + */ + if (close(postmaster_alive_fds[POSTMASTER_FD_OWN]) != 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg_internal("could not close postmaster death monitoring pipe in child process: %m"))); + postmaster_alive_fds[POSTMASTER_FD_OWN] = -1; + /* Notify fd.c that we released one pipe FD. */ + ReleaseExternalFD(); +#endif + + /* + * Close the postmaster's listen sockets. These aren't tracked by fd.c, + * so we don't call ReleaseExternalFD() here. 
+ */ + for (i = 0; i < MAXLISTEN; i++) + { + if (ListenSocket[i] != PGINVALID_SOCKET) + { + StreamClose(ListenSocket[i]); + ListenSocket[i] = PGINVALID_SOCKET; + } + } + + /* + * If using syslogger, close the read side of the pipe. We don't bother + * tracking this in fd.c, either. + */ + if (!am_syslogger) + { +#ifndef WIN32 + if (syslogPipe[0] >= 0) + close(syslogPipe[0]); + syslogPipe[0] = -1; +#else + if (syslogPipe[0]) + CloseHandle(syslogPipe[0]); + syslogPipe[0] = 0; +#endif + } + +#ifdef USE_BONJOUR + /* If using Bonjour, close the connection to the mDNS daemon */ + if (bonjour_sdref) + close(DNSServiceRefSockFD(bonjour_sdref)); +#endif +} + + +/* + * InitProcessGlobals -- set MyProcPid, MyStartTime[stamp], random seeds + * + * Called early in the postmaster and every backend. + */ +void +InitProcessGlobals(void) +{ + MyProcPid = getpid(); + MyStartTimestamp = GetCurrentTimestamp(); + MyStartTime = timestamptz_to_time_t(MyStartTimestamp); + + /* + * Set a different global seed in every process. We want something + * unpredictable, so if possible, use high-quality random bits for the + * seed. Otherwise, fall back to a seed based on timestamp and PID. + */ + if (unlikely(!pg_prng_strong_seed(&pg_global_prng_state))) + { + uint64 rseed; + + /* + * Since PIDs and timestamps tend to change more frequently in their + * least significant bits, shift the timestamp left to allow a larger + * total number of seeds in a given time period. Since that would + * leave only 20 bits of the timestamp that cycle every ~1 second, + * also mix in some higher bits. + */ + rseed = ((uint64) MyProcPid) ^ + ((uint64) MyStartTimestamp << 12) ^ + ((uint64) MyStartTimestamp >> 20); + + pg_prng_seed(&pg_global_prng_state, rseed); + } + + /* + * Also make sure that we've set a good seed for random(3). Use of that + * is deprecated in core Postgres, but extensions might use it. 
+ */ +#ifndef WIN32 + srandom(pg_prng_uint32(&pg_global_prng_state)); +#endif +} + +/* + * Child processes use SIGUSR1 to notify us of 'pmsignals'. pg_ctl uses + * SIGUSR1 to ask postmaster to check for logrotate and promote files. + */ +static void +handle_pm_pmsignal_signal(SIGNAL_ARGS) +{ + int save_errno = errno; + + pending_pm_pmsignal = true; + SetLatch(MyLatch); + + errno = save_errno; +} + +/* + * pg_ctl uses SIGHUP to request a reload of the configuration files. + */ +static void +handle_pm_reload_request_signal(SIGNAL_ARGS) +{ + int save_errno = errno; + + pending_pm_reload_request = true; + SetLatch(MyLatch); + + errno = save_errno; +} + +/* + * Re-read config files, and tell children to do same. + */ +static void +process_pm_reload_request(void) +{ + pending_pm_reload_request = false; + + ereport(DEBUG2, + (errmsg_internal("postmaster received reload request signal"))); + + if (Shutdown <= SmartShutdown) + { + ereport(LOG, + (errmsg("received SIGHUP, reloading configuration files"))); + ProcessConfigFile(PGC_SIGHUP); + SignalChildren(SIGHUP); + if (StartupPID != 0) + signal_child(StartupPID, SIGHUP); + if (BgWriterPID != 0) + signal_child(BgWriterPID, SIGHUP); + if (CheckpointerPID != 0) + signal_child(CheckpointerPID, SIGHUP); + if (WalWriterPID != 0) + signal_child(WalWriterPID, SIGHUP); + if (WalReceiverPID != 0) + signal_child(WalReceiverPID, SIGHUP); + if (AutoVacPID != 0) + signal_child(AutoVacPID, SIGHUP); + if (PgArchPID != 0) + signal_child(PgArchPID, SIGHUP); + if (SysLoggerPID != 0) + signal_child(SysLoggerPID, SIGHUP); + + /* Reload authentication config files too */ + if (!load_hba()) + ereport(LOG, + /* translator: %s is a configuration file */ + (errmsg("%s was not reloaded", HbaFileName))); + + if (!load_ident()) + ereport(LOG, + (errmsg("%s was not reloaded", IdentFileName))); + +#ifdef USE_SSL + /* Reload SSL configuration as well */ + if (EnableSSL) + { + if (secure_initialize(false) == 0) + LoadedSSL = true; + else + ereport(LOG, + 
(errmsg("SSL configuration was not reloaded"))); + } + else + { + secure_destroy(); + LoadedSSL = false; + } +#endif + +#ifdef EXEC_BACKEND + /* Update the starting-point file for future children */ + write_nondefault_variables(PGC_SIGHUP); +#endif + } +} + +/* + * pg_ctl uses SIGTERM, SIGINT and SIGQUIT to request different types of + * shutdown. + */ +static void +handle_pm_shutdown_request_signal(SIGNAL_ARGS) +{ + int save_errno = errno; + + switch (postgres_signal_arg) + { + case SIGTERM: + /* smart is implied if the other two flags aren't set */ + pending_pm_shutdown_request = true; + break; + case SIGINT: + pending_pm_fast_shutdown_request = true; + pending_pm_shutdown_request = true; + break; + case SIGQUIT: + pending_pm_immediate_shutdown_request = true; + pending_pm_shutdown_request = true; + break; + } + SetLatch(MyLatch); + + errno = save_errno; +} + +/* + * Process shutdown request. + */ +static void +process_pm_shutdown_request(void) +{ + int mode; + + ereport(DEBUG2, + (errmsg_internal("postmaster received shutdown request signal"))); + + pending_pm_shutdown_request = false; + + /* + * If more than one shutdown request signal arrived since the last server + * loop, take the one that is the most immediate. That matches the + * priority that would apply if we processed them one by one in any order. + */ + if (pending_pm_immediate_shutdown_request) + { + pending_pm_immediate_shutdown_request = false; + pending_pm_fast_shutdown_request = false; + mode = ImmediateShutdown; + } + else if (pending_pm_fast_shutdown_request) + { + pending_pm_fast_shutdown_request = false; + mode = FastShutdown; + } + else + mode = SmartShutdown; + + switch (mode) + { + case SmartShutdown: + + /* + * Smart Shutdown: + * + * Wait for children to end their work, then shut down. 
+ */ + if (Shutdown >= SmartShutdown) + break; + Shutdown = SmartShutdown; + ereport(LOG, + (errmsg("received smart shutdown request"))); + + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STOPPING); +#ifdef USE_SYSTEMD + sd_notify(0, "STOPPING=1"); +#endif + + /* + * If we reached normal running, we go straight to waiting for + * client backends to exit. If already in PM_STOP_BACKENDS or a + * later state, do not change it. + */ + if (pmState == PM_RUN || pmState == PM_HOT_STANDBY) + connsAllowed = false; + else if (pmState == PM_STARTUP || pmState == PM_RECOVERY) + { + /* There should be no clients, so proceed to stop children */ + pmState = PM_STOP_BACKENDS; + } + + /* + * Now wait for online backup mode to end and backends to exit. If + * that is already the case, PostmasterStateMachine will take the + * next step. + */ + PostmasterStateMachine(); + break; + + case FastShutdown: + + /* + * Fast Shutdown: + * + * Abort all children with SIGTERM (rollback active transactions + * and exit) and shut down when they are gone. + */ + if (Shutdown >= FastShutdown) + break; + Shutdown = FastShutdown; + ereport(LOG, + (errmsg("received fast shutdown request"))); + + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STOPPING); +#ifdef USE_SYSTEMD + sd_notify(0, "STOPPING=1"); +#endif + + if (pmState == PM_STARTUP || pmState == PM_RECOVERY) + { + /* Just shut down background processes silently */ + pmState = PM_STOP_BACKENDS; + } + else if (pmState == PM_RUN || + pmState == PM_HOT_STANDBY) + { + /* Report that we're about to zap live client sessions */ + ereport(LOG, + (errmsg("aborting any active transactions"))); + pmState = PM_STOP_BACKENDS; + } + + /* + * PostmasterStateMachine will issue any necessary signals, or + * take the next step if no child processes need to be killed. 
+ */ + PostmasterStateMachine(); + break; + + case ImmediateShutdown: + + /* + * Immediate Shutdown: + * + * abort all children with SIGQUIT, wait for them to exit, + * terminate remaining ones with SIGKILL, then exit without + * attempt to properly shut down the data base system. + */ + if (Shutdown >= ImmediateShutdown) + break; + Shutdown = ImmediateShutdown; + ereport(LOG, + (errmsg("received immediate shutdown request"))); + + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STOPPING); +#ifdef USE_SYSTEMD + sd_notify(0, "STOPPING=1"); +#endif + + /* tell children to shut down ASAP */ + /* (note we don't apply send_abort_for_crash here) */ + SetQuitSignalReason(PMQUIT_FOR_STOP); + TerminateChildren(SIGQUIT); + pmState = PM_WAIT_BACKENDS; + + /* set stopwatch for them to die */ + AbortStartTime = time(NULL); + + /* + * Now wait for backends to exit. If there are none, + * PostmasterStateMachine will take the next step. + */ + PostmasterStateMachine(); + break; + } +} + +static void +handle_pm_child_exit_signal(SIGNAL_ARGS) +{ + int save_errno = errno; + + pending_pm_child_exit = true; + SetLatch(MyLatch); + + errno = save_errno; +} + +/* + * Cleanup after a child process dies. + */ +static void +process_pm_child_exit(void) +{ + int pid; /* process id of dead child process */ + int exitstatus; /* its exit status */ + + pending_pm_child_exit = false; + + ereport(DEBUG4, + (errmsg_internal("reaping dead processes"))); + + while ((pid = waitpid(-1, &exitstatus, WNOHANG)) > 0) + { + /* + * Check if this child was a startup process. + */ + if (pid == StartupPID) + { + StartupPID = 0; + + /* + * Startup process exited in response to a shutdown request (or it + * completed normally regardless of the shutdown request). 
+ */ + if (Shutdown > NoShutdown && + (EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus))) + { + StartupStatus = STARTUP_NOT_RUNNING; + pmState = PM_WAIT_BACKENDS; + /* PostmasterStateMachine logic does the rest */ + continue; + } + + if (EXIT_STATUS_3(exitstatus)) + { + ereport(LOG, + (errmsg("shutdown at recovery target"))); + StartupStatus = STARTUP_NOT_RUNNING; + Shutdown = Max(Shutdown, SmartShutdown); + TerminateChildren(SIGTERM); + pmState = PM_WAIT_BACKENDS; + /* PostmasterStateMachine logic does the rest */ + continue; + } + + /* + * Unexpected exit of startup process (including FATAL exit) + * during PM_STARTUP is treated as catastrophic. There are no + * other processes running yet, so we can just exit. + */ + if (pmState == PM_STARTUP && + StartupStatus != STARTUP_SIGNALED && + !EXIT_STATUS_0(exitstatus)) + { + LogChildExit(LOG, _("startup process"), + pid, exitstatus); + ereport(LOG, + (errmsg("aborting startup due to startup process failure"))); + ExitPostmaster(1); + } + + /* + * After PM_STARTUP, any unexpected exit (including FATAL exit) of + * the startup process is catastrophic, so kill other children, + * and set StartupStatus so we don't try to reinitialize after + * they're gone. Exception: if StartupStatus is STARTUP_SIGNALED, + * then we previously sent the startup process a SIGQUIT; so + * that's probably the reason it died, and we do want to try to + * restart in that case. + * + * This stanza also handles the case where we sent a SIGQUIT + * during PM_STARTUP due to some dead_end child crashing: in that + * situation, if the startup process dies on the SIGQUIT, we need + * to transition to PM_WAIT_BACKENDS state which will allow + * PostmasterStateMachine to restart the startup process. (On the + * other hand, the startup process might complete normally, if we + * were too late with the SIGQUIT. In that case we'll fall + * through and commence normal operations.) 
+ */ + if (!EXIT_STATUS_0(exitstatus)) + { + if (StartupStatus == STARTUP_SIGNALED) + { + StartupStatus = STARTUP_NOT_RUNNING; + if (pmState == PM_STARTUP) + pmState = PM_WAIT_BACKENDS; + } + else + StartupStatus = STARTUP_CRASHED; + HandleChildCrash(pid, exitstatus, + _("startup process")); + continue; + } + + /* + * Startup succeeded, commence normal operations + */ + StartupStatus = STARTUP_NOT_RUNNING; + FatalError = false; + AbortStartTime = 0; + ReachedNormalRunning = true; + pmState = PM_RUN; + connsAllowed = true; + + /* + * Crank up the background tasks, if we didn't do that already + * when we entered consistent recovery state. It doesn't matter + * if this fails, we'll just try again later. + */ + if (CheckpointerPID == 0) + CheckpointerPID = StartCheckpointer(); + if (BgWriterPID == 0) + BgWriterPID = StartBackgroundWriter(); + if (WalWriterPID == 0) + WalWriterPID = StartWalWriter(); + + /* + * Likewise, start other special children as needed. In a restart + * situation, some of them may be alive already. + */ + if (!IsBinaryUpgrade && AutoVacuumingActive() && AutoVacPID == 0) + AutoVacPID = StartAutoVacLauncher(); + if (PgArchStartupAllowed() && PgArchPID == 0) + PgArchPID = StartArchiver(); + + /* workers may be scheduled to start now */ + maybe_start_bgworkers(); + + /* at this point we are really open for business */ + ereport(LOG, + (errmsg("database system is ready to accept connections"))); + + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_READY); +#ifdef USE_SYSTEMD + sd_notify(0, "READY=1"); +#endif + + continue; + } + + /* + * Was it the bgwriter? Normal exit can be ignored; we'll start a new + * one at the next iteration of the postmaster's main loop, if + * necessary. Any other exit condition is treated as a crash. 
+ */ + if (pid == BgWriterPID) + { + BgWriterPID = 0; + if (!EXIT_STATUS_0(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("background writer process")); + continue; + } + + /* + * Was it the checkpointer? + */ + if (pid == CheckpointerPID) + { + CheckpointerPID = 0; + if (EXIT_STATUS_0(exitstatus) && pmState == PM_SHUTDOWN) + { + /* + * OK, we saw normal exit of the checkpointer after it's been + * told to shut down. We expect that it wrote a shutdown + * checkpoint. (If for some reason it didn't, recovery will + * occur on next postmaster start.) + * + * At this point we should have no normal backend children + * left (else we'd not be in PM_SHUTDOWN state) but we might + * have dead_end children to wait for. + * + * If we have an archiver subprocess, tell it to do a last + * archive cycle and quit. Likewise, if we have walsender + * processes, tell them to send any remaining WAL and quit. + */ + Assert(Shutdown > NoShutdown); + + /* Waken archiver for the last time */ + if (PgArchPID != 0) + signal_child(PgArchPID, SIGUSR2); + + /* + * Waken walsenders for the last time. No regular backends + * should be around anymore. + */ + SignalChildren(SIGUSR2); + + pmState = PM_SHUTDOWN_2; + } + else + { + /* + * Any unexpected exit of the checkpointer (including FATAL + * exit) is treated as a crash. + */ + HandleChildCrash(pid, exitstatus, + _("checkpointer process")); + } + + continue; + } + + /* + * Was it the wal writer? Normal exit can be ignored; we'll start a + * new one at the next iteration of the postmaster's main loop, if + * necessary. Any other exit condition is treated as a crash. + */ + if (pid == WalWriterPID) + { + WalWriterPID = 0; + if (!EXIT_STATUS_0(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("WAL writer process")); + continue; + } + + /* + * Was it the wal receiver? If exit status is zero (normal) or one + * (FATAL exit), we assume everything is all right just like normal + * backends. 
(If we need a new wal receiver, we'll start one at the + * next iteration of the postmaster's main loop.) + */ + if (pid == WalReceiverPID) + { + WalReceiverPID = 0; + if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("WAL receiver process")); + continue; + } + + /* + * Was it the autovacuum launcher? Normal exit can be ignored; we'll + * start a new one at the next iteration of the postmaster's main + * loop, if necessary. Any other exit condition is treated as a + * crash. + */ + if (pid == AutoVacPID) + { + AutoVacPID = 0; + if (!EXIT_STATUS_0(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("autovacuum launcher process")); + continue; + } + + /* + * Was it the archiver? If exit status is zero (normal) or one (FATAL + * exit), we assume everything is all right just like normal backends + * and just try to restart a new one so that we immediately retry + * archiving remaining files. (If fail, we'll try again in future + * cycles of the postmaster's main loop.) Unless we were waiting for + * it to shut down; don't restart it in that case, and + * PostmasterStateMachine() will advance to the next shutdown step. + */ + if (pid == PgArchPID) + { + PgArchPID = 0; + if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("archiver process")); + if (PgArchStartupAllowed()) + PgArchPID = StartArchiver(); + continue; + } + + /* Was it the system logger? If so, try to start a new one */ + if (pid == SysLoggerPID) + { + SysLoggerPID = 0; + /* for safety's sake, launch new logger *first* */ + SysLoggerPID = SysLogger_Start(); + if (!EXIT_STATUS_0(exitstatus)) + LogChildExit(LOG, _("system logger process"), + pid, exitstatus); + continue; + } + + /* Was it one of our background workers? */ + if (CleanupBackgroundWorker(pid, exitstatus)) + { + /* have it be restarted */ + HaveCrashedWorker = true; + continue; + } + + /* + * Else do standard backend child cleanup. 
+ */ + CleanupBackend(pid, exitstatus); + } /* loop over pending child-death reports */ + + /* + * After cleaning out the SIGCHLD queue, see if we have any state changes + * or actions to make. + */ + PostmasterStateMachine(); +} + +/* + * Scan the bgworkers list and see if the given PID (which has just stopped + * or crashed) is in it. Handle its shutdown if so, and return true. If not a + * bgworker, return false. + * + * This is heavily based on CleanupBackend. One important difference is that + * we don't know yet that the dying process is a bgworker, so we must be silent + * until we're sure it is. + */ +static bool +CleanupBackgroundWorker(int pid, + int exitstatus) /* child's exit status */ +{ + char namebuf[MAXPGPATH]; + slist_mutable_iter iter; + + slist_foreach_modify(iter, &BackgroundWorkerList) + { + RegisteredBgWorker *rw; + + rw = slist_container(RegisteredBgWorker, rw_lnode, iter.cur); + + if (rw->rw_pid != pid) + continue; + +#ifdef WIN32 + /* see CleanupBackend */ + if (exitstatus == ERROR_WAIT_NO_CHILDREN) + exitstatus = 0; +#endif + + snprintf(namebuf, MAXPGPATH, _("background worker \"%s\""), + rw->rw_worker.bgw_type); + + + if (!EXIT_STATUS_0(exitstatus)) + { + /* Record timestamp, so we know when to restart the worker. */ + rw->rw_crashed_at = GetCurrentTimestamp(); + } + else + { + /* Zero exit status means terminate */ + rw->rw_crashed_at = 0; + rw->rw_terminate = true; + } + + /* + * Additionally, just like a backend, any exit status other than 0 or + * 1 is considered a crash and causes a system-wide restart. + */ + if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus)) + { + HandleChildCrash(pid, exitstatus, namebuf); + return true; + } + + /* + * We must release the postmaster child slot. If the worker failed to + * do so, it did not clean up after itself, requiring a crash-restart + * cycle. 
+ */ + if (!ReleasePostmasterChildSlot(rw->rw_child_slot)) + { + HandleChildCrash(pid, exitstatus, namebuf); + return true; + } + + /* Get it out of the BackendList and clear out remaining data */ + dlist_delete(&rw->rw_backend->elem); +#ifdef EXEC_BACKEND + ShmemBackendArrayRemove(rw->rw_backend); +#endif + + /* + * It's possible that this background worker started some OTHER + * background worker and asked to be notified when that worker started + * or stopped. If so, cancel any notifications destined for the + * now-dead backend. + */ + if (rw->rw_backend->bgworker_notify) + BackgroundWorkerStopNotifications(rw->rw_pid); + free(rw->rw_backend); + rw->rw_backend = NULL; + rw->rw_pid = 0; + rw->rw_child_slot = 0; + ReportBackgroundWorkerExit(&iter); /* report child death */ + + LogChildExit(EXIT_STATUS_0(exitstatus) ? DEBUG1 : LOG, + namebuf, pid, exitstatus); + + return true; + } + + return false; +} + +/* + * CleanupBackend -- cleanup after terminated backend. + * + * Remove all local state associated with backend. + * + * If you change this, see also CleanupBackgroundWorker. + */ +static void +CleanupBackend(int pid, + int exitstatus) /* child's exit status. */ +{ + dlist_mutable_iter iter; + + LogChildExit(DEBUG2, _("server process"), pid, exitstatus); + + /* + * If a backend dies in an ugly way then we must signal all other backends + * to quickdie. If exit status is zero (normal) or one (FATAL exit), we + * assume everything is all right and proceed to remove the backend from + * the active backend list. + */ + +#ifdef WIN32 + + /* + * On win32, also treat ERROR_WAIT_NO_CHILDREN (128) as nonfatal case, + * since that sometimes happens under load when the process fails to start + * properly (long before it starts using shared memory). 
Microsoft reports + * it is related to mutex failure: + * http://archives.postgresql.org/pgsql-hackers/2010-09/msg00790.php + */ + if (exitstatus == ERROR_WAIT_NO_CHILDREN) + { + LogChildExit(LOG, _("server process"), pid, exitstatus); + exitstatus = 0; + } +#endif + + if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus)) + { + HandleChildCrash(pid, exitstatus, _("server process")); + return; + } + + dlist_foreach_modify(iter, &BackendList) + { + Backend *bp = dlist_container(Backend, elem, iter.cur); + + if (bp->pid == pid) + { + if (!bp->dead_end) + { + if (!ReleasePostmasterChildSlot(bp->child_slot)) + { + /* + * Uh-oh, the child failed to clean itself up. Treat as a + * crash after all. + */ + HandleChildCrash(pid, exitstatus, _("server process")); + return; + } +#ifdef EXEC_BACKEND + ShmemBackendArrayRemove(bp); +#endif + } + if (bp->bgworker_notify) + { + /* + * This backend may have been slated to receive SIGUSR1 when + * some background worker started or stopped. Cancel those + * notifications, as we don't want to signal PIDs that are not + * PostgreSQL backends. This gets skipped in the (probably + * very common) case where the backend has never requested any + * such notifications. + */ + BackgroundWorkerStopNotifications(bp->pid); + } + dlist_delete(iter.cur); + free(bp); + break; + } + } +} + +/* + * HandleChildCrash -- cleanup after failed backend, bgwriter, checkpointer, + * walwriter, autovacuum, archiver or background worker. + * + * The objectives here are to clean up our local state about the child + * process, and to signal all other remaining children to quickdie. + */ +static void +HandleChildCrash(int pid, int exitstatus, const char *procname) +{ + dlist_mutable_iter iter; + slist_iter siter; + Backend *bp; + bool take_action; + + /* + * We only log messages and send signals if this is the first process + * crash and we're not doing an immediate shutdown; otherwise, we're only + * here to update postmaster's idea of live processes. 
If we have already + * signaled children, nonzero exit status is to be expected, so don't + * clutter log. + */ + take_action = !FatalError && Shutdown != ImmediateShutdown; + + if (take_action) + { + LogChildExit(LOG, procname, pid, exitstatus); + ereport(LOG, + (errmsg("terminating any other active server processes"))); + SetQuitSignalReason(PMQUIT_FOR_CRASH); + } + + /* Process background workers. */ + slist_foreach(siter, &BackgroundWorkerList) + { + RegisteredBgWorker *rw; + + rw = slist_container(RegisteredBgWorker, rw_lnode, siter.cur); + if (rw->rw_pid == 0) + continue; /* not running */ + if (rw->rw_pid == pid) + { + /* + * Found entry for freshly-dead worker, so remove it. + */ + (void) ReleasePostmasterChildSlot(rw->rw_child_slot); + dlist_delete(&rw->rw_backend->elem); +#ifdef EXEC_BACKEND + ShmemBackendArrayRemove(rw->rw_backend); +#endif + free(rw->rw_backend); + rw->rw_backend = NULL; + rw->rw_pid = 0; + rw->rw_child_slot = 0; + /* don't reset crashed_at */ + /* don't report child stop, either */ + /* Keep looping so we can signal remaining workers */ + } + else + { + /* + * This worker is still alive. Unless we did so already, tell it + * to commit hara-kiri. + */ + if (take_action) + sigquit_child(rw->rw_pid); + } + } + + /* Process regular backends */ + dlist_foreach_modify(iter, &BackendList) + { + bp = dlist_container(Backend, elem, iter.cur); + + if (bp->pid == pid) + { + /* + * Found entry for freshly-dead backend, so remove it. + */ + if (!bp->dead_end) + { + (void) ReleasePostmasterChildSlot(bp->child_slot); +#ifdef EXEC_BACKEND + ShmemBackendArrayRemove(bp); +#endif + } + dlist_delete(iter.cur); + free(bp); + /* Keep looping so we can signal remaining backends */ + } + else + { + /* + * This backend is still alive. Unless we did so already, tell it + * to commit hara-kiri. + * + * We could exclude dead_end children here, but at least when + * sending SIGABRT it seems better to include them. 
+ * + * Background workers were already processed above; ignore them + * here. + */ + if (bp->bkend_type == BACKEND_TYPE_BGWORKER) + continue; + + if (take_action) + sigquit_child(bp->pid); + } + } + + /* Take care of the startup process too */ + if (pid == StartupPID) + { + StartupPID = 0; + /* Caller adjusts StartupStatus, so don't touch it here */ + } + else if (StartupPID != 0 && take_action) + { + sigquit_child(StartupPID); + StartupStatus = STARTUP_SIGNALED; + } + + /* Take care of the bgwriter too */ + if (pid == BgWriterPID) + BgWriterPID = 0; + else if (BgWriterPID != 0 && take_action) + sigquit_child(BgWriterPID); + + /* Take care of the checkpointer too */ + if (pid == CheckpointerPID) + CheckpointerPID = 0; + else if (CheckpointerPID != 0 && take_action) + sigquit_child(CheckpointerPID); + + /* Take care of the walwriter too */ + if (pid == WalWriterPID) + WalWriterPID = 0; + else if (WalWriterPID != 0 && take_action) + sigquit_child(WalWriterPID); + + /* Take care of the walreceiver too */ + if (pid == WalReceiverPID) + WalReceiverPID = 0; + else if (WalReceiverPID != 0 && take_action) + sigquit_child(WalReceiverPID); + + /* Take care of the autovacuum launcher too */ + if (pid == AutoVacPID) + AutoVacPID = 0; + else if (AutoVacPID != 0 && take_action) + sigquit_child(AutoVacPID); + + /* Take care of the archiver too */ + if (pid == PgArchPID) + PgArchPID = 0; + else if (PgArchPID != 0 && take_action) + sigquit_child(PgArchPID); + + /* We do NOT restart the syslogger */ + + if (Shutdown != ImmediateShutdown) + FatalError = true; + + /* We now transit into a state of waiting for children to die */ + if (pmState == PM_RECOVERY || + pmState == PM_HOT_STANDBY || + pmState == PM_RUN || + pmState == PM_STOP_BACKENDS || + pmState == PM_SHUTDOWN) + pmState = PM_WAIT_BACKENDS; + + /* + * .. and if this doesn't happen quickly enough, now the clock is ticking + * for us to kill them without mercy. 
/*
 * LogChildExit -- report how a child process ended.
 *
 * lev is the message level to log at, procname a noun phrase naming the
 * child, pid its process ID and exitstatus its raw wait status.  For a
 * nonzero exit status, the activity string obtained from
 * pgstat_get_crashed_backend_activity() is attached as detail when one is
 * returned.
 */
static void
LogChildExit(int lev, const char *procname, int pid, int exitstatus)
{
	/*
	 * The buffer size is arbitrary; it is set equal to the default
	 * track_activity_query_size.
	 */
	char		crashed_activity[1024];
	const char *activity = NULL;

	if (!EXIT_STATUS_0(exitstatus))
		activity = pgstat_get_crashed_backend_activity(pid,
													   crashed_activity,
													   sizeof(crashed_activity));

	/* Ordinary exit: report the exit code */
	if (WIFEXITED(exitstatus))
	{
		ereport(lev,

		/*------
		  translator: %s is a noun phrase describing a child process, such as
		  "server process" */
				(errmsg("%s (PID %d) exited with exit code %d",
						procname, pid, WEXITSTATUS(exitstatus)),
				 activity ? errdetail("Failed process was running: %s", activity) : 0));
		return;
	}

	/* Neither exited nor signaled: all we can show is the raw status */
	if (!WIFSIGNALED(exitstatus))
	{
		ereport(lev,

		/*------
		  translator: %s is a noun phrase describing a child process, such as
		  "server process" */
				(errmsg("%s (PID %d) exited with unrecognized status %d",
						procname, pid, exitstatus),
				 activity ? errdetail("Failed process was running: %s", activity) : 0));
		return;
	}

	/* Terminated by a signal (reported as an exception on Windows) */
#if defined(WIN32)
	ereport(lev,

	/*------
	  translator: %s is a noun phrase describing a child process, such as
	  "server process" */
			(errmsg("%s (PID %d) was terminated by exception 0x%X",
					procname, pid, WTERMSIG(exitstatus)),
			 errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value."),
			 activity ? errdetail("Failed process was running: %s", activity) : 0));
#else
	ereport(lev,

	/*------
	  translator: %s is a noun phrase describing a child process, such as
	  "server process" */
			(errmsg("%s (PID %d) was terminated by signal %d: %s",
					procname, pid, WTERMSIG(exitstatus),
					pg_strsignal(WTERMSIG(exitstatus))),
			 activity ? errdetail("Failed process was running: %s", activity) : 0));
#endif
}
+ */ + ForgetUnstartedBackgroundWorkers(); + + /* Signal all backend children except walsenders */ + SignalSomeChildren(SIGTERM, + BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND); + /* and the autovac launcher too */ + if (AutoVacPID != 0) + signal_child(AutoVacPID, SIGTERM); + /* and the bgwriter too */ + if (BgWriterPID != 0) + signal_child(BgWriterPID, SIGTERM); + /* and the walwriter too */ + if (WalWriterPID != 0) + signal_child(WalWriterPID, SIGTERM); + /* If we're in recovery, also stop startup and walreceiver procs */ + if (StartupPID != 0) + signal_child(StartupPID, SIGTERM); + if (WalReceiverPID != 0) + signal_child(WalReceiverPID, SIGTERM); + /* checkpointer, archiver, stats, and syslogger may continue for now */ + + /* Now transition to PM_WAIT_BACKENDS state to wait for them to die */ + pmState = PM_WAIT_BACKENDS; + } + + /* + * If we are in a state-machine state that implies waiting for backends to + * exit, see if they're all gone, and change state if so. + */ + if (pmState == PM_WAIT_BACKENDS) + { + /* + * PM_WAIT_BACKENDS state ends when we have no regular backends + * (including autovac workers), no bgworkers (including unconnected + * ones), and no walwriter, autovac launcher or bgwriter. If we are + * doing crash recovery or an immediate shutdown then we expect the + * checkpointer to exit as well, otherwise not. The stats and + * syslogger processes are disregarded since they are not connected to + * shared memory; we also disregard dead_end children here. Walsenders + * and archiver are also disregarded, they will be terminated later + * after writing the checkpoint record. + */ + if (CountChildren(BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND) == 0 && + StartupPID == 0 && + WalReceiverPID == 0 && + BgWriterPID == 0 && + (CheckpointerPID == 0 || + (!FatalError && Shutdown < ImmediateShutdown)) && + WalWriterPID == 0 && + AutoVacPID == 0) + { + if (Shutdown >= ImmediateShutdown || FatalError) + { + /* + * Start waiting for dead_end children to die. 
This state + * change causes ServerLoop to stop creating new ones. + */ + pmState = PM_WAIT_DEAD_END; + + /* + * We already SIGQUIT'd the archiver and stats processes, if + * any, when we started immediate shutdown or entered + * FatalError state. + */ + } + else + { + /* + * If we get here, we are proceeding with normal shutdown. All + * the regular children are gone, and it's time to tell the + * checkpointer to do a shutdown checkpoint. + */ + Assert(Shutdown > NoShutdown); + /* Start the checkpointer if not running */ + if (CheckpointerPID == 0) + CheckpointerPID = StartCheckpointer(); + /* And tell it to shut down */ + if (CheckpointerPID != 0) + { + signal_child(CheckpointerPID, SIGUSR2); + pmState = PM_SHUTDOWN; + } + else + { + /* + * If we failed to fork a checkpointer, just shut down. + * Any required cleanup will happen at next restart. We + * set FatalError so that an "abnormal shutdown" message + * gets logged when we exit. + * + * We don't consult send_abort_for_crash here, as it's + * unlikely that dumping cores would illuminate the reason + * for checkpointer fork failure. + */ + FatalError = true; + pmState = PM_WAIT_DEAD_END; + + /* Kill the walsenders and archiver too */ + SignalChildren(SIGQUIT); + if (PgArchPID != 0) + signal_child(PgArchPID, SIGQUIT); + } + } + } + } + + if (pmState == PM_SHUTDOWN_2) + { + /* + * PM_SHUTDOWN_2 state ends when there's no other children than + * dead_end children left. There shouldn't be any regular backends + * left by now anyway; what we're really waiting for is walsenders and + * archiver. + */ + if (PgArchPID == 0 && CountChildren(BACKEND_TYPE_ALL) == 0) + { + pmState = PM_WAIT_DEAD_END; + } + } + + if (pmState == PM_WAIT_DEAD_END) + { + /* Don't allow any new socket connection events. */ + ConfigurePostmasterWaitSet(false); + + /* + * PM_WAIT_DEAD_END state ends when the BackendList is entirely empty + * (ie, no dead_end children remain), and the archiver is gone too. 
+ * + * The reason we wait for those two is to protect them against a new + * postmaster starting conflicting subprocesses; this isn't an + * ironclad protection, but it at least helps in the + * shutdown-and-immediately-restart scenario. Note that they have + * already been sent appropriate shutdown signals, either during a + * normal state transition leading up to PM_WAIT_DEAD_END, or during + * FatalError processing. + */ + if (dlist_is_empty(&BackendList) && PgArchPID == 0) + { + /* These other guys should be dead already */ + Assert(StartupPID == 0); + Assert(WalReceiverPID == 0); + Assert(BgWriterPID == 0); + Assert(CheckpointerPID == 0); + Assert(WalWriterPID == 0); + Assert(AutoVacPID == 0); + /* syslogger is not considered here */ + pmState = PM_NO_CHILDREN; + } + } + + /* + * If we've been told to shut down, we exit as soon as there are no + * remaining children. If there was a crash, cleanup will occur at the + * next startup. (Before PostgreSQL 8.3, we tried to recover from the + * crash before exiting, but that seems unwise if we are quitting because + * we got SIGTERM from init --- there may well not be time for recovery + * before init decides to SIGKILL us.) + * + * Note that the syslogger continues to run. It will exit when it sees + * EOF on its input pipe, which happens when there are no more upstream + * processes. + */ + if (Shutdown > NoShutdown && pmState == PM_NO_CHILDREN) + { + if (FatalError) + { + ereport(LOG, (errmsg("abnormal database system shutdown"))); + ExitPostmaster(1); + } + else + { + /* + * Normal exit from the postmaster is here. We don't need to log + * anything here, since the UnlinkLockFiles proc_exit callback + * will do so, and that should be the last user-visible action. + */ + ExitPostmaster(0); + } + } + + /* + * If the startup process failed, or the user does not want an automatic + * restart after backend crashes, wait for all non-syslogger children to + * exit, and then exit postmaster. 
We don't try to reinitialize when the + * startup process fails, because more than likely it will just fail again + * and we will keep trying forever. + */ + if (pmState == PM_NO_CHILDREN) + { + if (StartupStatus == STARTUP_CRASHED) + { + ereport(LOG, + (errmsg("shutting down due to startup process failure"))); + ExitPostmaster(1); + } + if (!restart_after_crash) + { + ereport(LOG, + (errmsg("shutting down because restart_after_crash is off"))); + ExitPostmaster(1); + } + } + + /* + * If we need to recover from a crash, wait for all non-syslogger children + * to exit, then reset shmem and StartupDataBase. + */ + if (FatalError && pmState == PM_NO_CHILDREN) + { + ereport(LOG, + (errmsg("all server processes terminated; reinitializing"))); + + /* remove leftover temporary files after a crash */ + if (remove_temp_files_after_crash) + RemovePgTempFiles(); + + /* allow background workers to immediately restart */ + ResetBackgroundWorkerCrashTimes(); + + shmem_exit(1); + + /* re-read control file into local memory */ + LocalProcessControlFile(true); + + /* re-create shared memory and semaphores */ + CreateSharedMemoryAndSemaphores(); + + StartupPID = StartupDataBase(); + Assert(StartupPID != 0); + StartupStatus = STARTUP_RUNNING; + pmState = PM_STARTUP; + /* crash recovery started, reset SIGKILL flag */ + AbortStartTime = 0; + + /* start accepting server socket connection events again */ + ConfigurePostmasterWaitSet(true); + } +} + + +/* + * Send a signal to a postmaster child process + * + * On systems that have setsid(), each child process sets itself up as a + * process group leader. For signals that are generally interpreted in the + * appropriate fashion, we signal the entire process group not just the + * direct child process. This allows us to, for example, SIGQUIT a blocked + * archive_recovery script, or SIGINT a script being run by a backend via + * system(). 
+ * + * There is a race condition for recently-forked children: they might not + * have executed setsid() yet. So we signal the child directly as well as + * the group. We assume such a child will handle the signal before trying + * to spawn any grandchild processes. We also assume that signaling the + * child twice will not cause any problems. + */ +static void +signal_child(pid_t pid, int signal) +{ + if (kill(pid, signal) < 0) + elog(DEBUG3, "kill(%ld,%d) failed: %m", (long) pid, signal); +#ifdef HAVE_SETSID + switch (signal) + { + case SIGINT: + case SIGTERM: + case SIGQUIT: + case SIGKILL: + case SIGABRT: + if (kill(-pid, signal) < 0) + elog(DEBUG3, "kill(%ld,%d) failed: %m", (long) (-pid), signal); + break; + default: + break; + } +#endif +} + +/* + * Convenience function for killing a child process after a crash of some + * other child process. We log the action at a higher level than we would + * otherwise do, and we apply send_abort_for_crash to decide which signal + * to send. Normally it's SIGQUIT -- and most other comments in this file + * are written on the assumption that it is -- but developers might prefer + * to use SIGABRT to collect per-child core dumps. + */ +static void +sigquit_child(pid_t pid) +{ + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (send_abort_for_crash ? "SIGABRT" : "SIGQUIT"), + (int) pid))); + signal_child(pid, (send_abort_for_crash ? SIGABRT : SIGQUIT)); +} + +/* + * Send a signal to the targeted children (but NOT special children; + * dead_end children are never signaled, either). + */ +static bool +SignalSomeChildren(int signal, int target) +{ + dlist_iter iter; + bool signaled = false; + + dlist_foreach(iter, &BackendList) + { + Backend *bp = dlist_container(Backend, elem, iter.cur); + + if (bp->dead_end) + continue; + + /* + * Since target == BACKEND_TYPE_ALL is the most common case, we test + * it first and avoid touching shared memory for every child. 
+ */ + if (target != BACKEND_TYPE_ALL) + { + /* + * Assign bkend_type for any recently announced WAL Sender + * processes. + */ + if (bp->bkend_type == BACKEND_TYPE_NORMAL && + IsPostmasterChildWalSender(bp->child_slot)) + bp->bkend_type = BACKEND_TYPE_WALSND; + + if (!(target & bp->bkend_type)) + continue; + } + + ereport(DEBUG4, + (errmsg_internal("sending signal %d to process %d", + signal, (int) bp->pid))); + signal_child(bp->pid, signal); + signaled = true; + } + return signaled; +} + +/* + * Send a termination signal to children. This considers all of our children + * processes, except syslogger and dead_end backends. + */ +static void +TerminateChildren(int signal) +{ + SignalChildren(signal); + if (StartupPID != 0) + { + signal_child(StartupPID, signal); + if (signal == SIGQUIT || signal == SIGKILL || signal == SIGABRT) + StartupStatus = STARTUP_SIGNALED; + } + if (BgWriterPID != 0) + signal_child(BgWriterPID, signal); + if (CheckpointerPID != 0) + signal_child(CheckpointerPID, signal); + if (WalWriterPID != 0) + signal_child(WalWriterPID, signal); + if (WalReceiverPID != 0) + signal_child(WalReceiverPID, signal); + if (AutoVacPID != 0) + signal_child(AutoVacPID, signal); + if (PgArchPID != 0) + signal_child(PgArchPID, signal); +} + +/* + * BackendStartup -- start backend process + * + * returns: STATUS_ERROR if the fork failed, STATUS_OK otherwise. + * + * Note: if you change this code, also consider StartAutovacuumWorker. + */ +static int +BackendStartup(Port *port) +{ + Backend *bn; /* for backend cleanup */ + pid_t pid; + + /* + * Create backend data structure. Better before the fork() so we can + * handle failure cleanly. + */ + bn = (Backend *) malloc(sizeof(Backend)); + if (!bn) + { + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + return STATUS_ERROR; + } + + /* + * Compute the cancel key that will be assigned to this backend. 
The + * backend will have its own copy in the forked-off process' value of + * MyCancelKey, so that it can transmit the key to the frontend. + */ + if (!RandomCancelKey(&MyCancelKey)) + { + free(bn); + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not generate random cancel key"))); + return STATUS_ERROR; + } + + bn->cancel_key = MyCancelKey; + + /* Pass down canAcceptConnections state */ + port->canAcceptConnections = canAcceptConnections(BACKEND_TYPE_NORMAL); + bn->dead_end = (port->canAcceptConnections != CAC_OK); + + /* + * Unless it's a dead_end child, assign it a child slot number + */ + if (!bn->dead_end) + bn->child_slot = MyPMChildSlot = AssignPostmasterChildSlot(); + else + bn->child_slot = 0; + + /* Hasn't asked to be notified about any bgworkers yet */ + bn->bgworker_notify = false; + +#ifdef EXEC_BACKEND + pid = backend_forkexec(port); +#else /* !EXEC_BACKEND */ + pid = fork_process(); + if (pid == 0) /* child */ + { + free(bn); + + /* Detangle from postmaster */ + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(false); + + /* Perform additional initialization and collect startup packet */ + BackendInitialize(port); + + /* + * Create a per-backend PGPROC struct in shared memory. We must do + * this before we can use LWLocks. In the !EXEC_BACKEND case (here) + * this could be delayed a bit further, but EXEC_BACKEND needs to do + * stuff with LWLocks before PostgresMain(), so we do it here as well + * for symmetry. 
+ */ + InitProcess(); + + /* And run the backend */ + BackendRun(port); + } +#endif /* EXEC_BACKEND */ + + if (pid < 0) + { + /* in parent, fork failed */ + int save_errno = errno; + + if (!bn->dead_end) + (void) ReleasePostmasterChildSlot(bn->child_slot); + free(bn); + errno = save_errno; + ereport(LOG, + (errmsg("could not fork new process for connection: %m"))); + report_fork_failure_to_client(port, save_errno); + return STATUS_ERROR; + } + + /* in parent, successful fork */ + ereport(DEBUG2, + (errmsg_internal("forked new backend, pid=%d socket=%d", + (int) pid, (int) port->sock))); + + /* + * Everything's been successful, it's safe to add this backend to our list + * of backends. + */ + bn->pid = pid; + bn->bkend_type = BACKEND_TYPE_NORMAL; /* Can change later to WALSND */ + dlist_push_head(&BackendList, &bn->elem); + +#ifdef EXEC_BACKEND + if (!bn->dead_end) + ShmemBackendArrayAdd(bn); +#endif + + return STATUS_OK; +} + +/* + * Try to report backend fork() failure to client before we close the + * connection. Since we do not care to risk blocking the postmaster on + * this connection, we set the connection to non-blocking and try only once. + * + * This is grungy special-purpose code; we cannot use backend libpq since + * it's not up and running. + */ +static void +report_fork_failure_to_client(Port *port, int errnum) +{ + char buffer[1000]; + int rc; + + /* Format the error message packet (always V2 protocol) */ + snprintf(buffer, sizeof(buffer), "E%s%s\n", + _("could not fork new process for connection: "), + strerror(errnum)); + + /* Set port to non-blocking. Don't do send() if this fails */ + if (!pg_set_noblock(port->sock)) + return; + + /* We'll retry after EINTR, but ignore all other failures */ + do + { + rc = send(port->sock, buffer, strlen(buffer) + 1, 0); + } while (rc < 0 && errno == EINTR); +} + + +/* + * BackendInitialize -- initialize an interactive (postmaster-child) + * backend process, and collect the client's startup packet. 
+ * + * returns: nothing. Will not return at all if there's any failure. + * + * Note: this code does not depend on having any access to shared memory. + * Indeed, our approach to SIGTERM/timeout handling *requires* that + * shared memory not have been touched yet; see comments within. + * In the EXEC_BACKEND case, we are physically attached to shared memory + * but have not yet set up most of our local pointers to shmem structures. + */ +static void +BackendInitialize(Port *port) +{ + int status; + int ret; + char remote_host[NI_MAXHOST]; + char remote_port[NI_MAXSERV]; + StringInfoData ps_data; + + /* Save port etc. for ps status */ + MyProcPort = port; + + /* Tell fd.c about the long-lived FD associated with the port */ + ReserveExternalFD(); + + /* + * PreAuthDelay is a debugging aid for investigating problems in the + * authentication cycle: it can be set in postgresql.conf to allow time to + * attach to the newly-forked backend with a debugger. (See also + * PostAuthDelay, which we allow clients to pass through PGOPTIONS, but it + * is not honored until after authentication.) + */ + if (PreAuthDelay > 0) + pg_usleep(PreAuthDelay * 1000000L); + + /* This flag will remain set until InitPostgres finishes authentication */ + ClientAuthInProgress = true; /* limit visibility of log messages */ + + /* set these to empty in case they are needed before we set them up */ + port->remote_host = ""; + port->remote_port = ""; + + /* + * Initialize libpq and enable reporting of ereport errors to the client. + * Must do this now because authentication uses libpq to send messages. + */ + pq_init(); /* initialize libpq to talk to client */ + whereToSendOutput = DestRemote; /* now safe to ereport to client */ + + /* + * We arrange to do _exit(1) if we receive SIGTERM or timeout while trying + * to collect the startup packet; while SIGQUIT results in _exit(2).
+ * Otherwise the postmaster cannot shutdown the database FAST or IMMED + * cleanly if a buggy client fails to send the packet promptly. + * + * Exiting with _exit(1) is only possible because we have not yet touched + * shared memory; therefore no outside-the-process state needs to get + * cleaned up. + */ + pqsignal(SIGTERM, process_startup_packet_die); + /* SIGQUIT handler was already set up by InitPostmasterChild */ + InitializeTimeouts(); /* establishes SIGALRM handler */ + sigprocmask(SIG_SETMASK, &StartupBlockSig, NULL); + + /* + * Get the remote host name and port for logging and status display. + */ + remote_host[0] = '\0'; + remote_port[0] = '\0'; + if ((ret = pg_getnameinfo_all(&port->raddr.addr, port->raddr.salen, + remote_host, sizeof(remote_host), + remote_port, sizeof(remote_port), + (log_hostname ? 0 : NI_NUMERICHOST) | NI_NUMERICSERV)) != 0) + ereport(WARNING, + (errmsg_internal("pg_getnameinfo_all() failed: %s", + gai_strerror(ret)))); + + /* + * Save remote_host and remote_port in port structure (after this, they + * will appear in log_line_prefix data for log messages). + * + * Use MemoryContextStrdup() instead of strdup(): strdup() returns NULL on + * out-of-memory and nothing checked for that, although both strings are + * dereferenced unconditionally when the ps display is built at the end of + * this function. TopMemoryContext is named explicitly because we may + * still be running in PostmasterContext here (BackendRun switches away + * from it only later). + */ + port->remote_host = MemoryContextStrdup(TopMemoryContext, remote_host); + port->remote_port = MemoryContextStrdup(TopMemoryContext, remote_port); + + /* And now we can issue the Log_connections message, if wanted */ + if (Log_connections) + { + if (remote_port[0]) + ereport(LOG, + (errmsg("connection received: host=%s port=%s", + remote_host, + remote_port))); + else + ereport(LOG, + (errmsg("connection received: host=%s", + remote_host))); + } + + /* + * If we did a reverse lookup to name, we might as well save the results + * rather than possibly repeating the lookup during authentication. + * + * Note that we don't want to specify NI_NAMEREQD above, because then we'd + * get nothing useful for a client without an rDNS entry. Therefore, we + * must check whether we got a numeric IPv4 or IPv6 address, and not save + * it into remote_hostname if so.
(This test is conservative and might + * sometimes classify a hostname as numeric, but an error in that + * direction is safe; it only results in a possible extra lookup.) + */ + if (log_hostname && + ret == 0 && + strspn(remote_host, "0123456789.") < strlen(remote_host) && + strspn(remote_host, "0123456789ABCDEFabcdef:") < strlen(remote_host)) + port->remote_hostname = MemoryContextStrdup(TopMemoryContext, remote_host); + + /* + * Ready to begin client interaction. We will give up and _exit(1) after + * a time delay, so that a broken client can't hog a connection + * indefinitely. PreAuthDelay and any DNS interactions above don't count + * against the time limit. + * + * Note: AuthenticationTimeout is applied here while waiting for the + * startup packet, and then again in InitPostgres for the duration of any + * authentication operations. So a hostile client could tie up the + * process for nearly twice AuthenticationTimeout before we kick him off. + * + * Note: because PostgresMain will call InitializeTimeouts again, the + * registration of STARTUP_PACKET_TIMEOUT will be lost. This is okay + * since we never use it again after this function. + */ + RegisterTimeout(STARTUP_PACKET_TIMEOUT, StartupPacketTimeoutHandler); + enable_timeout_after(STARTUP_PACKET_TIMEOUT, AuthenticationTimeout * 1000); + + /* + * Receive the startup packet (which might turn out to be a cancel request + * packet). + */ + status = ProcessStartupPacket(port, false, false); + + /* + * Disable the timeout, and prevent SIGTERM again. + */ + disable_timeout(STARTUP_PACKET_TIMEOUT, false); + sigprocmask(SIG_SETMASK, &BlockSig, NULL); + + /* + * As a safety check that nothing in startup has yet performed + * shared-memory modifications that would need to be undone if we had + * exited through SIGTERM or timeout above, check that no on_shmem_exit + * handlers have been registered yet.
(This isn't terribly bulletproof, + * since someone might misuse an on_proc_exit handler for shmem cleanup, + * but it's a cheap and helpful check. We cannot disallow on_proc_exit + * handlers unfortunately, since pq_init() already registered one.) + */ + check_on_shmem_exit_lists_are_empty(); + + /* + * Stop here if it was bad or a cancel packet. ProcessStartupPacket + * already did any appropriate error reporting. + */ + if (status != STATUS_OK) + proc_exit(0); + + /* + * Now that we have the user and database name, we can set the process + * title for ps. It's good to do this as early as possible in startup. + */ + initStringInfo(&ps_data); + if (am_walsender) + appendStringInfo(&ps_data, "%s ", GetBackendTypeDesc(B_WAL_SENDER)); + appendStringInfo(&ps_data, "%s ", port->user_name); + if (port->database_name[0] != '\0') + appendStringInfo(&ps_data, "%s ", port->database_name); + appendStringInfoString(&ps_data, port->remote_host); + if (port->remote_port[0] != '\0') + appendStringInfo(&ps_data, "(%s)", port->remote_port); + + init_ps_display(ps_data.data); + pfree(ps_data.data); + + set_ps_display("initializing"); +} + + +/* + * BackendRun -- set up the backend's argument list and invoke PostgresMain() + * + * returns: + * Doesn't return at all. + */ +static void +BackendRun(Port *port) +{ + /* + * Make sure we aren't in PostmasterContext anymore. (We can't delete it + * just yet, though, because InitPostgres will need the HBA data.) + */ + MemoryContextSwitchTo(TopMemoryContext); + + PostgresMain(port->database_name, port->user_name); +} + + +#ifdef EXEC_BACKEND + +/* + * postmaster_forkexec -- fork and exec a postmaster subprocess + * + * The caller must have set up the argv array already, except for argv[2] + * which will be filled with the name of the temp variable file. + * + * Returns the child process PID, or -1 on fork failure (a suitable error + * message has been logged on failure).
+ * + * All uses of this routine will dispatch to SubPostmasterMain in the + * child process. + */ +pid_t +postmaster_forkexec(int argc, char *argv[]) +{ + Port port; + + /* This entry point passes dummy values for the Port variables */ + memset(&port, 0, sizeof(port)); + return internal_forkexec(argc, argv, &port); +} + +/* + * backend_forkexec -- fork/exec off a backend process + * + * Some operating systems (WIN32) don't have fork() so we have to simulate + * it by storing parameters that need to be passed to the child and + * then create a new child process. + * + * returns the pid of the fork/exec'd process, or -1 on failure + */ +static pid_t +backend_forkexec(Port *port) +{ + char *av[4]; + int ac = 0; + + av[ac++] = "postgres"; + av[ac++] = "--forkbackend"; + av[ac++] = NULL; /* filled in by internal_forkexec */ + + av[ac] = NULL; + Assert(ac < lengthof(av)); + + return internal_forkexec(ac, av, port); +} + +#ifndef WIN32 + +/* + * internal_forkexec non-win32 implementation + * + * - writes out backend variables to the parameter file + * - fork():s, and then exec():s the child process + * + * Returns the child's PID in the parent, or -1 if the backend variables + * could not be saved, the parameter file could not be written, or the + * fork failed. + */ +static pid_t +internal_forkexec(int argc, char *argv[], Port *port) +{ + static unsigned long tmpBackendFileNum = 0; + pid_t pid; + char tmpfilename[MAXPGPATH]; + BackendParameters param; + FILE *fp; + + /* Fill the local parameter struct that is written to the file below */ + if (!save_backend_variables(&param, port)) + return -1; /* log made by save_backend_variables */ + + /* Calculate name for temp file */ + snprintf(tmpfilename, MAXPGPATH, "%s/%s.backend_var.%d.%lu", + PG_TEMP_FILES_DIR, PG_TEMP_FILE_PREFIX, + MyProcPid, ++tmpBackendFileNum); + + /* Open file */ + fp = AllocateFile(tmpfilename, PG_BINARY_W); + if (!fp) + { + /* + * As in OpenTemporaryFileInTablespace, try to make the temp-file + * directory, ignoring errors.
+ */ + (void) MakePGDirectory(PG_TEMP_FILES_DIR); + + fp = AllocateFile(tmpfilename, PG_BINARY_W); + if (!fp) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + tmpfilename))); + return -1; + } + } + + /* Write the whole parameter struct as a single binary record */ + if (fwrite(&param, sizeof(param), 1, fp) != 1) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmpfilename))); + FreeFile(fp); + return -1; + } + + /* Release file */ + if (FreeFile(fp)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmpfilename))); + return -1; + } + + /* Make sure caller set up argv properly */ + Assert(argc >= 3); + Assert(argv[argc] == NULL); + Assert(strncmp(argv[1], "--fork", 6) == 0); + Assert(argv[2] == NULL); + + /* Insert temp file name after --fork argument */ + argv[2] = tmpfilename; + + /* Fire off execv in child */ + if ((pid = fork_process()) == 0) + { + if (execv(postgres_exec_path, argv) < 0) + { + ereport(LOG, + (errmsg("could not execute server process \"%s\": %m", + postgres_exec_path))); + /* We're already in the child process here, can't return */ + exit(1); + } + } + + return pid; /* Parent returns pid, or -1 on fork failure */ +} +#else /* WIN32 */ + +/* + * internal_forkexec win32 implementation + * + * - starts backend using CreateProcess(), in suspended state + * - writes out backend variables to the parameter file + * - during this, duplicates handles and sockets required for + * inheritance into the new process + * - resumes execution of the new process once the backend parameter + * file is complete.
+ */ +static pid_t +internal_forkexec(int argc, char *argv[], Port *port) +{ + int retry_count = 0; + STARTUPINFO si; + PROCESS_INFORMATION pi; + int i; + int j; + char cmdLine[MAXPGPATH * 2]; + HANDLE paramHandle; + BackendParameters *param; + SECURITY_ATTRIBUTES sa; + char paramHandleStr[32]; + win32_deadchild_waitinfo *childinfo; + + /* Make sure caller set up argv properly */ + Assert(argc >= 3); + Assert(argv[argc] == NULL); + Assert(strncmp(argv[1], "--fork", 6) == 0); + Assert(argv[2] == NULL); + + /* Resume here if we need to retry */ +retry: + + /* Set up shared memory for parameter passing */ + ZeroMemory(&sa, sizeof(sa)); + sa.nLength = sizeof(sa); + sa.bInheritHandle = TRUE; + paramHandle = CreateFileMapping(INVALID_HANDLE_VALUE, + &sa, + PAGE_READWRITE, + 0, + sizeof(BackendParameters), + NULL); + if (paramHandle == INVALID_HANDLE_VALUE) + { + ereport(LOG, + (errmsg("could not create backend parameter file mapping: error code %lu", + GetLastError()))); + return -1; + } + + param = MapViewOfFile(paramHandle, FILE_MAP_WRITE, 0, 0, sizeof(BackendParameters)); + if (!param) + { + ereport(LOG, + (errmsg("could not map backend parameter memory: error code %lu", + GetLastError()))); + CloseHandle(paramHandle); + return -1; + } + + /* Insert temp file name after --fork argument */ +#ifdef _WIN64 + sprintf(paramHandleStr, "%llu", (LONG_PTR) paramHandle); +#else + sprintf(paramHandleStr, "%lu", (DWORD) paramHandle); +#endif + argv[2] = paramHandleStr; + + /* Format the cmd line */ + cmdLine[sizeof(cmdLine) - 1] = '\0'; + cmdLine[sizeof(cmdLine) - 2] = '\0'; + snprintf(cmdLine, sizeof(cmdLine) - 1, "\"%s\"", postgres_exec_path); + i = 0; + while (argv[++i] != NULL) + { + j = strlen(cmdLine); + snprintf(cmdLine + j, sizeof(cmdLine) - 1 - j, " \"%s\"", argv[i]); + } + if (cmdLine[sizeof(cmdLine) - 2] != '\0') + { + ereport(LOG, + (errmsg("subprocess command line too long"))); + UnmapViewOfFile(param); + CloseHandle(paramHandle); + return -1; + } + + memset(&pi, 0, 
sizeof(pi)); + memset(&si, 0, sizeof(si)); + si.cb = sizeof(si); + + /* + * Create the subprocess in a suspended state. This will be resumed later, + * once we have written out the parameter file. + */ + if (!CreateProcess(NULL, cmdLine, NULL, NULL, TRUE, CREATE_SUSPENDED, + NULL, NULL, &si, &pi)) + { + ereport(LOG, + (errmsg("CreateProcess() call failed: %m (error code %lu)", + GetLastError()))); + UnmapViewOfFile(param); + CloseHandle(paramHandle); + return -1; + } + + if (!save_backend_variables(param, port, pi.hProcess, pi.dwProcessId)) + { + /* + * log made by save_backend_variables, but we have to clean up the + * mess with the half-started process + */ + if (!TerminateProcess(pi.hProcess, 255)) + ereport(LOG, + (errmsg_internal("could not terminate unstarted process: error code %lu", + GetLastError()))); + CloseHandle(pi.hProcess); + CloseHandle(pi.hThread); + UnmapViewOfFile(param); + CloseHandle(paramHandle); + return -1; /* log made by save_backend_variables */ + } + + /* Drop the parameter shared memory that is now inherited to the backend */ + if (!UnmapViewOfFile(param)) + ereport(LOG, + (errmsg("could not unmap view of backend parameter file: error code %lu", + GetLastError()))); + if (!CloseHandle(paramHandle)) + ereport(LOG, + (errmsg("could not close handle to backend parameter file: error code %lu", + GetLastError()))); + + /* + * Reserve the memory region used by our main shared memory segment before + * we resume the child process. Normally this should succeed, but if ASLR + * is active then it might sometimes fail due to the stack or heap having + * gotten mapped into that range. In that case, just terminate the + * process and retry. 
+ */ + if (!pgwin32_ReserveSharedMemoryRegion(pi.hProcess)) + { + /* pgwin32_ReserveSharedMemoryRegion already made a log entry */ + if (!TerminateProcess(pi.hProcess, 255)) + ereport(LOG, + (errmsg_internal("could not terminate process that failed to reserve memory: error code %lu", + GetLastError()))); + CloseHandle(pi.hProcess); + CloseHandle(pi.hThread); + if (++retry_count < 100) + goto retry; + ereport(LOG, + (errmsg("giving up after too many tries to reserve shared memory"), + errhint("This might be caused by ASLR or antivirus software."))); + return -1; + } + + /* + * Now that the backend variables are written out, we start the child + * thread so it can start initializing while we set up the rest of the + * parent state. + */ + if (ResumeThread(pi.hThread) == -1) + { + if (!TerminateProcess(pi.hProcess, 255)) + { + ereport(LOG, + (errmsg_internal("could not terminate unstartable process: error code %lu", + GetLastError()))); + CloseHandle(pi.hProcess); + CloseHandle(pi.hThread); + return -1; + } + CloseHandle(pi.hProcess); + CloseHandle(pi.hThread); + ereport(LOG, + (errmsg_internal("could not resume thread of unstarted process: error code %lu", + GetLastError()))); + return -1; + } + + /* + * Queue a waiter to signal when this child dies. The wait will be handled + * automatically by an operating system thread pool. The memory will be + * freed by a later call to waitpid(). 
+ */ + childinfo = palloc(sizeof(win32_deadchild_waitinfo)); + childinfo->procHandle = pi.hProcess; + childinfo->procId = pi.dwProcessId; + + if (!RegisterWaitForSingleObject(&childinfo->waitHandle, + pi.hProcess, + pgwin32_deadchild_callback, + childinfo, + INFINITE, + WT_EXECUTEONLYONCE | WT_EXECUTEINWAITTHREAD)) + ereport(FATAL, + (errmsg_internal("could not register process for wait: error code %lu", + GetLastError()))); + + /* Don't close pi.hProcess here - waitpid() needs access to it */ + + CloseHandle(pi.hThread); + + return pi.dwProcessId; +} +#endif /* WIN32 */ + + +/* + * SubPostmasterMain -- Get the fork/exec'd process into a state equivalent + * to what it would be if we'd simply forked on Unix, and then + * dispatch to the appropriate place. + * + * The first two command line arguments are expected to be "--forkFOO" + * (where FOO indicates which postmaster child we are to become), and + * the name of a variables file that we can read to load data that would + * have been inherited by fork() on Unix. Remaining arguments go to the + * subprocess FooMain() routine. + */ +void +SubPostmasterMain(int argc, char *argv[]) +{ + Port port; + + /* In EXEC_BACKEND case we will not have inherited these settings */ + IsPostmasterEnvironment = true; + whereToSendOutput = DestNone; + + /* Setup essential subsystems (to ensure elog() behaves sanely) */ + InitializeGUCOptions(); + + /* Check we got appropriate args */ + if (argc < 3) + elog(FATAL, "invalid subpostmaster invocation"); + + /* Read in the variables file */ + memset(&port, 0, sizeof(Port)); + read_backend_variables(argv[2], &port); + + /* Close the postmaster's sockets (as soon as we know them) */ + ClosePostmasterPorts(strcmp(argv[1], "--forklog") == 0); + + /* Setup as postmaster child */ + InitPostmasterChild(); + + /* + * If appropriate, physically re-attach to shared memory segment. 
We want + * to do this before going any further to ensure that we can attach at the + * same address the postmaster used. On the other hand, if we choose not + * to re-attach, we may have other cleanup to do. + * + * If testing EXEC_BACKEND on Linux, you should run this as root before + * starting the postmaster: + * + * sysctl -w kernel.randomize_va_space=0 + * + * This prevents using randomized stack and code addresses that cause the + * child process's memory map to be different from the parent's, making it + * sometimes impossible to attach to shared memory at the desired address. + * Return the setting to its old value (usually '1' or '2') when finished. + */ + if (strcmp(argv[1], "--forkbackend") == 0 || + strcmp(argv[1], "--forkavlauncher") == 0 || + strcmp(argv[1], "--forkavworker") == 0 || + strcmp(argv[1], "--forkaux") == 0 || + strncmp(argv[1], "--forkbgworker=", 15) == 0) + PGSharedMemoryReAttach(); + else + PGSharedMemoryNoReAttach(); + + /* autovacuum needs this set before calling InitProcess */ + if (strcmp(argv[1], "--forkavlauncher") == 0) + AutovacuumLauncherIAm(); + if (strcmp(argv[1], "--forkavworker") == 0) + AutovacuumWorkerIAm(); + + /* Read in remaining GUC variables */ + read_nondefault_variables(); + + /* + * Check that the data directory looks valid, which will also check the + * privileges on the data directory and update our umask and file/group + * variables for creating files later. Note: this should really be done + * before we create any files or directories. + */ + checkDataDir(); + + /* + * (re-)read control file, as it contains config. The postmaster will + * already have read this, but this process doesn't know about that. + */ + LocalProcessControlFile(false); + + /* + * Reload any libraries that were preloaded by the postmaster. Since we + * exec'd this process, those libraries didn't come along with us; but we + * should load them into all child processes to be consistent with the + * non-EXEC_BACKEND behavior. 
+ */ + process_shared_preload_libraries(); + + /* Run backend or appropriate child */ + if (strcmp(argv[1], "--forkbackend") == 0) + { + Assert(argc == 3); /* shouldn't be any more args */ + + /* + * Need to reinitialize the SSL library in the backend, since the + * context structures contain function pointers and cannot be passed + * through the parameter file. + * + * If for some reason reload fails (maybe the user installed broken + * key files), soldier on without SSL; that's better than all + * connections becoming impossible. + * + * XXX should we do this in all child processes? For the moment it's + * enough to do it in backend children. + */ +#ifdef USE_SSL + if (EnableSSL) + { + if (secure_initialize(false) == 0) + LoadedSSL = true; + else + ereport(LOG, + (errmsg("SSL configuration could not be loaded in child process"))); + } +#endif + + /* + * Perform additional initialization and collect startup packet. + * + * We want to do this before InitProcess() for a couple of reasons: 1. + * so that we aren't eating up a PGPROC slot while waiting on the + * client. 2. so that if InitProcess() fails due to being out of + * PGPROC slots, we have already initialized libpq and are able to + * report the error to the client. 
+ */ + BackendInitialize(&port); + + /* Restore basic shared memory pointers */ + InitShmemAccess(UsedShmemSegAddr); + + /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */ + InitProcess(); + + /* Attach process to shared data structures */ + CreateSharedMemoryAndSemaphores(); + + /* And run the backend */ + BackendRun(&port); /* does not return */ + } + if (strcmp(argv[1], "--forkaux") == 0) + { + AuxProcType auxtype; + + Assert(argc == 4); + + /* Restore basic shared memory pointers */ + InitShmemAccess(UsedShmemSegAddr); + + /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */ + InitAuxiliaryProcess(); + + /* Attach process to shared data structures */ + CreateSharedMemoryAndSemaphores(); + + auxtype = atoi(argv[3]); + AuxiliaryProcessMain(auxtype); /* does not return */ + } + if (strcmp(argv[1], "--forkavlauncher") == 0) + { + /* Restore basic shared memory pointers */ + InitShmemAccess(UsedShmemSegAddr); + + /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */ + InitProcess(); + + /* Attach process to shared data structures */ + CreateSharedMemoryAndSemaphores(); + + AutoVacLauncherMain(argc - 2, argv + 2); /* does not return */ + } + if (strcmp(argv[1], "--forkavworker") == 0) + { + /* Restore basic shared memory pointers */ + InitShmemAccess(UsedShmemSegAddr); + + /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */ + InitProcess(); + + /* Attach process to shared data structures */ + CreateSharedMemoryAndSemaphores(); + + AutoVacWorkerMain(argc - 2, argv + 2); /* does not return */ + } + if (strncmp(argv[1], "--forkbgworker=", 15) == 0) + { + int shmem_slot; + + /* do this as early as possible; in particular, before InitProcess() */ + IsBackgroundWorker = true; + + /* Restore basic shared memory pointers */ + InitShmemAccess(UsedShmemSegAddr); + + /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */ + InitProcess(); + + /* Attach process to shared data structures */ + CreateSharedMemoryAndSemaphores(); + + /* Fetch 
MyBgworkerEntry from shared memory */ + shmem_slot = atoi(argv[1] + 15); + MyBgworkerEntry = BackgroundWorkerEntry(shmem_slot); + + StartBackgroundWorker(); + } + if (strcmp(argv[1], "--forklog") == 0) + { + /* Do not want to attach to shared memory */ + + SysLoggerMain(argc, argv); /* does not return */ + } + + abort(); /* shouldn't get here */ +} +#endif /* EXEC_BACKEND */ + + +/* + * ExitPostmaster -- cleanup + * + * Do NOT call exit() directly --- always go through here! + */ +static void +ExitPostmaster(int status) +{ +#ifdef HAVE_PTHREAD_IS_THREADED_NP + + /* + * There is no known cause for a postmaster to become multithreaded after + * startup. Recheck to account for the possibility of unknown causes. + * This message uses LOG level, because an unclean shutdown at this point + * would usually not look much different from a clean shutdown. + */ + if (pthread_is_threaded_np() != 0) + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg_internal("postmaster became multithreaded"), + errdetail("Please report this to <%s>.", PACKAGE_BUGREPORT))); +#endif + + /* should cleanup shared memory and kill all backends */ + + /* + * Not sure of the semantics here. When the Postmaster dies, should the + * backends all be killed? probably not. + * + * MUST -- vadim 05-10-1999 + */ + + proc_exit(status); +} + +/* + * Handle pmsignal conditions representing requests from backends, + * and check for promote and logrotate requests from pg_ctl. + */ +static void +process_pm_pmsignal(void) +{ + pending_pm_pmsignal = false; + + ereport(DEBUG2, + (errmsg_internal("postmaster received pmsignal signal"))); + + /* + * RECOVERY_STARTED and BEGIN_HOT_STANDBY signals are ignored in + * unexpected states. If the startup process quickly starts up, completes + * recovery, exits, we might process the death of the startup process + * first. We don't want to go back to recovery in that case. 
+ */ + if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_STARTED) && + pmState == PM_STARTUP && Shutdown == NoShutdown) + { + /* WAL redo has started. We're out of reinitialization. */ + FatalError = false; + AbortStartTime = 0; + + /* + * Start the archiver if we're responsible for (re-)archiving received + * files. + */ + Assert(PgArchPID == 0); + if (XLogArchivingAlways()) + PgArchPID = StartArchiver(); + + /* + * If we aren't planning to enter hot standby mode later, treat + * RECOVERY_STARTED as meaning we're out of startup, and report status + * accordingly. + */ + if (!EnableHotStandby) + { + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STANDBY); +#ifdef USE_SYSTEMD + sd_notify(0, "READY=1"); +#endif + } + + pmState = PM_RECOVERY; + } + + if (CheckPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY) && + pmState == PM_RECOVERY && Shutdown == NoShutdown) + { + ereport(LOG, + (errmsg("database system is ready to accept read-only connections"))); + + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_READY); +#ifdef USE_SYSTEMD + sd_notify(0, "READY=1"); +#endif + + pmState = PM_HOT_STANDBY; + connsAllowed = true; + + /* Some workers may be scheduled to start now */ + StartWorkerNeeded = true; + } + + /* Process background worker state changes. */ + if (CheckPostmasterSignal(PMSIGNAL_BACKGROUND_WORKER_CHANGE)) + { + /* Accept new worker requests only if not stopping. 
*/ + BackgroundWorkerStateChange(pmState < PM_STOP_BACKENDS); + StartWorkerNeeded = true; + } + + if (StartWorkerNeeded || HaveCrashedWorker) + maybe_start_bgworkers(); + + /* Tell syslogger to rotate logfile if requested */ + if (SysLoggerPID != 0) + { + if (CheckLogrotateSignal()) + { + signal_child(SysLoggerPID, SIGUSR1); + RemoveLogrotateSignalFiles(); + } + else if (CheckPostmasterSignal(PMSIGNAL_ROTATE_LOGFILE)) + { + signal_child(SysLoggerPID, SIGUSR1); + } + } + + if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER) && + Shutdown <= SmartShutdown && pmState < PM_STOP_BACKENDS) + { + /* + * Start one iteration of the autovacuum daemon, even if autovacuuming + * is nominally not enabled. This is so we can have an active defense + * against transaction ID wraparound. We set a flag for the main loop + * to do it rather than trying to do it here --- this is because the + * autovac process itself may send the signal, and we want to handle + * that by launching another iteration as soon as the current one + * completes. + */ + start_autovac_launcher = true; + } + + if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER) && + Shutdown <= SmartShutdown && pmState < PM_STOP_BACKENDS) + { + /* The autovacuum launcher wants us to start a worker process. */ + StartAutovacuumWorker(); + } + + if (CheckPostmasterSignal(PMSIGNAL_START_WALRECEIVER)) + { + /* Startup Process wants us to start the walreceiver process. */ + /* Start immediately if possible, else remember request for later. */ + WalReceiverRequested = true; + MaybeStartWalReceiver(); + } + + /* + * Try to advance postmaster's state machine, if a child requests it. + * + * Be careful about the order of this action relative to this function's + * other actions. Generally, this should be after other actions, in case + * they have effects PostmasterStateMachine would need to know about. 
+ * However, we should do it before the CheckPromoteSignal step, which + * cannot have any (immediate) effect on the state machine, but does + * depend on what state we're in now. + */ + if (CheckPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE)) + { + PostmasterStateMachine(); + } + + if (StartupPID != 0 && + (pmState == PM_STARTUP || pmState == PM_RECOVERY || + pmState == PM_HOT_STANDBY) && + CheckPromoteSignal()) + { + /* + * Tell startup process to finish recovery. + * + * Leave the promote signal file in place and let the Startup process + * do the unlink. + */ + signal_child(StartupPID, SIGUSR2); + } +} + +/* + * SIGTERM while processing startup packet. + * + * Running proc_exit() from a signal handler would be quite unsafe. + * However, since we have not yet touched shared memory, we can just + * pull the plug and exit without running any atexit handlers. + * + * One might be tempted to try to send a message, or log one, indicating + * why we are disconnecting. However, that would be quite unsafe in itself. + * Also, it seems undesirable to provide clues about the database's state + * to a client that has not yet completed authentication, or even sent us + * a startup packet. + */ +static void +process_startup_packet_die(SIGNAL_ARGS) +{ + _exit(1); +} + +/* + * Dummy signal handler + * + * We use this for signals that we don't actually use in the postmaster, + * but we do use in backends. If we were to SIG_IGN such signals in the + * postmaster, then a newly started backend might drop a signal that arrives + * before it's able to reconfigure its signal processing. (See notes in + * tcop/postgres.c.) + */ +static void +dummy_handler(SIGNAL_ARGS) +{ +} + +/* + * Timeout while processing startup packet. + * As for process_startup_packet_die(), we exit via _exit(1). + */ +static void +StartupPacketTimeoutHandler(void) +{ + _exit(1); +} + + +/* + * Generate a random cancel key. 
+ */ +static bool +RandomCancelKey(int32 *cancel_key) +{ + return pg_strong_random(cancel_key, sizeof(int32)); +} + +/* + * Count up number of child processes of specified types (dead_end children + * are always excluded). + */ +static int +CountChildren(int target) +{ + dlist_iter iter; + int cnt = 0; + + dlist_foreach(iter, &BackendList) + { + Backend *bp = dlist_container(Backend, elem, iter.cur); + + if (bp->dead_end) + continue; + + /* + * Since target == BACKEND_TYPE_ALL is the most common case, we test + * it first and avoid touching shared memory for every child. + */ + if (target != BACKEND_TYPE_ALL) + { + /* + * Assign bkend_type for any recently announced WAL Sender + * processes. + */ + if (bp->bkend_type == BACKEND_TYPE_NORMAL && + IsPostmasterChildWalSender(bp->child_slot)) + bp->bkend_type = BACKEND_TYPE_WALSND; + + if (!(target & bp->bkend_type)) + continue; + } + + cnt++; + } + return cnt; +} + + +/* + * StartChildProcess -- start an auxiliary process for the postmaster + * + * "type" determines what kind of child will be started. All child types + * initially go to AuxiliaryProcessMain, which will handle common setup. + * + * Return value of StartChildProcess is subprocess' PID, or 0 if failed + * to start subprocess. 
+ */ +static pid_t +StartChildProcess(AuxProcType type) +{ + pid_t pid; + +#ifdef EXEC_BACKEND + { + char *av[10]; + int ac = 0; + char typebuf[32]; + + /* + * Set up command-line arguments for subprocess + */ + av[ac++] = "postgres"; + av[ac++] = "--forkaux"; + av[ac++] = NULL; /* filled in by postmaster_forkexec */ + + snprintf(typebuf, sizeof(typebuf), "%d", type); + av[ac++] = typebuf; + + av[ac] = NULL; + Assert(ac < lengthof(av)); + + pid = postmaster_forkexec(ac, av); + } +#else /* !EXEC_BACKEND */ + pid = fork_process(); + + if (pid == 0) /* child */ + { + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(false); + + /* Release postmaster's working memory context */ + MemoryContextSwitchTo(TopMemoryContext); + MemoryContextDelete(PostmasterContext); + PostmasterContext = NULL; + + AuxiliaryProcessMain(type); /* does not return */ + } +#endif /* EXEC_BACKEND */ + + if (pid < 0) + { + /* in parent, fork failed */ + int save_errno = errno; + + errno = save_errno; + switch (type) + { + case StartupProcess: + ereport(LOG, + (errmsg("could not fork startup process: %m"))); + break; + case ArchiverProcess: + ereport(LOG, + (errmsg("could not fork archiver process: %m"))); + break; + case BgWriterProcess: + ereport(LOG, + (errmsg("could not fork background writer process: %m"))); + break; + case CheckpointerProcess: + ereport(LOG, + (errmsg("could not fork checkpointer process: %m"))); + break; + case WalWriterProcess: + ereport(LOG, + (errmsg("could not fork WAL writer process: %m"))); + break; + case WalReceiverProcess: + ereport(LOG, + (errmsg("could not fork WAL receiver process: %m"))); + break; + default: + ereport(LOG, + (errmsg("could not fork process: %m"))); + break; + } + + /* + * fork failure is fatal during startup, but there's no need to choke + * immediately if starting other child types fails. 
+ */ + if (type == StartupProcess) + ExitPostmaster(1); + return 0; + } + + /* + * in parent, successful fork + */ + return pid; +} + +/* + * StartAutovacuumWorker + * Start an autovac worker process. + * + * This function is here because it enters the resulting PID into the + * postmaster's private backends list. + * + * NB -- this code very roughly matches BackendStartup. + */ +static void +StartAutovacuumWorker(void) +{ + Backend *bn; + + /* + * If not in condition to run a process, don't try, but handle it like a + * fork failure. This does not normally happen, since the signal is only + * supposed to be sent by autovacuum launcher when it's OK to do it, but + * we have to check to avoid race-condition problems during DB state + * changes. + */ + if (canAcceptConnections(BACKEND_TYPE_AUTOVAC) == CAC_OK) + { + /* + * Compute the cancel key that will be assigned to this session. We + * probably don't need cancel keys for autovac workers, but we'd + * better have something random in the field to prevent unfriendly + * people from sending cancels to them. 
+ */ + if (!RandomCancelKey(&MyCancelKey)) + { + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not generate random cancel key"))); + return; + } + + bn = (Backend *) malloc(sizeof(Backend)); + if (bn) + { + bn->cancel_key = MyCancelKey; + + /* Autovac workers are not dead_end and need a child slot */ + bn->dead_end = false; + bn->child_slot = MyPMChildSlot = AssignPostmasterChildSlot(); + bn->bgworker_notify = false; + + bn->pid = StartAutoVacWorker(); + if (bn->pid > 0) + { + bn->bkend_type = BACKEND_TYPE_AUTOVAC; + dlist_push_head(&BackendList, &bn->elem); +#ifdef EXEC_BACKEND + ShmemBackendArrayAdd(bn); +#endif + /* all OK */ + return; + } + + /* + * fork failed, fall through to report -- actual error message was + * logged by StartAutoVacWorker + */ + (void) ReleasePostmasterChildSlot(bn->child_slot); + free(bn); + } + else + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + /* + * Report the failure to the launcher, if it's running. (If it's not, we + * might not even be connected to shared memory, so don't try to call + * AutoVacWorkerFailed.) Note that we also need to signal it so that it + * responds to the condition, but we don't do that here, instead waiting + * for ServerLoop to do it. This way we avoid a ping-pong signaling in + * quick succession between the autovac launcher and postmaster in case + * things get ugly. + */ + if (AutoVacPID != 0) + { + AutoVacWorkerFailed(); + avlauncher_needs_signal = true; + } +} + +/* + * MaybeStartWalReceiver + * Start the WAL receiver process, if not running and our state allows. + * + * Note: if WalReceiverPID is already nonzero, it might seem that we should + * clear WalReceiverRequested. However, there's a race condition if the + * walreceiver terminates and the startup process immediately requests a new + * one: it's quite possible to get the signal for the request before reaping + * the dead walreceiver process. 
Better to risk launching an extra + * walreceiver than to miss launching one we need. (The walreceiver code + * has logic to recognize that it should go away if not needed.) + */ +static void +MaybeStartWalReceiver(void) +{ + if (WalReceiverPID == 0 && + (pmState == PM_STARTUP || pmState == PM_RECOVERY || + pmState == PM_HOT_STANDBY) && + Shutdown <= SmartShutdown) + { + WalReceiverPID = StartWalReceiver(); + if (WalReceiverPID != 0) + WalReceiverRequested = false; + /* else leave the flag set, so we'll try again later */ + } +} + + +/* + * Create the opts file + */ +static bool +CreateOptsFile(int argc, char *argv[], char *fullprogname) +{ + FILE *fp; + int i; + +#define OPTS_FILE "postmaster.opts" + + if ((fp = fopen(OPTS_FILE, "w")) == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", OPTS_FILE))); + return false; + } + + fprintf(fp, "%s", fullprogname); + for (i = 1; i < argc; i++) + fprintf(fp, " \"%s\"", argv[i]); + fputs("\n", fp); + + if (fclose(fp)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", OPTS_FILE))); + return false; + } + + return true; +} + + +/* + * MaxLivePostmasterChildren + * + * This reports the number of entries needed in per-child-process arrays + * (the PMChildFlags array, and if EXEC_BACKEND the ShmemBackendArray). + * These arrays include regular backends, autovac workers, walsenders + * and background workers, but not special children nor dead_end children. + * This allows the arrays to have a fixed maximum size, to wit the same + * too-many-children limit enforced by canAcceptConnections(). The exact value + * isn't too critical as long as it's more than MaxBackends. + */ +int +MaxLivePostmasterChildren(void) +{ + return 2 * (MaxConnections + autovacuum_max_workers + 1 + + max_wal_senders + max_worker_processes); +} + +/* + * Connect background worker to a database. 
+ */ +void +BackgroundWorkerInitializeConnection(const char *dbname, const char *username, uint32 flags) +{ + BackgroundWorker *worker = MyBgworkerEntry; + + /* XXX is this the right errcode? */ + if (!(worker->bgw_flags & BGWORKER_BACKEND_DATABASE_CONNECTION)) + ereport(FATAL, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database connection requirement not indicated during registration"))); + + InitPostgres(dbname, InvalidOid, /* database to connect to */ + username, InvalidOid, /* role to connect as */ + false, /* never honor session_preload_libraries */ + (flags & BGWORKER_BYPASS_ALLOWCONN) != 0, /* ignore datallowconn? */ + NULL); /* no out_dbname */ + + /* it had better not gotten out of "init" mode yet */ + if (!IsInitProcessingMode()) + ereport(ERROR, + (errmsg("invalid processing mode in background worker"))); + SetProcessingMode(NormalProcessing); +} + +/* + * Connect background worker to a database using OIDs. + */ +void +BackgroundWorkerInitializeConnectionByOid(Oid dboid, Oid useroid, uint32 flags) +{ + BackgroundWorker *worker = MyBgworkerEntry; + + /* XXX is this the right errcode? */ + if (!(worker->bgw_flags & BGWORKER_BACKEND_DATABASE_CONNECTION)) + ereport(FATAL, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database connection requirement not indicated during registration"))); + + InitPostgres(NULL, dboid, /* database to connect to */ + NULL, useroid, /* role to connect as */ + false, /* never honor session_preload_libraries */ + (flags & BGWORKER_BYPASS_ALLOWCONN) != 0, /* ignore datallowconn? 
*/ + NULL); /* no out_dbname */ + + /* it had better not gotten out of "init" mode yet */ + if (!IsInitProcessingMode()) + ereport(ERROR, + (errmsg("invalid processing mode in background worker"))); + SetProcessingMode(NormalProcessing); +} + +/* + * Block/unblock signals in a background worker + */ +void +BackgroundWorkerBlockSignals(void) +{ + sigprocmask(SIG_SETMASK, &BlockSig, NULL); +} + +void +BackgroundWorkerUnblockSignals(void) +{ + sigprocmask(SIG_SETMASK, &UnBlockSig, NULL); +} + +#ifdef EXEC_BACKEND +static pid_t +bgworker_forkexec(int shmem_slot) +{ + char *av[10]; + int ac = 0; + char forkav[MAXPGPATH]; + + snprintf(forkav, MAXPGPATH, "--forkbgworker=%d", shmem_slot); + + av[ac++] = "postgres"; + av[ac++] = forkav; + av[ac++] = NULL; /* filled in by postmaster_forkexec */ + av[ac] = NULL; + + Assert(ac < lengthof(av)); + + return postmaster_forkexec(ac, av); +} +#endif + +/* + * Start a new bgworker. + * Starting time conditions must have been checked already. + * + * Returns true on success, false on failure. + * In either case, update the RegisteredBgWorker's state appropriately. + * + * This code is heavily based on autovacuum.c, q.v. + */ +static bool +do_start_bgworker(RegisteredBgWorker *rw) +{ + pid_t worker_pid; + + Assert(rw->rw_pid == 0); + + /* + * Allocate and assign the Backend element. Note we must do this before + * forking, so that we can handle failures (out of memory or child-process + * slots) cleanly. + * + * Treat failure as though the worker had crashed. That way, the + * postmaster will wait a bit before attempting to start it again; if we + * tried again right away, most likely we'd find ourselves hitting the + * same resource-exhaustion condition. 
+ */ + if (!assign_backendlist_entry(rw)) + { + rw->rw_crashed_at = GetCurrentTimestamp(); + return false; + } + + ereport(DEBUG1, + (errmsg_internal("starting background worker process \"%s\"", + rw->rw_worker.bgw_name))); + +#ifdef EXEC_BACKEND + switch ((worker_pid = bgworker_forkexec(rw->rw_shmem_slot))) +#else + switch ((worker_pid = fork_process())) +#endif + { + case -1: + /* in postmaster, fork failed ... */ + ereport(LOG, + (errmsg("could not fork worker process: %m"))); + /* undo what assign_backendlist_entry did */ + ReleasePostmasterChildSlot(rw->rw_child_slot); + rw->rw_child_slot = 0; + free(rw->rw_backend); + rw->rw_backend = NULL; + /* mark entry as crashed, so we'll try again later */ + rw->rw_crashed_at = GetCurrentTimestamp(); + break; + +#ifndef EXEC_BACKEND + case 0: + /* in postmaster child ... */ + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(false); + + /* + * Before blowing away PostmasterContext, save this bgworker's + * data where it can find it. + */ + MyBgworkerEntry = (BackgroundWorker *) + MemoryContextAlloc(TopMemoryContext, sizeof(BackgroundWorker)); + memcpy(MyBgworkerEntry, &rw->rw_worker, sizeof(BackgroundWorker)); + + /* Release postmaster's working memory context */ + MemoryContextSwitchTo(TopMemoryContext); + MemoryContextDelete(PostmasterContext); + PostmasterContext = NULL; + + StartBackgroundWorker(); + + exit(1); /* should not get here */ + break; +#endif + default: + /* in postmaster, fork successful ... */ + rw->rw_pid = worker_pid; + rw->rw_backend->pid = rw->rw_pid; + ReportBackgroundWorkerPID(rw); + /* add new worker to lists of backends */ + dlist_push_head(&BackendList, &rw->rw_backend->elem); +#ifdef EXEC_BACKEND + ShmemBackendArrayAdd(rw->rw_backend); +#endif + return true; + } + + return false; +} + +/* + * Does the current postmaster state require starting a worker with the + * specified start_time? 
+ */ +static bool +bgworker_should_start_now(BgWorkerStartTime start_time) +{ + switch (pmState) + { + case PM_NO_CHILDREN: + case PM_WAIT_DEAD_END: + case PM_SHUTDOWN_2: + case PM_SHUTDOWN: + case PM_WAIT_BACKENDS: + case PM_STOP_BACKENDS: + break; + + case PM_RUN: + if (start_time == BgWorkerStart_RecoveryFinished) + return true; + /* fall through */ + + case PM_HOT_STANDBY: + if (start_time == BgWorkerStart_ConsistentState) + return true; + /* fall through */ + + case PM_RECOVERY: + case PM_STARTUP: + case PM_INIT: + if (start_time == BgWorkerStart_PostmasterStart) + return true; + /* fall through */ + } + + return false; +} + +/* + * Allocate the Backend struct for a connected background worker, but don't + * add it to the list of backends just yet. + * + * On failure, return false without changing any worker state. + * + * Some info from the Backend is copied into the passed rw. + */ +static bool +assign_backendlist_entry(RegisteredBgWorker *rw) +{ + Backend *bn; + + /* + * Check that database state allows another connection. Currently the + * only possible failure is CAC_TOOMANY, so we just log an error message + * based on that rather than checking the error code precisely. + */ + if (canAcceptConnections(BACKEND_TYPE_BGWORKER) != CAC_OK) + { + ereport(LOG, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("no slot available for new worker process"))); + return false; + } + + /* + * Compute the cancel key that will be assigned to this session. We + * probably don't need cancel keys for background workers, but we'd better + * have something random in the field to prevent unfriendly people from + * sending cancels to them. 
+ */ + if (!RandomCancelKey(&MyCancelKey)) + { + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not generate random cancel key"))); + return false; + } + + bn = malloc(sizeof(Backend)); + if (bn == NULL) + { + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + return false; + } + + bn->cancel_key = MyCancelKey; + bn->child_slot = MyPMChildSlot = AssignPostmasterChildSlot(); + bn->bkend_type = BACKEND_TYPE_BGWORKER; + bn->dead_end = false; + bn->bgworker_notify = false; + + rw->rw_backend = bn; + rw->rw_child_slot = bn->child_slot; + + return true; +} + +/* + * If the time is right, start background worker(s). + * + * As a side effect, the bgworker control variables are set or reset + * depending on whether more workers may need to be started. + * + * We limit the number of workers started per call, to avoid consuming the + * postmaster's attention for too long when many such requests are pending. + * As long as StartWorkerNeeded is true, ServerLoop will not block and will + * call this function again after dealing with any other issues. + */ +static void +maybe_start_bgworkers(void) +{ +#define MAX_BGWORKERS_TO_LAUNCH 100 + int num_launched = 0; + TimestampTz now = 0; + slist_mutable_iter iter; + + /* + * During crash recovery, we have no need to be called until the state + * transition out of recovery. 
+ */ + if (FatalError) + { + StartWorkerNeeded = false; + HaveCrashedWorker = false; + return; + } + + /* Don't need to be called again unless we find a reason for it below */ + StartWorkerNeeded = false; + HaveCrashedWorker = false; + + slist_foreach_modify(iter, &BackgroundWorkerList) + { + RegisteredBgWorker *rw; + + rw = slist_container(RegisteredBgWorker, rw_lnode, iter.cur); + + /* ignore if already running */ + if (rw->rw_pid != 0) + continue; + + /* if marked for death, clean up and remove from list */ + if (rw->rw_terminate) + { + ForgetBackgroundWorker(&iter); + continue; + } + + /* + * If this worker has crashed previously, maybe it needs to be + * restarted (unless on registration it specified it doesn't want to + * be restarted at all). Check how long ago did a crash last happen. + * If the last crash is too recent, don't start it right away; let it + * be restarted once enough time has passed. + */ + if (rw->rw_crashed_at != 0) + { + if (rw->rw_worker.bgw_restart_time == BGW_NEVER_RESTART) + { + int notify_pid; + + notify_pid = rw->rw_worker.bgw_notify_pid; + + ForgetBackgroundWorker(&iter); + + /* Report worker is gone now. */ + if (notify_pid != 0) + kill(notify_pid, SIGUSR1); + + continue; + } + + /* read system time only when needed */ + if (now == 0) + now = GetCurrentTimestamp(); + + if (!TimestampDifferenceExceeds(rw->rw_crashed_at, now, + rw->rw_worker.bgw_restart_time * 1000)) + { + /* Set flag to remember that we have workers to start later */ + HaveCrashedWorker = true; + continue; + } + } + + if (bgworker_should_start_now(rw->rw_worker.bgw_start_time)) + { + /* reset crash time before trying to start worker */ + rw->rw_crashed_at = 0; + + /* + * Try to start the worker. + * + * On failure, give up processing workers for now, but set + * StartWorkerNeeded so we'll come back here on the next iteration + * of ServerLoop to try again. (We don't want to wait, because + * there might be additional ready-to-run workers.) 
We could set + * HaveCrashedWorker as well, since this worker is now marked + * crashed, but there's no need because the next run of this + * function will do that. + */ + if (!do_start_bgworker(rw)) + { + StartWorkerNeeded = true; + return; + } + + /* + * If we've launched as many workers as allowed, quit, but have + * ServerLoop call us again to look for additional ready-to-run + * workers. There might not be any, but we'll find out the next + * time we run. + */ + if (++num_launched >= MAX_BGWORKERS_TO_LAUNCH) + { + StartWorkerNeeded = true; + return; + } + } + } +} + +/* + * When a backend asks to be notified about worker state changes, we + * set a flag in its backend entry. The background worker machinery needs + * to know when such backends exit. + */ +bool +PostmasterMarkPIDForWorkerNotify(int pid) +{ + dlist_iter iter; + Backend *bp; + + dlist_foreach(iter, &BackendList) + { + bp = dlist_container(Backend, elem, iter.cur); + if (bp->pid == pid) + { + bp->bgworker_notify = true; + return true; + } + } + return false; +} + +#ifdef EXEC_BACKEND + +/* + * The following need to be available to the save/restore_backend_variables + * functions. They are marked NON_EXEC_STATIC in their home modules. 
+ */ +extern slock_t *ShmemLock; +extern slock_t *ProcStructLock; +extern PGPROC *AuxiliaryProcs; +extern PMSignalData *PMSignalState; +extern pg_time_t first_syslogger_file_time; + +#ifndef WIN32 +#define write_inheritable_socket(dest, src, childpid) ((*(dest) = (src)), true) +#define read_inheritable_socket(dest, src) (*(dest) = *(src)) +#else +static bool write_duplicated_handle(HANDLE *dest, HANDLE src, HANDLE child); +static bool write_inheritable_socket(InheritableSocket *dest, SOCKET src, + pid_t childPid); +static void read_inheritable_socket(SOCKET *dest, InheritableSocket *src); +#endif + + +/* Save critical backend variables into the BackendParameters struct */ +#ifndef WIN32 +static bool +save_backend_variables(BackendParameters *param, Port *port) +#else +static bool +save_backend_variables(BackendParameters *param, Port *port, + HANDLE childProcess, pid_t childPid) +#endif +{ + memcpy(&param->port, port, sizeof(Port)); + if (!write_inheritable_socket(&param->portsocket, port->sock, childPid)) + return false; + + strlcpy(param->DataDir, DataDir, MAXPGPATH); + + memcpy(&param->ListenSocket, &ListenSocket, sizeof(ListenSocket)); + + param->MyCancelKey = MyCancelKey; + param->MyPMChildSlot = MyPMChildSlot; + +#ifdef WIN32 + param->ShmemProtectiveRegion = ShmemProtectiveRegion; +#endif + param->UsedShmemSegID = UsedShmemSegID; + param->UsedShmemSegAddr = UsedShmemSegAddr; + + param->ShmemLock = ShmemLock; + param->ShmemVariableCache = ShmemVariableCache; + param->ShmemBackendArray = ShmemBackendArray; + +#ifndef HAVE_SPINLOCKS + param->SpinlockSemaArray = SpinlockSemaArray; +#endif + param->NamedLWLockTrancheRequests = NamedLWLockTrancheRequests; + param->NamedLWLockTrancheArray = NamedLWLockTrancheArray; + param->MainLWLockArray = MainLWLockArray; + param->ProcStructLock = ProcStructLock; + param->ProcGlobal = ProcGlobal; + param->AuxiliaryProcs = AuxiliaryProcs; + param->PreparedXactProcs = PreparedXactProcs; + param->PMSignalState = PMSignalState; + + 
param->PostmasterPid = PostmasterPid; + param->PgStartTime = PgStartTime; + param->PgReloadTime = PgReloadTime; + param->first_syslogger_file_time = first_syslogger_file_time; + + param->redirection_done = redirection_done; + param->IsBinaryUpgrade = IsBinaryUpgrade; + param->query_id_enabled = query_id_enabled; + param->max_safe_fds = max_safe_fds; + + param->MaxBackends = MaxBackends; + +#ifdef WIN32 + param->PostmasterHandle = PostmasterHandle; + if (!write_duplicated_handle(&param->initial_signal_pipe, + pgwin32_create_signal_listener(childPid), + childProcess)) + return false; +#else + memcpy(&param->postmaster_alive_fds, &postmaster_alive_fds, + sizeof(postmaster_alive_fds)); +#endif + + memcpy(&param->syslogPipe, &syslogPipe, sizeof(syslogPipe)); + + strlcpy(param->my_exec_path, my_exec_path, MAXPGPATH); + + strlcpy(param->pkglib_path, pkglib_path, MAXPGPATH); + + return true; +} + + +#ifdef WIN32 +/* + * Duplicate a handle for usage in a child process, and write the child + * process instance of the handle to the parameter file. + */ +static bool +write_duplicated_handle(HANDLE *dest, HANDLE src, HANDLE childProcess) +{ + HANDLE hChild = INVALID_HANDLE_VALUE; + + if (!DuplicateHandle(GetCurrentProcess(), + src, + childProcess, + &hChild, + 0, + TRUE, + DUPLICATE_CLOSE_SOURCE | DUPLICATE_SAME_ACCESS)) + { + ereport(LOG, + (errmsg_internal("could not duplicate handle to be written to backend parameter file: error code %lu", + GetLastError()))); + return false; + } + + *dest = hChild; + return true; +} + +/* + * Duplicate a socket for usage in a child process, and write the resulting + * structure to the parameter file. + * This is required because a number of LSPs (Layered Service Providers) very + * common on Windows (antivirus, firewalls, download managers etc) break + * straight socket inheritance. 
+ */ +static bool +write_inheritable_socket(InheritableSocket *dest, SOCKET src, pid_t childpid) +{ + dest->origsocket = src; + if (src != 0 && src != PGINVALID_SOCKET) + { + /* Actual socket */ + if (WSADuplicateSocket(src, childpid, &dest->wsainfo) != 0) + { + ereport(LOG, + (errmsg("could not duplicate socket %d for use in backend: error code %d", + (int) src, WSAGetLastError()))); + return false; + } + } + return true; +} + +/* + * Read a duplicate socket structure back, and get the socket descriptor. + */ +static void +read_inheritable_socket(SOCKET *dest, InheritableSocket *src) +{ + SOCKET s; + + if (src->origsocket == PGINVALID_SOCKET || src->origsocket == 0) + { + /* Not a real socket! */ + *dest = src->origsocket; + } + else + { + /* Actual socket, so create from structure */ + s = WSASocket(FROM_PROTOCOL_INFO, + FROM_PROTOCOL_INFO, + FROM_PROTOCOL_INFO, + &src->wsainfo, + 0, + 0); + if (s == INVALID_SOCKET) + { + write_stderr("could not create inherited socket: error code %d\n", + WSAGetLastError()); + exit(1); + } + *dest = s; + + /* + * To make sure we don't get two references to the same socket, close + * the original one. (This would happen when inheritance actually + * works.. 
+ */ + closesocket(src->origsocket); + } +} +#endif + +static void +read_backend_variables(char *id, Port *port) +{ + BackendParameters param; + +#ifndef WIN32 + /* Non-win32 implementation reads from file */ + FILE *fp; + + /* Open file */ + fp = AllocateFile(id, PG_BINARY_R); + if (!fp) + { + write_stderr("could not open backend variables file \"%s\": %s\n", + id, strerror(errno)); + exit(1); + } + + if (fread(&param, sizeof(param), 1, fp) != 1) + { + write_stderr("could not read from backend variables file \"%s\": %s\n", + id, strerror(errno)); + exit(1); + } + + /* Release file */ + FreeFile(fp); + if (unlink(id) != 0) + { + write_stderr("could not remove file \"%s\": %s\n", + id, strerror(errno)); + exit(1); + } +#else + /* Win32 version uses mapped file */ + HANDLE paramHandle; + BackendParameters *paramp; + +#ifdef _WIN64 + paramHandle = (HANDLE) _atoi64(id); +#else + paramHandle = (HANDLE) atol(id); +#endif + paramp = MapViewOfFile(paramHandle, FILE_MAP_READ, 0, 0, 0); + if (!paramp) + { + write_stderr("could not map view of backend variables: error code %lu\n", + GetLastError()); + exit(1); + } + + memcpy(&param, paramp, sizeof(BackendParameters)); + + if (!UnmapViewOfFile(paramp)) + { + write_stderr("could not unmap view of backend variables: error code %lu\n", + GetLastError()); + exit(1); + } + + if (!CloseHandle(paramHandle)) + { + write_stderr("could not close handle to backend parameter variables: error code %lu\n", + GetLastError()); + exit(1); + } +#endif + + restore_backend_variables(&param, port); +} + +/* Restore critical backend variables from the BackendParameters struct */ +static void +restore_backend_variables(BackendParameters *param, Port *port) +{ + memcpy(port, &param->port, sizeof(Port)); + read_inheritable_socket(&port->sock, &param->portsocket); + + SetDataDir(param->DataDir); + + memcpy(&ListenSocket, &param->ListenSocket, sizeof(ListenSocket)); + + MyCancelKey = param->MyCancelKey; + MyPMChildSlot = param->MyPMChildSlot; + +#ifdef WIN32 + 
ShmemProtectiveRegion = param->ShmemProtectiveRegion; +#endif + UsedShmemSegID = param->UsedShmemSegID; + UsedShmemSegAddr = param->UsedShmemSegAddr; + + ShmemLock = param->ShmemLock; + ShmemVariableCache = param->ShmemVariableCache; + ShmemBackendArray = param->ShmemBackendArray; + +#ifndef HAVE_SPINLOCKS + SpinlockSemaArray = param->SpinlockSemaArray; +#endif + NamedLWLockTrancheRequests = param->NamedLWLockTrancheRequests; + NamedLWLockTrancheArray = param->NamedLWLockTrancheArray; + MainLWLockArray = param->MainLWLockArray; + ProcStructLock = param->ProcStructLock; + ProcGlobal = param->ProcGlobal; + AuxiliaryProcs = param->AuxiliaryProcs; + PreparedXactProcs = param->PreparedXactProcs; + PMSignalState = param->PMSignalState; + + PostmasterPid = param->PostmasterPid; + PgStartTime = param->PgStartTime; + PgReloadTime = param->PgReloadTime; + first_syslogger_file_time = param->first_syslogger_file_time; + + redirection_done = param->redirection_done; + IsBinaryUpgrade = param->IsBinaryUpgrade; + query_id_enabled = param->query_id_enabled; + max_safe_fds = param->max_safe_fds; + + MaxBackends = param->MaxBackends; + +#ifdef WIN32 + PostmasterHandle = param->PostmasterHandle; + pgwin32_initial_signal_pipe = param->initial_signal_pipe; +#else + memcpy(&postmaster_alive_fds, &param->postmaster_alive_fds, + sizeof(postmaster_alive_fds)); +#endif + + memcpy(&syslogPipe, &param->syslogPipe, sizeof(syslogPipe)); + + strlcpy(my_exec_path, param->my_exec_path, MAXPGPATH); + + strlcpy(pkglib_path, param->pkglib_path, MAXPGPATH); + + /* + * We need to restore fd.c's counts of externally-opened FDs; to avoid + * confusion, be sure to do this after restoring max_safe_fds. (Note: + * BackendInitialize will handle this for port->sock.) 
+ */ +#ifndef WIN32 + if (postmaster_alive_fds[0] >= 0) + ReserveExternalFD(); + if (postmaster_alive_fds[1] >= 0) + ReserveExternalFD(); +#endif +} + + +Size +ShmemBackendArraySize(void) +{ + return mul_size(MaxLivePostmasterChildren(), sizeof(Backend)); +} + +void +ShmemBackendArrayAllocation(void) +{ + Size size = ShmemBackendArraySize(); + + ShmemBackendArray = (Backend *) ShmemAlloc(size); + /* Mark all slots as empty */ + memset(ShmemBackendArray, 0, size); +} + +static void +ShmemBackendArrayAdd(Backend *bn) +{ + /* The array slot corresponding to my PMChildSlot should be free */ + int i = bn->child_slot - 1; + + Assert(ShmemBackendArray[i].pid == 0); + ShmemBackendArray[i] = *bn; +} + +static void +ShmemBackendArrayRemove(Backend *bn) +{ + int i = bn->child_slot - 1; + + Assert(ShmemBackendArray[i].pid == bn->pid); + /* Mark the slot as empty */ + ShmemBackendArray[i].pid = 0; +} +#endif /* EXEC_BACKEND */ + + +#ifdef WIN32 + +/* + * Subset implementation of waitpid() for Windows. We assume pid is -1 + * (that is, check all child processes) and options is WNOHANG (don't wait). + */ +static pid_t +waitpid(pid_t pid, int *exitstatus, int options) +{ + win32_deadchild_waitinfo *childinfo; + DWORD exitcode; + DWORD dwd; + ULONG_PTR key; + OVERLAPPED *ovl; + + /* Try to consume one win32_deadchild_waitinfo from the queue. */ + if (!GetQueuedCompletionStatus(win32ChildQueue, &dwd, &key, &ovl, 0)) + { + errno = EAGAIN; + return -1; + } + + childinfo = (win32_deadchild_waitinfo *) key; + pid = childinfo->procId; + + /* + * Remove handle from wait - required even though it's set to wait only + * once + */ + UnregisterWaitEx(childinfo->waitHandle, NULL); + + if (!GetExitCodeProcess(childinfo->procHandle, &exitcode)) + { + /* + * Should never happen. Inform user and set a fixed exitcode. + */ + write_stderr("could not read exit code for process\n"); + exitcode = 255; + } + *exitstatus = exitcode; + + /* + * Close the process handle. 
Only after this point can the PID be + * recycled by the kernel. + */ + CloseHandle(childinfo->procHandle); + + /* + * Free struct that was allocated before the call to + * RegisterWaitForSingleObject() + */ + pfree(childinfo); + + return pid; +} + +/* + * Note! Code below executes on a thread pool! All operations must + * be thread safe! Note that elog() and friends must *not* be used. + */ +static void WINAPI +pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired) +{ + /* Should never happen, since we use INFINITE as timeout value. */ + if (TimerOrWaitFired) + return; + + /* + * Post the win32_deadchild_waitinfo object for waitpid() to deal with. If + * that fails, we leak the object, but we also leak a whole process and + * get into an unrecoverable state, so there's not much point in worrying + * about that. We'd like to panic, but we can't use that infrastructure + * from this thread. + */ + if (!PostQueuedCompletionStatus(win32ChildQueue, + 0, + (ULONG_PTR) lpParameter, + NULL)) + write_stderr("could not post child completion status\n"); + + /* Queue SIGCHLD signal. */ + pg_queue_signal(SIGCHLD); +} +#endif /* WIN32 */ + +/* + * Initialize one and only handle for monitoring postmaster death. + * + * Called once in the postmaster, so that child processes can subsequently + * monitor if their parent is dead. + */ +static void +InitPostmasterDeathWatchHandle(void) +{ +#ifndef WIN32 + + /* + * Create a pipe. Postmaster holds the write end of the pipe open + * (POSTMASTER_FD_OWN), and children hold the read end. Children can pass + * the read file descriptor to select() to wake up in case postmaster + * dies, or check for postmaster death with a (read() == 0). Children must + * close the write end as soon as possible after forking, because EOF + * won't be signaled in the read end until all processes have closed the + * write fd. That is taken care of in ClosePostmasterPorts(). 
+ */ + Assert(MyProcPid == PostmasterPid); + if (pipe(postmaster_alive_fds) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg_internal("could not create pipe to monitor postmaster death: %m"))); + + /* Notify fd.c that we've eaten two FDs for the pipe. */ + ReserveExternalFD(); + ReserveExternalFD(); + + /* + * Set O_NONBLOCK to allow testing for the fd's presence with a read() + * call. + */ + if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) + ereport(FATAL, + (errcode_for_socket_access(), + errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); +#else + + /* + * On Windows, we use a process handle for the same purpose. + */ + if (DuplicateHandle(GetCurrentProcess(), + GetCurrentProcess(), + GetCurrentProcess(), + &PostmasterHandle, + 0, + TRUE, + DUPLICATE_SAME_ACCESS) == 0) + ereport(FATAL, + (errmsg_internal("could not duplicate postmaster handle: error code %lu", + GetLastError()))); +#endif /* WIN32 */ +} diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c new file mode 100644 index 0000000..0e7de26 --- /dev/null +++ b/src/backend/postmaster/startup.c @@ -0,0 +1,402 @@ +/*------------------------------------------------------------------------- + * + * startup.c + * + * The Startup process initialises the server and performs any recovery + * actions that have been specified. Notice that there is no "main loop" + * since the Startup process ends as soon as initialisation is complete. + * (in standby mode, one can think of the replay loop as a main loop, + * though.) 
+ * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/postmaster/startup.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <unistd.h> + +#include "access/xlog.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/interrupt.h" +#include "postmaster/startup.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/pmsignal.h" +#include "storage/procsignal.h" +#include "storage/standby.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/timeout.h" + + +#ifndef USE_POSTMASTER_DEATH_SIGNAL +/* + * On systems that need to make a system call to find out if the postmaster has + * gone away, we'll do so only every Nth call to HandleStartupProcInterrupts(). + * This only affects how long it takes us to detect the condition while we're + * busy replaying WAL. Latch waits and similar should react immediately + * through the usual techniques. + */ +#define POSTMASTER_POLL_RATE_LIMIT 1024 +#endif + +/* + * Flags set by interrupt handlers for later service in the redo loop. + */ +static volatile sig_atomic_t got_SIGHUP = false; +static volatile sig_atomic_t shutdown_requested = false; +static volatile sig_atomic_t promote_signaled = false; + +/* + * Flag set when executing a restore command, to tell SIGTERM signal handler + * that it's safe to just proc_exit. + */ +static volatile sig_atomic_t in_restore_command = false; + +/* + * Time at which the most recent startup operation started. + */ +static TimestampTz startup_progress_phase_start_time; + +/* + * Indicates whether the startup progress interval mentioned by the user is + * elapsed or not. TRUE if timeout occurred, FALSE otherwise. 
+ */ +static volatile sig_atomic_t startup_progress_timer_expired = false; + +/* + * Time between progress updates for long-running startup operations. + */ +int log_startup_progress_interval = 10000; /* 10 sec */ + +/* Signal handlers */ +static void StartupProcTriggerHandler(SIGNAL_ARGS); +static void StartupProcSigHupHandler(SIGNAL_ARGS); + +/* Callbacks */ +static void StartupProcExit(int code, Datum arg); + + +/* -------------------------------- + * signal handler routines + * -------------------------------- + */ + +/* SIGUSR2: set flag to finish recovery */ +static void +StartupProcTriggerHandler(SIGNAL_ARGS) +{ + int save_errno = errno; + + promote_signaled = true; + WakeupRecovery(); + + errno = save_errno; +} + +/* SIGHUP: set flag to re-read config file at next convenient time */ +static void +StartupProcSigHupHandler(SIGNAL_ARGS) +{ + int save_errno = errno; + + got_SIGHUP = true; + WakeupRecovery(); + + errno = save_errno; +} + +/* SIGTERM: set flag to abort redo and exit */ +static void +StartupProcShutdownHandler(SIGNAL_ARGS) +{ + int save_errno = errno; + + if (in_restore_command) + { + /* + * If we are in a child process (e.g., forked by system() in + * RestoreArchivedFile()), we don't want to call any exit callbacks. + * The parent will take care of that. + */ + if (MyProcPid == (int) getpid()) + proc_exit(1); + else + { + write_stderr_signal_safe("StartupProcShutdownHandler() called in child process\n"); + _exit(1); + } + } + else + shutdown_requested = true; + WakeupRecovery(); + + errno = save_errno; +} + +/* + * Re-read the config file. + * + * If one of the critical walreceiver options has changed, flag xlog.c + * to restart it. 
+ */ +static void +StartupRereadConfig(void) +{ + char *conninfo = pstrdup(PrimaryConnInfo); + char *slotname = pstrdup(PrimarySlotName); + bool tempSlot = wal_receiver_create_temp_slot; + bool conninfoChanged; + bool slotnameChanged; + bool tempSlotChanged = false; + + ProcessConfigFile(PGC_SIGHUP); + + conninfoChanged = strcmp(conninfo, PrimaryConnInfo) != 0; + slotnameChanged = strcmp(slotname, PrimarySlotName) != 0; + + /* + * wal_receiver_create_temp_slot is used only when we have no slot + * configured. We do not need to track this change if it has no effect. + */ + if (!slotnameChanged && strcmp(PrimarySlotName, "") == 0) + tempSlotChanged = tempSlot != wal_receiver_create_temp_slot; + pfree(conninfo); + pfree(slotname); + + if (conninfoChanged || slotnameChanged || tempSlotChanged) + StartupRequestWalReceiverRestart(); +} + +/* Handle various signals that might be sent to the startup process */ +void +HandleStartupProcInterrupts(void) +{ +#ifdef POSTMASTER_POLL_RATE_LIMIT + static uint32 postmaster_poll_count = 0; +#endif + + /* + * Process any requests or signals received recently. + */ + if (got_SIGHUP) + { + got_SIGHUP = false; + StartupRereadConfig(); + } + + /* + * Check if we were requested to exit without finishing recovery. + */ + if (shutdown_requested) + proc_exit(1); + + /* + * Emergency bailout if postmaster has died. This is to avoid the + * necessity for manual cleanup of all postmaster children. Do this less + * frequently on systems for which we don't have signals to make that + * cheap. 
+ */ + if (IsUnderPostmaster && +#ifdef POSTMASTER_POLL_RATE_LIMIT + postmaster_poll_count++ % POSTMASTER_POLL_RATE_LIMIT == 0 && +#endif + !PostmasterIsAlive()) + exit(1); + + /* Process barrier events */ + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); + + /* Perform logging of memory contexts of this process */ + if (LogMemoryContextPending) + ProcessLogMemoryContextInterrupt(); +} + + +/* -------------------------------- + * signal handler routines + * -------------------------------- + */ +static void +StartupProcExit(int code, Datum arg) +{ + /* Shutdown the recovery environment */ + if (standbyState != STANDBY_DISABLED) + ShutdownRecoveryTransactionEnvironment(); +} + + +/* ---------------------------------- + * Startup Process main entry point + * ---------------------------------- + */ +void +StartupProcessMain(void) +{ + /* Arrange to clean up at startup process exit */ + on_shmem_exit(StartupProcExit, 0); + + /* + * Properly accept or ignore signals the postmaster might send us. + */ + pqsignal(SIGHUP, StartupProcSigHupHandler); /* reload config file */ + pqsignal(SIGINT, SIG_IGN); /* ignore query cancel */ + pqsignal(SIGTERM, StartupProcShutdownHandler); /* request shutdown */ + /* SIGQUIT handler was already set up by InitPostmasterChild */ + InitializeTimeouts(); /* establishes SIGALRM handler */ + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, StartupProcTriggerHandler); + + /* + * Reset some signals that are accepted by postmaster but not here + */ + pqsignal(SIGCHLD, SIG_DFL); + + /* + * Register timeouts needed for standby mode + */ + RegisterTimeout(STANDBY_DEADLOCK_TIMEOUT, StandbyDeadLockHandler); + RegisterTimeout(STANDBY_TIMEOUT, StandbyTimeoutHandler); + RegisterTimeout(STANDBY_LOCK_TIMEOUT, StandbyLockTimeoutHandler); + + /* + * Unblock signals (they were blocked when the postmaster forked us) + */ + sigprocmask(SIG_SETMASK, &UnBlockSig, NULL); + + /* + * Do what we came for. 
+ */ + StartupXLOG(); + + /* + * Exit normally. Exit code 0 tells postmaster that we completed recovery + * successfully. + */ + proc_exit(0); +} + +void +PreRestoreCommand(void) +{ + /* + * Set in_restore_command to tell the signal handler that we should exit + * right away on SIGTERM. We know that we're at a safe point to do that. + * Check if we had already received the signal, so that we don't miss a + * shutdown request received just before this. + */ + in_restore_command = true; + if (shutdown_requested) + proc_exit(1); +} + +void +PostRestoreCommand(void) +{ + in_restore_command = false; +} + +bool +IsPromoteSignaled(void) +{ + return promote_signaled; +} + +void +ResetPromoteSignaled(void) +{ + promote_signaled = false; +} + +/* + * Set a flag indicating that it's time to log a progress report. + */ +void +startup_progress_timeout_handler(void) +{ + startup_progress_timer_expired = true; +} + +void +disable_startup_progress_timeout(void) +{ + /* Feature is disabled. */ + if (log_startup_progress_interval == 0) + return; + + disable_timeout(STARTUP_PROGRESS_TIMEOUT, false); + startup_progress_timer_expired = false; +} + +/* + * Set the start timestamp of the current operation and enable the timeout. + */ +void +enable_startup_progress_timeout(void) +{ + TimestampTz fin_time; + + /* Feature is disabled. */ + if (log_startup_progress_interval == 0) + return; + + startup_progress_phase_start_time = GetCurrentTimestamp(); + fin_time = TimestampTzPlusMilliseconds(startup_progress_phase_start_time, + log_startup_progress_interval); + enable_timeout_every(STARTUP_PROGRESS_TIMEOUT, fin_time, + log_startup_progress_interval); +} + +/* + * A thin wrapper to first disable and then enable the startup progress + * timeout. + */ +void +begin_startup_progress_phase(void) +{ + /* Feature is disabled. 
*/ + if (log_startup_progress_interval == 0) + return; + + disable_startup_progress_timeout(); + enable_startup_progress_timeout(); +} + +/* + * Report whether startup progress timeout has occurred. Reset the timer flag + * if it did, set the elapsed time to the out parameters and return true, + * otherwise return false. + */ +bool +has_startup_progress_timeout_expired(long *secs, int *usecs) +{ + long seconds; + int useconds; + TimestampTz now; + + /* No timeout has occurred. */ + if (!startup_progress_timer_expired) + return false; + + /* Calculate the elapsed time. */ + now = GetCurrentTimestamp(); + TimestampDifference(startup_progress_phase_start_time, now, &seconds, &useconds); + + *secs = seconds; + *usecs = useconds; + startup_progress_timer_expired = false; + + return true; +} diff --git a/src/backend/postmaster/syslogger.c b/src/backend/postmaster/syslogger.c new file mode 100644 index 0000000..858a2f6 --- /dev/null +++ b/src/backend/postmaster/syslogger.c @@ -0,0 +1,1651 @@ +/*------------------------------------------------------------------------- + * + * syslogger.c + * + * The system logger (syslogger) appeared in Postgres 8.0. It catches all + * stderr output from the postmaster, backends, and other subprocesses + * by redirecting to a pipe, and writes it to a set of logfiles. + * It's possible to have size and age limits for the logfile configured + * in postgresql.conf. If these limits are reached or passed, the + * current logfile is closed and a new one is created (rotated). + * The logfiles are stored in a subdirectory (configurable in + * postgresql.conf), using a user-selectable naming scheme. 
+ * + * Author: Andreas Pflug <pgadmin@pse-consulting.de> + * + * Copyright (c) 2004-2023, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/postmaster/syslogger.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <fcntl.h> +#include <limits.h> +#include <signal.h> +#include <time.h> +#include <unistd.h> +#include <sys/stat.h> +#include <sys/time.h> + +#include "common/file_perm.h" +#include "lib/stringinfo.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "nodes/pg_list.h" +#include "pgstat.h" +#include "pgtime.h" +#include "port/pg_bitutils.h" +#include "postmaster/fork_process.h" +#include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" +#include "postmaster/syslogger.h" +#include "storage/dsm.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/pg_shmem.h" +#include "tcop/tcopprot.h" +#include "utils/guc.h" +#include "utils/ps_status.h" +#include "utils/timestamp.h" + +/* + * We read() into a temp buffer twice as big as a chunk, so that any fragment + * left after processing can be moved down to the front and we'll still have + * room to read a full chunk. + */ +#define READ_BUF_SIZE (2 * PIPE_CHUNK_SIZE) + +/* Log rotation signal file path, relative to $PGDATA */ +#define LOGROTATE_SIGNAL_FILE "logrotate" + + +/* + * GUC parameters. Logging_collector cannot be changed after postmaster + * start, but the rest can change at SIGHUP. 
+ */ +bool Logging_collector = false; +int Log_RotationAge = HOURS_PER_DAY * MINS_PER_HOUR; +int Log_RotationSize = 10 * 1024; +char *Log_directory = NULL; +char *Log_filename = NULL; +bool Log_truncate_on_rotation = false; +int Log_file_mode = S_IRUSR | S_IWUSR; + +extern bool redirection_done; + +/* + * Private state + */ +static pg_time_t next_rotation_time; +static bool pipe_eof_seen = false; +static bool rotation_disabled = false; +static FILE *syslogFile = NULL; +static FILE *csvlogFile = NULL; +static FILE *jsonlogFile = NULL; +NON_EXEC_STATIC pg_time_t first_syslogger_file_time = 0; +static char *last_sys_file_name = NULL; +static char *last_csv_file_name = NULL; +static char *last_json_file_name = NULL; + +/* + * Buffers for saving partial messages from different backends. + * + * Keep NBUFFER_LISTS lists of these, with the entry for a given source pid + * being in the list numbered (pid % NBUFFER_LISTS), so as to cut down on + * the number of entries we have to examine for any one incoming message. + * There must never be more than one entry for the same source pid. + * + * An inactive buffer is not removed from its list, just held for re-use. + * An inactive buffer has pid == 0 and undefined contents of data. + */ +typedef struct +{ + int32 pid; /* PID of source process */ + StringInfoData data; /* accumulated data, as a StringInfo */ +} save_buffer; + +#define NBUFFER_LISTS 256 +static List *buffer_lists[NBUFFER_LISTS]; + +/* These must be exported for EXEC_BACKEND case ... annoying */ +#ifndef WIN32 +int syslogPipe[2] = {-1, -1}; +#else +HANDLE syslogPipe[2] = {0, 0}; +#endif + +#ifdef WIN32 +static HANDLE threadHandle = 0; +static CRITICAL_SECTION sysloggerSection; +#endif + +/* + * Flags set by interrupt handlers for later service in the main loop. 
+ */ +static volatile sig_atomic_t rotation_requested = false; + + +/* Local subroutines */ +#ifdef EXEC_BACKEND +static int syslogger_fdget(FILE *file); +static FILE *syslogger_fdopen(int fd); +static pid_t syslogger_forkexec(void); +static void syslogger_parseArgs(int argc, char *argv[]); +#endif +NON_EXEC_STATIC void SysLoggerMain(int argc, char *argv[]) pg_attribute_noreturn(); +static void process_pipe_input(char *logbuffer, int *bytes_in_logbuffer); +static void flush_pipe_input(char *logbuffer, int *bytes_in_logbuffer); +static FILE *logfile_open(const char *filename, const char *mode, + bool allow_errors); + +#ifdef WIN32 +static unsigned int __stdcall pipeThread(void *arg); +#endif +static void logfile_rotate(bool time_based_rotation, int size_rotation_for); +static bool logfile_rotate_dest(bool time_based_rotation, + int size_rotation_for, pg_time_t fntime, + int target_dest, char **last_file_name, + FILE **logFile); +static char *logfile_getname(pg_time_t timestamp, const char *suffix); +static void set_next_rotation_time(void); +static void sigUsr1Handler(SIGNAL_ARGS); +static void update_metainfo_datafile(void); + + +/* + * Main entry point for syslogger process + * argc/argv parameters are valid only in EXEC_BACKEND case. + */ +NON_EXEC_STATIC void +SysLoggerMain(int argc, char *argv[]) +{ +#ifndef WIN32 + char logbuffer[READ_BUF_SIZE]; + int bytes_in_logbuffer = 0; +#endif + char *currentLogDir; + char *currentLogFilename; + int currentLogRotationAge; + pg_time_t now; + WaitEventSet *wes; + + now = MyStartTime; + +#ifdef EXEC_BACKEND + syslogger_parseArgs(argc, argv); +#endif /* EXEC_BACKEND */ + + MyBackendType = B_LOGGER; + init_ps_display(NULL); + + /* + * If we restarted, our stderr is already redirected into our own input + * pipe. This is of course pretty useless, not to mention that it + * interferes with detecting pipe EOF. Point stderr to /dev/null. 
This + * assumes that all interesting messages generated in the syslogger will + * come through elog.c and will be sent to write_syslogger_file. + */ + if (redirection_done) + { + int fd = open(DEVNULL, O_WRONLY, 0); + + /* + * The closes might look redundant, but they are not: we want to be + * darn sure the pipe gets closed even if the open failed. We can + * survive running with stderr pointing nowhere, but we can't afford + * to have extra pipe input descriptors hanging around. + * + * As we're just trying to reset these to go to DEVNULL, there's not + * much point in checking for failure from the close/dup2 calls here, + * if they fail then presumably the file descriptors are closed and + * any writes will go into the bitbucket anyway. + */ + close(STDOUT_FILENO); + close(STDERR_FILENO); + if (fd != -1) + { + (void) dup2(fd, STDOUT_FILENO); + (void) dup2(fd, STDERR_FILENO); + close(fd); + } + } + + /* + * Syslogger's own stderr can't be the syslogPipe, so set it back to text + * mode if we didn't just close it. (It was set to binary in + * SubPostmasterMain). + */ +#ifdef WIN32 + else + _setmode(STDERR_FILENO, _O_TEXT); +#endif + + /* + * Also close our copy of the write end of the pipe. This is needed to + * ensure we can detect pipe EOF correctly. (But note that in the restart + * case, the postmaster already did this.) + */ +#ifndef WIN32 + if (syslogPipe[1] >= 0) + close(syslogPipe[1]); + syslogPipe[1] = -1; +#else + if (syslogPipe[1]) + CloseHandle(syslogPipe[1]); + syslogPipe[1] = 0; +#endif + + /* + * Properly accept or ignore signals the postmaster might send us + * + * Note: we ignore all termination signals, and instead exit only when all + * upstream processes are gone, to ensure we don't miss any dying gasps of + * broken backends... 
+ */ + + pqsignal(SIGHUP, SignalHandlerForConfigReload); /* set flag to read config + * file */ + pqsignal(SIGINT, SIG_IGN); + pqsignal(SIGTERM, SIG_IGN); + pqsignal(SIGQUIT, SIG_IGN); + pqsignal(SIGALRM, SIG_IGN); + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, sigUsr1Handler); /* request log rotation */ + pqsignal(SIGUSR2, SIG_IGN); + + /* + * Reset some signals that are accepted by postmaster but not here + */ + pqsignal(SIGCHLD, SIG_DFL); + + sigprocmask(SIG_SETMASK, &UnBlockSig, NULL); + +#ifdef WIN32 + /* Fire up separate data transfer thread */ + InitializeCriticalSection(&sysloggerSection); + EnterCriticalSection(&sysloggerSection); + + threadHandle = (HANDLE) _beginthreadex(NULL, 0, pipeThread, NULL, 0, NULL); + if (threadHandle == 0) + elog(FATAL, "could not create syslogger data transfer thread: %m"); +#endif /* WIN32 */ + + /* + * Remember active logfiles' name(s). We recompute 'em from the reference + * time because passing down just the pg_time_t is a lot cheaper than + * passing a whole file path in the EXEC_BACKEND case. + */ + last_sys_file_name = logfile_getname(first_syslogger_file_time, NULL); + if (csvlogFile != NULL) + last_csv_file_name = logfile_getname(first_syslogger_file_time, ".csv"); + if (jsonlogFile != NULL) + last_json_file_name = logfile_getname(first_syslogger_file_time, ".json"); + + /* remember active logfile parameters */ + currentLogDir = pstrdup(Log_directory); + currentLogFilename = pstrdup(Log_filename); + currentLogRotationAge = Log_RotationAge; + /* set next planned rotation time */ + set_next_rotation_time(); + update_metainfo_datafile(); + + /* + * Reset whereToSendOutput, as the postmaster will do (but hasn't yet, at + * the point where we forked). This prevents duplicate output of messages + * from syslogger itself. + */ + whereToSendOutput = DestNone; + + /* + * Set up a reusable WaitEventSet object we'll use to wait for our latch, + * and (except on Windows) our socket. 
+ * + * Unlike all other postmaster child processes, we'll ignore postmaster + * death because we want to collect final log output from all backends and + * then exit last. We'll do that by running until we see EOF on the + * syslog pipe, which implies that all other backends have exited + * (including the postmaster). + */ + wes = CreateWaitEventSet(CurrentMemoryContext, 2); + AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); +#ifndef WIN32 + AddWaitEventToSet(wes, WL_SOCKET_READABLE, syslogPipe[0], NULL, NULL); +#endif + + /* main worker loop */ + for (;;) + { + bool time_based_rotation = false; + int size_rotation_for = 0; + long cur_timeout; + WaitEvent event; + +#ifndef WIN32 + int rc; +#endif + + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + /* + * Process any requests or signals received recently. + */ + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + + /* + * Check if the log directory or filename pattern changed in + * postgresql.conf. If so, force rotation to make sure we're + * writing the logfiles in the right place. + */ + if (strcmp(Log_directory, currentLogDir) != 0) + { + pfree(currentLogDir); + currentLogDir = pstrdup(Log_directory); + rotation_requested = true; + + /* + * Also, create new directory if not present; ignore errors + */ + (void) MakePGDirectory(Log_directory); + } + if (strcmp(Log_filename, currentLogFilename) != 0) + { + pfree(currentLogFilename); + currentLogFilename = pstrdup(Log_filename); + rotation_requested = true; + } + + /* + * Force a rotation if CSVLOG output was just turned on or off and + * we need to open or close csvlogFile accordingly. + */ + if (((Log_destination & LOG_DESTINATION_CSVLOG) != 0) != + (csvlogFile != NULL)) + rotation_requested = true; + + /* + * Force a rotation if JSONLOG output was just turned on or off + * and we need to open or close jsonlogFile accordingly. 
+ */ + if (((Log_destination & LOG_DESTINATION_JSONLOG) != 0) != + (jsonlogFile != NULL)) + rotation_requested = true; + + /* + * If rotation time parameter changed, reset next rotation time, + * but don't immediately force a rotation. + */ + if (currentLogRotationAge != Log_RotationAge) + { + currentLogRotationAge = Log_RotationAge; + set_next_rotation_time(); + } + + /* + * If we had a rotation-disabling failure, re-enable rotation + * attempts after SIGHUP, and force one immediately. + */ + if (rotation_disabled) + { + rotation_disabled = false; + rotation_requested = true; + } + + /* + * Force rewriting last log filename when reloading configuration. + * Even if rotation_requested is false, log_destination may have + * been changed and we don't want to wait the next file rotation. + */ + update_metainfo_datafile(); + } + + if (Log_RotationAge > 0 && !rotation_disabled) + { + /* Do a logfile rotation if it's time */ + now = (pg_time_t) time(NULL); + if (now >= next_rotation_time) + rotation_requested = time_based_rotation = true; + } + + if (!rotation_requested && Log_RotationSize > 0 && !rotation_disabled) + { + /* Do a rotation if file is too big */ + if (ftell(syslogFile) >= Log_RotationSize * 1024L) + { + rotation_requested = true; + size_rotation_for |= LOG_DESTINATION_STDERR; + } + if (csvlogFile != NULL && + ftell(csvlogFile) >= Log_RotationSize * 1024L) + { + rotation_requested = true; + size_rotation_for |= LOG_DESTINATION_CSVLOG; + } + if (jsonlogFile != NULL && + ftell(jsonlogFile) >= Log_RotationSize * 1024L) + { + rotation_requested = true; + size_rotation_for |= LOG_DESTINATION_JSONLOG; + } + } + + if (rotation_requested) + { + /* + * Force rotation when both values are zero. It means the request + * was sent by pg_rotate_logfile() or "pg_ctl logrotate". 
+ */ + if (!time_based_rotation && size_rotation_for == 0) + size_rotation_for = LOG_DESTINATION_STDERR | + LOG_DESTINATION_CSVLOG | + LOG_DESTINATION_JSONLOG; + logfile_rotate(time_based_rotation, size_rotation_for); + } + + /* + * Calculate time till next time-based rotation, so that we don't + * sleep longer than that. We assume the value of "now" obtained + * above is still close enough. Note we can't make this calculation + * until after calling logfile_rotate(), since it will advance + * next_rotation_time. + * + * Also note that we need to beware of overflow in calculation of the + * timeout: with large settings of Log_RotationAge, next_rotation_time + * could be more than INT_MAX msec in the future. In that case we'll + * wait no more than INT_MAX msec, and try again. + */ + if (Log_RotationAge > 0 && !rotation_disabled) + { + pg_time_t delay; + + delay = next_rotation_time - now; + if (delay > 0) + { + if (delay > INT_MAX / 1000) + delay = INT_MAX / 1000; + cur_timeout = delay * 1000L; /* msec */ + } + else + cur_timeout = 0; + } + else + cur_timeout = -1L; + + /* + * Sleep until there's something to do + */ +#ifndef WIN32 + rc = WaitEventSetWait(wes, cur_timeout, &event, 1, + WAIT_EVENT_SYSLOGGER_MAIN); + + if (rc == 1 && event.events == WL_SOCKET_READABLE) + { + int bytesRead; + + bytesRead = read(syslogPipe[0], + logbuffer + bytes_in_logbuffer, + sizeof(logbuffer) - bytes_in_logbuffer); + if (bytesRead < 0) + { + if (errno != EINTR) + ereport(LOG, + (errcode_for_socket_access(), + errmsg("could not read from logger pipe: %m"))); + } + else if (bytesRead > 0) + { + bytes_in_logbuffer += bytesRead; + process_pipe_input(logbuffer, &bytes_in_logbuffer); + continue; + } + else + { + /* + * Zero bytes read when select() is saying read-ready means + * EOF on the pipe: that is, there are no longer any processes + * with the pipe write end open. Therefore, the postmaster + * and all backends are shut down, and we are done. 
+ */ + pipe_eof_seen = true; + + /* if there's any data left then force it out now */ + flush_pipe_input(logbuffer, &bytes_in_logbuffer); + } + } +#else /* WIN32 */ + + /* + * On Windows we leave it to a separate thread to transfer data and + * detect pipe EOF. The main thread just wakes up to handle SIGHUP + * and rotation conditions. + * + * Server code isn't generally thread-safe, so we ensure that only one + * of the threads is active at a time by entering the critical section + * whenever we're not sleeping. + */ + LeaveCriticalSection(&sysloggerSection); + + (void) WaitEventSetWait(wes, cur_timeout, &event, 1, + WAIT_EVENT_SYSLOGGER_MAIN); + + EnterCriticalSection(&sysloggerSection); +#endif /* WIN32 */ + + if (pipe_eof_seen) + { + /* + * seeing this message on the real stderr is annoying - so we make + * it DEBUG1 to suppress in normal use. + */ + ereport(DEBUG1, + (errmsg_internal("logger shutting down"))); + + /* + * Normal exit from the syslogger is here. Note that we + * deliberately do not close syslogFile before exiting; this is to + * allow for the possibility of elog messages being generated + * inside proc_exit. Regular exit() will take care of flushing + * and closing stdio channels. + */ + proc_exit(0); + } + } +} + +/* + * Postmaster subroutine to start a syslogger subprocess. + * + * Returns the child's PID on success, or 0 if the logging collector is + * disabled (Logging_collector is false) or the fork failed. + * + * On the first successful start the postmaster's stderr (and, on non-Windows + * platforms, stdout) is redirected into the syslogger pipe. The initial log + * files opened here are closed again in the postmaster once the child has + * been forked, since only the child writes to them. + * + * NOTE(review): on fork failure (case -1) the log files opened above are not + * closed before returning 0 -- confirm whether that is intentional. + */ +int +SysLogger_Start(void) +{ + pid_t sysloggerPid; + char *filename; + + if (!Logging_collector) + return 0; + + /* + * If first time through, create the pipe which will receive stderr + * output. + * + * If the syslogger crashes and needs to be restarted, we continue to use + * the same pipe (indeed must do so, since extant backends will be writing + * into that pipe). + * + * This means the postmaster must continue to hold the read end of the + * pipe open, so we can pass it down to the reincarnated syslogger. This + * is a bit klugy but we have little choice. + * + * Also note that we don't bother counting the pipe FDs by calling + * Reserve/ReleaseExternalFD. 
There's no real need to account for them + * accurately in the postmaster or syslogger process, and both ends of the + * pipe will wind up closed in all other postmaster children. + */ +#ifndef WIN32 + if (syslogPipe[0] < 0) + { + if (pipe(syslogPipe) < 0) + ereport(FATAL, + (errcode_for_socket_access(), + errmsg("could not create pipe for syslog: %m"))); + } +#else + if (!syslogPipe[0]) + { + SECURITY_ATTRIBUTES sa; + + memset(&sa, 0, sizeof(SECURITY_ATTRIBUTES)); + sa.nLength = sizeof(SECURITY_ATTRIBUTES); + sa.bInheritHandle = TRUE; + + if (!CreatePipe(&syslogPipe[0], &syslogPipe[1], &sa, 32768)) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not create pipe for syslog: %m"))); + } +#endif + + /* + * Create log directory if not present; ignore errors + */ + (void) MakePGDirectory(Log_directory); + + /* + * The initial logfile is created right in the postmaster, to verify that + * the Log_directory is writable. We save the reference time so that the + * syslogger child process can recompute this file name. + * + * It might look a bit strange to re-do this during a syslogger restart, + * but we must do so since the postmaster closed syslogFile after the + * previous fork (and remembering that old file wouldn't be right anyway). + * Note we always append here, we won't overwrite any existing file. This + * is consistent with the normal rules, because by definition this is not + * a time-based rotation. + */ + first_syslogger_file_time = time(NULL); + + filename = logfile_getname(first_syslogger_file_time, NULL); + + syslogFile = logfile_open(filename, "a", false); + + pfree(filename); + + /* + * Likewise for the initial CSV log file, if that's enabled. (Note that + * we open syslogFile even when only CSV output is nominally enabled, + * since some code paths will write to syslogFile anyway.) 
+ */ + if (Log_destination & LOG_DESTINATION_CSVLOG) + { + filename = logfile_getname(first_syslogger_file_time, ".csv"); + + csvlogFile = logfile_open(filename, "a", false); + + pfree(filename); + } + + /* + * Likewise for the initial JSON log file, if that's enabled. (Note that + * we open syslogFile even when only JSON output is nominally enabled, + * since some code paths will write to syslogFile anyway.) + */ + if (Log_destination & LOG_DESTINATION_JSONLOG) + { + filename = logfile_getname(first_syslogger_file_time, ".json"); + + jsonlogFile = logfile_open(filename, "a", false); + + pfree(filename); + } + +#ifdef EXEC_BACKEND + switch ((sysloggerPid = syslogger_forkexec())) +#else + switch ((sysloggerPid = fork_process())) +#endif + { + case -1: + ereport(LOG, + (errmsg("could not fork system logger: %m"))); + return 0; + +#ifndef EXEC_BACKEND + case 0: + /* in postmaster child ... */ + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(true); + + /* Drop our connection to postmaster's shared memory, as well */ + dsm_detach_all(); + PGSharedMemoryDetach(); + + /* do the work */ + SysLoggerMain(0, NULL); + break; +#endif + + default: + /* success, in postmaster */ + + /* now we redirect stderr, if not done already */ + if (!redirection_done) + { +#ifdef WIN32 + int fd; +#endif + + /* + * Leave a breadcrumb trail when redirecting, in case the user + * forgets that redirection is active and looks only at the + * original stderr target file. 
+ */ + ereport(LOG, + (errmsg("redirecting log output to logging collector process"), + errhint("Future log output will appear in directory \"%s\".", + Log_directory))); + +#ifndef WIN32 + fflush(stdout); + if (dup2(syslogPipe[1], STDOUT_FILENO) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not redirect stdout: %m"))); + fflush(stderr); + if (dup2(syslogPipe[1], STDERR_FILENO) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not redirect stderr: %m"))); + /* Now we are done with the write end of the pipe. */ + close(syslogPipe[1]); + syslogPipe[1] = -1; +#else + + /* + * open the pipe in binary mode and make sure stderr is binary + * after it's been dup'ed into, to avoid disturbing the pipe + * chunking protocol. + */ + fflush(stderr); + fd = _open_osfhandle((intptr_t) syslogPipe[1], + _O_APPEND | _O_BINARY); + if (dup2(fd, STDERR_FILENO) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not redirect stderr: %m"))); + close(fd); + _setmode(STDERR_FILENO, _O_BINARY); + + /* + * Now we are done with the write end of the pipe. + * CloseHandle() must not be called because the preceding + * close() closes the underlying handle. + */ + syslogPipe[1] = 0; +#endif + redirection_done = true; + } + + /* postmaster will never write the file(s); close 'em */ + fclose(syslogFile); + syslogFile = NULL; + if (csvlogFile != NULL) + { + fclose(csvlogFile); + csvlogFile = NULL; + } + if (jsonlogFile != NULL) + { + fclose(jsonlogFile); + jsonlogFile = NULL; + } + return (int) sysloggerPid; + } + + /* we should never reach here */ + return 0; +} + + +#ifdef EXEC_BACKEND + +/* + * syslogger_fdget() - + * + * Utility wrapper to grab the file descriptor of an opened error output + * file. Used when building the command to fork the logging collector. 
+ */ +static int +syslogger_fdget(FILE *file) +{ +#ifndef WIN32 + if (file != NULL) + return fileno(file); + else + return -1; +#else + if (file != NULL) + return (int) _get_osfhandle(_fileno(file)); + else + return 0; +#endif /* WIN32 */ +} + +/* + * syslogger_fdopen() - + * + * Utility wrapper to re-open an error output file, using the given file + * descriptor. Used when parsing arguments in a forked logging collector. + */ +static FILE * +syslogger_fdopen(int fd) +{ + FILE *file = NULL; + +#ifndef WIN32 + if (fd != -1) + { + file = fdopen(fd, "a"); + setvbuf(file, NULL, PG_IOLBF, 0); + } +#else /* WIN32 */ + if (fd != 0) + { + fd = _open_osfhandle(fd, _O_APPEND | _O_TEXT); + if (fd > 0) + { + file = fdopen(fd, "a"); + setvbuf(file, NULL, PG_IOLBF, 0); + } + } +#endif /* WIN32 */ + + return file; +} + +/* + * syslogger_forkexec() - + * + * Format up the arglist for, then fork and exec, a syslogger process + */ +static pid_t +syslogger_forkexec(void) +{ + char *av[10]; + int ac = 0; + char filenobuf[32]; + char csvfilenobuf[32]; + char jsonfilenobuf[32]; + + av[ac++] = "postgres"; + av[ac++] = "--forklog"; + av[ac++] = NULL; /* filled in by postmaster_forkexec */ + + /* static variables (those not passed by write_backend_variables) */ + snprintf(filenobuf, sizeof(filenobuf), "%d", + syslogger_fdget(syslogFile)); + av[ac++] = filenobuf; + snprintf(csvfilenobuf, sizeof(csvfilenobuf), "%d", + syslogger_fdget(csvlogFile)); + av[ac++] = csvfilenobuf; + snprintf(jsonfilenobuf, sizeof(jsonfilenobuf), "%d", + syslogger_fdget(jsonlogFile)); + av[ac++] = jsonfilenobuf; + + av[ac] = NULL; + Assert(ac < lengthof(av)); + + return postmaster_forkexec(ac, av); +} + +/* + * syslogger_parseArgs() - + * + * Extract data from the arglist for exec'ed syslogger process + */ +static void +syslogger_parseArgs(int argc, char *argv[]) +{ + int fd; + + Assert(argc == 6); + argv += 3; + + /* + * Re-open the error output files that were opened by SysLogger_Start(). 
+ * + * We expect this will always succeed, which is too optimistic, but if it + * fails there's not a lot we can do to report the problem anyway. As + * coded, we'll just crash on a null pointer dereference after failure... + */ + fd = atoi(*argv++); + syslogFile = syslogger_fdopen(fd); + fd = atoi(*argv++); + csvlogFile = syslogger_fdopen(fd); + fd = atoi(*argv++); + jsonlogFile = syslogger_fdopen(fd); +} +#endif /* EXEC_BACKEND */ + + +/* -------------------------------- + * pipe protocol handling + * -------------------------------- + */ + +/* + * Process data received through the syslogger pipe. + * + * This routine interprets the log pipe protocol which sends log messages as + * (hopefully atomic) chunks - such chunks are detected and reassembled here. + * + * The protocol has a header that starts with two nul bytes, then has a 16 bit + * length, the pid of the sending process, and a flag to indicate if it is + * the last chunk in a message. Incomplete chunks are saved until we read some + * more, and non-final chunks are accumulated until we get the final chunk. + * + * All of this is to avoid 2 problems: + * . partial messages being written to logfiles (messes rotation), and + * . messages from different backends being interleaved (messages garbled). + * + * Any non-protocol messages are written out directly. These should only come + * from non-PostgreSQL sources, however (e.g. third party libraries writing to + * stderr). + * + * logbuffer is the data input buffer, and *bytes_in_logbuffer is the number + * of bytes present. On exit, any not-yet-eaten data is left-justified in + * logbuffer, and *bytes_in_logbuffer is updated. + */ +static void +process_pipe_input(char *logbuffer, int *bytes_in_logbuffer) +{ + char *cursor = logbuffer; + int count = *bytes_in_logbuffer; + int dest = LOG_DESTINATION_STDERR; + + /* While we have enough for a header, process data... 
*/ + while (count >= (int) (offsetof(PipeProtoHeader, data) + 1)) + { + PipeProtoHeader p; + int chunklen; + bits8 dest_flags; + + /* Do we have a valid header? */ + memcpy(&p, cursor, offsetof(PipeProtoHeader, data)); + dest_flags = p.flags & (PIPE_PROTO_DEST_STDERR | + PIPE_PROTO_DEST_CSVLOG | + PIPE_PROTO_DEST_JSONLOG); + if (p.nuls[0] == '\0' && p.nuls[1] == '\0' && + p.len > 0 && p.len <= PIPE_MAX_PAYLOAD && + p.pid != 0 && + pg_popcount((char *) &dest_flags, 1) == 1) + { + List *buffer_list; + ListCell *cell; + save_buffer *existing_slot = NULL, + *free_slot = NULL; + StringInfo str; + + chunklen = PIPE_HEADER_SIZE + p.len; + + /* Fall out of loop if we don't have the whole chunk yet */ + if (count < chunklen) + break; + + if ((p.flags & PIPE_PROTO_DEST_STDERR) != 0) + dest = LOG_DESTINATION_STDERR; + else if ((p.flags & PIPE_PROTO_DEST_CSVLOG) != 0) + dest = LOG_DESTINATION_CSVLOG; + else if ((p.flags & PIPE_PROTO_DEST_JSONLOG) != 0) + dest = LOG_DESTINATION_JSONLOG; + else + { + /* this should never happen as of the header validation */ + Assert(false); + } + + /* Locate any existing buffer for this source pid */ + buffer_list = buffer_lists[p.pid % NBUFFER_LISTS]; + foreach(cell, buffer_list) + { + save_buffer *buf = (save_buffer *) lfirst(cell); + + if (buf->pid == p.pid) + { + existing_slot = buf; + break; + } + if (buf->pid == 0 && free_slot == NULL) + free_slot = buf; + } + + if ((p.flags & PIPE_PROTO_IS_LAST) == 0) + { + /* + * Save a complete non-final chunk in a per-pid buffer + */ + if (existing_slot != NULL) + { + /* Add chunk to data from preceding chunks */ + str = &(existing_slot->data); + appendBinaryStringInfo(str, + cursor + PIPE_HEADER_SIZE, + p.len); + } + else + { + /* First chunk of message, save in a new buffer */ + if (free_slot == NULL) + { + /* + * Need a free slot, but there isn't one in the list, + * so create a new one and extend the list with it. 
+ */ + free_slot = palloc(sizeof(save_buffer)); + buffer_list = lappend(buffer_list, free_slot); + buffer_lists[p.pid % NBUFFER_LISTS] = buffer_list; + } + free_slot->pid = p.pid; + str = &(free_slot->data); + initStringInfo(str); + appendBinaryStringInfo(str, + cursor + PIPE_HEADER_SIZE, + p.len); + } + } + else + { + /* + * Final chunk --- add it to anything saved for that pid, and + * either way write the whole thing out. + */ + if (existing_slot != NULL) + { + str = &(existing_slot->data); + appendBinaryStringInfo(str, + cursor + PIPE_HEADER_SIZE, + p.len); + write_syslogger_file(str->data, str->len, dest); + /* Mark the buffer unused, and reclaim string storage */ + existing_slot->pid = 0; + pfree(str->data); + } + else + { + /* The whole message was one chunk, evidently. */ + write_syslogger_file(cursor + PIPE_HEADER_SIZE, p.len, + dest); + } + } + + /* Finished processing this chunk */ + cursor += chunklen; + count -= chunklen; + } + else + { + /* Process non-protocol data */ + + /* + * Look for the start of a protocol header. If found, dump data + * up to there and repeat the loop. Otherwise, dump it all and + * fall out of the loop. (Note: we want to dump it all if at all + * possible, so as to avoid dividing non-protocol messages across + * logfiles. We expect that in many scenarios, a non-protocol + * message will arrive all in one read(), and we want to respect + * the read() boundary if possible.) 
+ */ + for (chunklen = 1; chunklen < count; chunklen++) + { + if (cursor[chunklen] == '\0') + break; + } + /* fall back on the stderr log as the destination */ + write_syslogger_file(cursor, chunklen, LOG_DESTINATION_STDERR); + cursor += chunklen; + count -= chunklen; + } + } + + /* We don't have a full chunk, so left-align what remains in the buffer */ + if (count > 0 && cursor != logbuffer) + memmove(logbuffer, cursor, count); + *bytes_in_logbuffer = count; +} + +/* + * Force out any buffered data + * + * This is currently used only at syslogger shutdown, but could perhaps be + * useful at other times, so it is careful to leave things in a clean state. + */ +static void +flush_pipe_input(char *logbuffer, int *bytes_in_logbuffer) +{ + int i; + + /* Dump any incomplete protocol messages */ + for (i = 0; i < NBUFFER_LISTS; i++) + { + List *list = buffer_lists[i]; + ListCell *cell; + + foreach(cell, list) + { + save_buffer *buf = (save_buffer *) lfirst(cell); + + if (buf->pid != 0) + { + StringInfo str = &(buf->data); + + write_syslogger_file(str->data, str->len, + LOG_DESTINATION_STDERR); + /* Mark the buffer unused, and reclaim string storage */ + buf->pid = 0; + pfree(str->data); + } + } + } + + /* + * Force out any remaining pipe data as-is; we don't bother trying to + * remove any protocol headers that may exist in it. + */ + if (*bytes_in_logbuffer > 0) + write_syslogger_file(logbuffer, *bytes_in_logbuffer, + LOG_DESTINATION_STDERR); + *bytes_in_logbuffer = 0; +} + + +/* -------------------------------- + * logfile routines + * -------------------------------- + */ + +/* + * Write text to the currently open logfile + * + * This is exported so that elog.c can call it when MyBackendType is B_LOGGER. + * This allows the syslogger process to record elog messages of its own, + * even though its stderr does not point at the syslog pipe. 
+ */ +void +write_syslogger_file(const char *buffer, int count, int destination) +{ + int rc; + FILE *logfile; + + /* + * If we're told to write to a structured log file, but it's not open, + * dump the data to syslogFile (which is always open) instead. This can + * happen if structured output is enabled after postmaster start and we've + * been unable to open logFile. There are also race conditions during a + * parameter change whereby backends might send us structured output + * before we open the logFile or after we close it. Writing formatted + * output to the regular log file isn't great, but it beats dropping log + * output on the floor. + * + * Think not to improve this by trying to open logFile on-the-fly. Any + * failure in that would lead to recursion. + */ + if ((destination & LOG_DESTINATION_CSVLOG) && csvlogFile != NULL) + logfile = csvlogFile; + else if ((destination & LOG_DESTINATION_JSONLOG) && jsonlogFile != NULL) + logfile = jsonlogFile; + else + logfile = syslogFile; + + rc = fwrite(buffer, 1, count, logfile); + + /* + * Try to report any failure. We mustn't use ereport because it would + * just recurse right back here, but write_stderr is OK: it will write + * either to the postmaster's original stderr, or to /dev/null, but never + * to our input pipe which would result in a different sort of looping. + */ + if (rc != count) + write_stderr("could not write to log file: %s\n", strerror(errno)); +} + +#ifdef WIN32 + +/* + * Worker thread to transfer data from the pipe to the current logfile. + * + * We need this because on Windows, WaitForMultipleObjects does not work on + * unnamed pipes: it always reports "signaled", so the blocking ReadFile won't + * allow for SIGHUP; and select is for sockets only. 
+ */ +static unsigned int __stdcall +pipeThread(void *arg) +{ + char logbuffer[READ_BUF_SIZE]; + int bytes_in_logbuffer = 0; + + for (;;) + { + DWORD bytesRead; + BOOL result; + + result = ReadFile(syslogPipe[0], + logbuffer + bytes_in_logbuffer, + sizeof(logbuffer) - bytes_in_logbuffer, + &bytesRead, 0); + + /* + * Enter critical section before doing anything that might touch + * global state shared by the main thread. In particular, anything + * that uses palloc()/pfree() is not safe outside the critical + * section. Note that the blocking ReadFile() above is issued + * without holding the critical section. + */ + EnterCriticalSection(&sysloggerSection); + if (!result) + { + DWORD error = GetLastError(); + + if (error == ERROR_HANDLE_EOF || + error == ERROR_BROKEN_PIPE) + break; + _dosmaperr(error); + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not read from logger pipe: %m"))); + } + else if (bytesRead > 0) + { + bytes_in_logbuffer += bytesRead; + process_pipe_input(logbuffer, &bytes_in_logbuffer); + } + + /* + * If we've filled the current logfile, nudge the main thread to do a + * log rotation. Log_RotationSize is compared after multiplying by + * 1024, and any one of the open log files reaching the limit is + * enough to set the latch. + */ + if (Log_RotationSize > 0) + { + if (ftell(syslogFile) >= Log_RotationSize * 1024L || + (csvlogFile != NULL && ftell(csvlogFile) >= Log_RotationSize * 1024L) || + (jsonlogFile != NULL && ftell(jsonlogFile) >= Log_RotationSize * 1024L)) + SetLatch(MyLatch); + } + LeaveCriticalSection(&sysloggerSection); + } + + /* We exit the above loop only upon detecting pipe EOF; the break happens after EnterCriticalSection, so the critical section is still held here */ + pipe_eof_seen = true; + + /* if there's any data left then force it out now */ + flush_pipe_input(logbuffer, &bytes_in_logbuffer); + + /* set the latch to waken the main thread, which will quit */ + SetLatch(MyLatch); + + LeaveCriticalSection(&sysloggerSection); + _endthread(); + return 0; +} +#endif /* WIN32 */ + +/* + * Open a new logfile with proper permissions and buffering options. + * + * If allow_errors is true, we just log any open failure and return NULL + * (with errno still correct for the fopen failure). + * Otherwise, errors are treated as fatal. 
+ */ +static FILE * +logfile_open(const char *filename, const char *mode, bool allow_errors) +{ + FILE *fh; + mode_t oumask; + + /* + * Note we do not let Log_file_mode disable IWUSR, since we certainly want + * to be able to write the files ourselves. + */ + oumask = umask((mode_t) ((~(Log_file_mode | S_IWUSR)) & (S_IRWXU | S_IRWXG | S_IRWXO))); + fh = fopen(filename, mode); + umask(oumask); + + if (fh) + { + setvbuf(fh, NULL, PG_IOLBF, 0); + +#ifdef WIN32 + /* use CRLF line endings on Windows */ + _setmode(_fileno(fh), _O_TEXT); +#endif + } + else + { + int save_errno = errno; + + ereport(allow_errors ? LOG : FATAL, + (errcode_for_file_access(), + errmsg("could not open log file \"%s\": %m", + filename))); + errno = save_errno; + } + + return fh; +} + +/* + * Do logfile rotation for a single destination, as specified by target_dest. + * The information stored in *last_file_name and *logFile is updated on a + * successful file rotation. + * + * Returns false if the rotation has been stopped, or true to move on to + * the processing of other formats. + */ +static bool +logfile_rotate_dest(bool time_based_rotation, int size_rotation_for, + pg_time_t fntime, int target_dest, + char **last_file_name, FILE **logFile) +{ + char *logFileExt = NULL; + char *filename; + FILE *fh; + + /* + * If the target destination was just turned off, close the previous file + * and unregister its data. This cannot happen for stderr as syslogFile + * is assumed to be always opened even if stderr is disabled in + * log_destination. + */ + if ((Log_destination & target_dest) == 0 && + target_dest != LOG_DESTINATION_STDERR) + { + if (*logFile != NULL) + fclose(*logFile); + *logFile = NULL; + if (*last_file_name != NULL) + pfree(*last_file_name); + *last_file_name = NULL; + return true; + } + + /* + * Leave if it is not time for a rotation or if the target destination has + * no need to do a rotation based on the size of its file. 
+ */ + if (!time_based_rotation && (size_rotation_for & target_dest) == 0) + return true; + + /* file extension depends on the destination type */ + if (target_dest == LOG_DESTINATION_STDERR) + logFileExt = NULL; + else if (target_dest == LOG_DESTINATION_CSVLOG) + logFileExt = ".csv"; + else if (target_dest == LOG_DESTINATION_JSONLOG) + logFileExt = ".json"; + else + { + /* cannot happen */ + Assert(false); + } + + /* build the new file name */ + filename = logfile_getname(fntime, logFileExt); + + /* + * Decide whether to overwrite or append. We can overwrite if (a) + * Log_truncate_on_rotation is set, (b) the rotation was triggered by + * elapsed time and not something else, and (c) the computed file name is + * different from what we were previously logging into. + */ + if (Log_truncate_on_rotation && time_based_rotation && + *last_file_name != NULL && + strcmp(filename, *last_file_name) != 0) + fh = logfile_open(filename, "w", true); + else + fh = logfile_open(filename, "a", true); + + if (!fh) + { + /* + * ENFILE/EMFILE are not too surprising on a busy system; just keep + * using the old file till we manage to get a new one. Otherwise, + * assume something's wrong with Log_directory and stop trying to + * create files. 
+ */ + if (errno != ENFILE && errno != EMFILE) + { + ereport(LOG, + (errmsg("disabling automatic rotation (use SIGHUP to re-enable)"))); + rotation_disabled = true; + } + + if (filename) + pfree(filename); + return false; + } + + /* fill in the new information */ + if (*logFile != NULL) + fclose(*logFile); + *logFile = fh; + + /* instead of pfree'ing filename, remember it for next time */ + if (*last_file_name != NULL) + pfree(*last_file_name); + *last_file_name = filename; + + return true; +} + +/* + * perform logfile rotation + */ +static void +logfile_rotate(bool time_based_rotation, int size_rotation_for) +{ + pg_time_t fntime; + + rotation_requested = false; + + /* + * When doing a time-based rotation, invent the new logfile name based on + * the planned rotation time, not current time, to avoid "slippage" in the + * file name when we don't do the rotation immediately. + */ + if (time_based_rotation) + fntime = next_rotation_time; + else + fntime = time(NULL); + + /* file rotation for stderr */ + if (!logfile_rotate_dest(time_based_rotation, size_rotation_for, fntime, + LOG_DESTINATION_STDERR, &last_sys_file_name, + &syslogFile)) + return; + + /* file rotation for csvlog */ + if (!logfile_rotate_dest(time_based_rotation, size_rotation_for, fntime, + LOG_DESTINATION_CSVLOG, &last_csv_file_name, + &csvlogFile)) + return; + + /* file rotation for jsonlog */ + if (!logfile_rotate_dest(time_based_rotation, size_rotation_for, fntime, + LOG_DESTINATION_JSONLOG, &last_json_file_name, + &jsonlogFile)) + return; + + update_metainfo_datafile(); + + set_next_rotation_time(); +} + + +/* + * construct logfile name using timestamp information + * + * If suffix isn't NULL, append it to the name, replacing any ".log" + * that may be in the pattern. + * + * Result is palloc'd. 
+ */ +static char * +logfile_getname(pg_time_t timestamp, const char *suffix) +{ + char *filename; + int len; + + filename = palloc(MAXPGPATH); + + snprintf(filename, MAXPGPATH, "%s/", Log_directory); + + len = strlen(filename); + + /* treat Log_filename as a strftime pattern */ + pg_strftime(filename + len, MAXPGPATH - len, Log_filename, + pg_localtime(×tamp, log_timezone)); + + if (suffix != NULL) + { + len = strlen(filename); + if (len > 4 && (strcmp(filename + (len - 4), ".log") == 0)) + len -= 4; + strlcpy(filename + len, suffix, MAXPGPATH - len); + } + + return filename; +} + +/* + * Determine the next planned rotation time, and store in next_rotation_time. + */ +static void +set_next_rotation_time(void) +{ + pg_time_t now; + struct pg_tm *tm; + int rotinterval; + + /* nothing to do if time-based rotation is disabled */ + if (Log_RotationAge <= 0) + return; + + /* + * The requirements here are to choose the next time > now that is a + * "multiple" of the log rotation interval. "Multiple" can be interpreted + * fairly loosely. In this version we align to log_timezone rather than + * GMT. + */ + rotinterval = Log_RotationAge * SECS_PER_MINUTE; /* convert to seconds */ + now = (pg_time_t) time(NULL); + tm = pg_localtime(&now, log_timezone); + now += tm->tm_gmtoff; + now -= now % rotinterval; + now += rotinterval; + now -= tm->tm_gmtoff; + next_rotation_time = now; +} + +/* + * Store the name of the file(s) where the log collector, when enabled, writes + * log messages. Useful for finding the name(s) of the current log file(s) + * when there is time-based logfile rotation. Filenames are stored in a + * temporary file and which is renamed into the final destination for + * atomicity. The file is opened with the same permissions as what gets + * created in the data directory and has proper buffering options. 
+ */ +static void +update_metainfo_datafile(void) +{ + FILE *fh; + mode_t oumask; + + if (!(Log_destination & LOG_DESTINATION_STDERR) && + !(Log_destination & LOG_DESTINATION_CSVLOG) && + !(Log_destination & LOG_DESTINATION_JSONLOG)) + { + if (unlink(LOG_METAINFO_DATAFILE) < 0 && errno != ENOENT) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + LOG_METAINFO_DATAFILE))); + return; + } + + /* use the same permissions as the data directory for the new file; the previous umask is restored right after the fopen */ + oumask = umask(pg_mode_mask); + fh = fopen(LOG_METAINFO_DATAFILE_TMP, "w"); + umask(oumask); + + if (fh) + { + setvbuf(fh, NULL, PG_IOLBF, 0); + +#ifdef WIN32 + /* use CRLF line endings on Windows */ + _setmode(_fileno(fh), _O_TEXT); +#endif + } + else + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + LOG_METAINFO_DATAFILE_TMP))); + return; + } + + if (last_sys_file_name && (Log_destination & LOG_DESTINATION_STDERR)) + { + if (fprintf(fh, "stderr %s\n", last_sys_file_name) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + LOG_METAINFO_DATAFILE_TMP))); + fclose(fh); + return; + } + } + + if (last_csv_file_name && (Log_destination & LOG_DESTINATION_CSVLOG)) + { + if (fprintf(fh, "csvlog %s\n", last_csv_file_name) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + LOG_METAINFO_DATAFILE_TMP))); + fclose(fh); + return; + } + } + + if (last_json_file_name && (Log_destination & LOG_DESTINATION_JSONLOG)) + { + if (fprintf(fh, "jsonlog %s\n", last_json_file_name) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + LOG_METAINFO_DATAFILE_TMP))); + fclose(fh); + return; + } + } + fclose(fh); + + if (rename(LOG_METAINFO_DATAFILE_TMP, LOG_METAINFO_DATAFILE) != 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + LOG_METAINFO_DATAFILE_TMP, 
LOG_METAINFO_DATAFILE))); +} + +/* -------------------------------- + * signal handler routines + * -------------------------------- + */ + +/* + * Check to see if a log rotation request has arrived. Should be + * called by postmaster after receiving SIGUSR1. + * + * Returns true if stat() on LOGROTATE_SIGNAL_FILE succeeds, false otherwise. + */ +bool +CheckLogrotateSignal(void) +{ + struct stat stat_buf; + + if (stat(LOGROTATE_SIGNAL_FILE, &stat_buf) == 0) + return true; + + return false; +} + +/* + * Remove the file signaling a log rotation request. + * + * The result of unlink() is not checked. + */ +void +RemoveLogrotateSignalFiles(void) +{ + unlink(LOGROTATE_SIGNAL_FILE); +} + +/* SIGUSR1: set flag to rotate logfile and set our latch; errno is saved and restored around the handler body */ +static void +sigUsr1Handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + rotation_requested = true; + SetLatch(MyLatch); + + errno = save_errno; +} diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c new file mode 100644 index 0000000..266fbc2 --- /dev/null +++ b/src/backend/postmaster/walwriter.c @@ -0,0 +1,300 @@ +/*------------------------------------------------------------------------- + * + * walwriter.c + * + * The WAL writer background process is new as of Postgres 8.3. It attempts + * to keep regular backends from having to write out (and fsync) WAL pages. + * Also, it guarantees that transaction commit records that weren't synced + * to disk immediately upon commit (ie, were "asynchronously committed") + * will reach disk within a knowable time --- which, as it happens, is at + * most three times the wal_writer_delay cycle time. + * + * Note that as with the bgwriter for shared buffers, regular backends are + * still empowered to issue WAL writes and fsyncs when the walwriter doesn't + * keep up. This means that the WALWriter is not an essential process and + * can shut down quickly when requested. + * + * Because the walwriter's cycle is directly linked to the maximum delay + * before async-commit transactions are guaranteed committed, it's probably + * unwise to load additional functionality onto it. 
For instance, if you've + * got a yen to create xlog segments further in advance, that'd be better done + * in bgwriter than in walwriter. + * + * The walwriter is started by the postmaster as soon as the startup subprocess + * finishes. It remains alive until the postmaster commands it to terminate. + * Normal termination is by SIGTERM, which instructs the walwriter to exit(0). + * Emergency termination is by SIGQUIT; like any backend, the walwriter will + * simply abort and exit on SIGQUIT. + * + * If the walwriter exits unexpectedly, the postmaster treats that the same + * as a backend crash: shared memory may be corrupted, so remaining backends + * should be killed by SIGQUIT and then a recovery cycle started. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/postmaster/walwriter.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <unistd.h> + +#include "access/xlog.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/interrupt.h" +#include "postmaster/walwriter.h" +#include "storage/bufmgr.h" +#include "storage/condition_variable.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/smgr.h" +#include "utils/guc.h" +#include "utils/hsearch.h" +#include "utils/memutils.h" +#include "utils/resowner.h" + + +/* + * GUC parameters + * + * WalWriterDelay is used by the main loop as a latch timeout in + * milliseconds. + */ +int WalWriterDelay = 200; +int WalWriterFlushAfter = DEFAULT_WAL_WRITER_FLUSH_AFTER; + +/* + * Number of do-nothing loops before lengthening the delay time, and the + * multiplier to apply to WalWriterDelay when we do decide to hibernate. + * (Perhaps these need to be configurable?) 
+ */ +#define LOOPS_UNTIL_HIBERNATE 50 +#define HIBERNATE_FACTOR 25 + +/* Prototypes for private functions */ +static void HandleWalWriterInterrupts(void); + +/* + * Main entry point for walwriter process + * + * This is invoked from AuxiliaryProcessMain, which has already created the + * basic execution environment, but not enabled signals yet. + * + * The main loop below has no exit of its own; a shutdown request is acted + * on inside HandleWalWriterInterrupts(), which calls proc_exit(0). + */ +void +WalWriterMain(void) +{ + sigjmp_buf local_sigjmp_buf; + MemoryContext walwriter_context; + int left_till_hibernate; + bool hibernating; + + /* + * Properly accept or ignore signals the postmaster might send us + * + * We have no particular use for SIGINT at the moment, but seems + * reasonable to treat like SIGTERM. + */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGINT, SignalHandlerForShutdownRequest); + pqsignal(SIGTERM, SignalHandlerForShutdownRequest); + /* SIGQUIT handler was already set up by InitPostmasterChild */ + pqsignal(SIGALRM, SIG_IGN); + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, SIG_IGN); /* not used */ + + /* + * Reset some signals that are accepted by postmaster but not here + */ + pqsignal(SIGCHLD, SIG_DFL); + + /* + * Create a memory context that we will do all our work in. We do this so + * that we can reset the context during error recovery and thereby avoid + * possible memory leaks. Formerly this code just ran in + * TopMemoryContext, but resetting that would be a really bad idea. + */ + walwriter_context = AllocSetContextCreate(TopMemoryContext, + "Wal Writer", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(walwriter_context); + + /* + * If an exception is encountered, processing resumes here. + * + * You might wonder why this isn't coded as an infinite loop around a + * PG_TRY construct. The reason is that this is the bottom of the + * exception stack, and so with PG_TRY there would be no exception handler + * in force at all during the CATCH part. 
By leaving the outermost setjmp + * always active, we have at least some chance of recovering from an error + * during error recovery. (If we get into an infinite loop thereby, it + * will soon be stopped by overflow of elog.c's internal state stack.) + * + * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask + * (to wit, BlockSig) will be restored when longjmp'ing to here. Thus, + * signals other than SIGQUIT will be blocked until we complete error + * recovery. It might seem that this policy makes the HOLD_INTERRUPTS() + * call redundant, but it is not since InterruptPending might be set + * already. + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* Since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevent interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* Report the error to the server log */ + EmitErrorReport(); + + /* + * These operations are really just a minimal subset of + * AbortTransaction(). We don't have very many resources to worry + * about in walwriter, but we do have LWLocks, and perhaps buffers? + */ + LWLockReleaseAll(); + ConditionVariableCancelSleep(); + pgstat_report_wait_end(); + UnlockBuffers(); + ReleaseAuxProcessResources(false); + AtEOXact_Buffers(false); + AtEOXact_SMgr(); + AtEOXact_Files(false); + AtEOXact_HashTables(false); + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. + */ + MemoryContextSwitchTo(walwriter_context); + FlushErrorState(); + + /* Flush any leaked data in the top-level context */ + MemoryContextResetAndDeleteChildren(walwriter_context); + + /* Now we can allow interrupts again */ + RESUME_INTERRUPTS(); + + /* + * Sleep at least 1 second after any error. A write error is likely + * to be repeated, and we don't want to be filling the error logs as + * fast as we can. + */ + pg_usleep(1000000L); + + /* + * Close all open files after any error. 
This is helpful on Windows, + * where holding deleted files open causes various strange errors. + * It's not clear we need it elsewhere, but shouldn't hurt. + */ + smgrcloseall(); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + /* + * Unblock signals (they were blocked when the postmaster forked us) + */ + sigprocmask(SIG_SETMASK, &UnBlockSig, NULL); + + /* + * Reset hibernation state after any error. + */ + left_till_hibernate = LOOPS_UNTIL_HIBERNATE; + hibernating = false; + SetWalWriterSleeping(false); + + /* + * Advertise our latch that backends can use to wake us up while we're + * sleeping. + */ + ProcGlobal->walwriterLatch = &MyProc->procLatch; + + /* + * Loop forever + */ + for (;;) + { + long cur_timeout; + + /* + * Advertise whether we might hibernate in this cycle. We do this + * before resetting the latch to ensure that any async commits will + * see the flag set if they might possibly need to wake us up, and + * that we won't miss any signal they send us. (If we discover work + * to do in the last cycle before we would hibernate, the global flag + * will be set unnecessarily, but little harm is done.) But avoid + * touching the global flag if it doesn't need to change. + */ + if (hibernating != (left_till_hibernate <= 1)) + { + hibernating = (left_till_hibernate <= 1); + SetWalWriterSleeping(hibernating); + } + + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + /* Process any signals received recently */ + HandleWalWriterInterrupts(); + + /* + * Do what we're here for; then, if XLogBackgroundFlush() found useful + * work to do, reset hibernation counter. + */ + if (XLogBackgroundFlush()) + left_till_hibernate = LOOPS_UNTIL_HIBERNATE; + else if (left_till_hibernate > 0) + left_till_hibernate--; + + /* report pending statistics to the cumulative stats system */ + pgstat_report_wal(false); + + /* + * Sleep until we are signaled or WalWriterDelay has elapsed. 
If we + * haven't done anything useful for quite some time, lengthen the + * sleep time so as to reduce the server's idle power consumption. + */ + if (left_till_hibernate > 0) + cur_timeout = WalWriterDelay; /* in ms */ + else + cur_timeout = WalWriterDelay * HIBERNATE_FACTOR; + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + cur_timeout, + WAIT_EVENT_WAL_WRITER_MAIN); + } +} + +/* + * Interrupt handler for main loops of WAL writer process. + * + * Checks, in this order: a pending ProcSignal barrier, a pending + * configuration reload (which re-reads the config file with PGC_SIGHUP), + * a pending shutdown request (which exits via proc_exit(0)), and a pending + * request to log this process's memory contexts. + */ +static void +HandleWalWriterInterrupts(void) +{ + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + if (ShutdownRequestPending) + proc_exit(0); + + /* Perform logging of memory contexts of this process */ + if (LogMemoryContextPending) + ProcessLogMemoryContextInterrupt(); +} |