author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-05-04 12:15:05 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-05-04 12:15:05 +0000
commit     46651ce6fe013220ed397add242004d764fc0153
tree       6e5299f990f88e60174a1d3ae6e48eedd2688b2b   /src/backend/postmaster
parent     Initial commit.
download   postgresql-14-upstream.tar.xz   postgresql-14-upstream.zip

Adding upstream version 14.5.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat

 -rw-r--r--  src/backend/postmaster/Makefile          29
 -rw-r--r--  src/backend/postmaster/autovacuum.c    3505
 -rw-r--r--  src/backend/postmaster/bgworker.c      1325
 -rw-r--r--  src/backend/postmaster/bgwriter.c       351
 -rw-r--r--  src/backend/postmaster/checkpointer.c  1354
 -rw-r--r--  src/backend/postmaster/fork_process.c   115
 -rw-r--r--  src/backend/postmaster/interrupt.c      112
 -rw-r--r--  src/backend/postmaster/pgarch.c         718
 -rw-r--r--  src/backend/postmaster/pgstat.c        5851
 -rw-r--r--  src/backend/postmaster/postmaster.c    6647
 -rw-r--r--  src/backend/postmaster/startup.c        283
 -rw-r--r--  src/backend/postmaster/syslogger.c     1566
 -rw-r--r--  src/backend/postmaster/walwriter.c      309

13 files changed, 22165 insertions, 0 deletions
diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile new file mode 100644 index 0000000..bfdf6a8 --- /dev/null +++ b/src/backend/postmaster/Makefile @@ -0,0 +1,29 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for src/backend/postmaster +# +# IDENTIFICATION +# src/backend/postmaster/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/postmaster +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + autovacuum.o \ + bgworker.o \ + bgwriter.o \ + checkpointer.o \ + fork_process.o \ + interrupt.o \ + pgarch.o \ + pgstat.o \ + postmaster.o \ + startup.o \ + syslogger.o \ + walwriter.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c new file mode 100644 index 0000000..23ebac8 --- /dev/null +++ b/src/backend/postmaster/autovacuum.c @@ -0,0 +1,3505 @@ +/*------------------------------------------------------------------------- + * + * autovacuum.c + * + * PostgreSQL Integrated Autovacuum Daemon + * + * The autovacuum system is structured in two different kinds of processes: the + * autovacuum launcher and the autovacuum worker. The launcher is an + * always-running process, started by the postmaster when the autovacuum GUC + * parameter is set. The launcher schedules autovacuum workers to be started + * when appropriate. The workers are the processes which execute the actual + * vacuuming; they connect to a database as determined in the launcher, and + * once connected they examine the catalogs to select the tables to vacuum. + * + * The autovacuum launcher cannot start the worker processes by itself, + * because doing so would cause robustness issues (namely, failure to shut + * them down on exceptional conditions, and also, since the launcher is + * connected to shared memory and is thus subject to corruption there, it is + * not as robust as the postmaster). So it leaves that task to the postmaster. + * + * There is an autovacuum shared memory area, where the launcher stores + * information about the database it wants vacuumed. When it wants a new + * worker to start, it sets a flag in shared memory and sends a signal to the + * postmaster. Then postmaster knows nothing more than it must start a worker; + * so it forks a new child, which turns into a worker. This new process + * connects to shared memory, and there it can inspect the information that the + * launcher has set up. + * + * If the fork() call fails in the postmaster, it sets a flag in the shared + * memory area, and sends a signal to the launcher. The launcher, upon + * noticing the flag, can try starting the worker again by resending the + * signal. Note that the failure can only be transient (fork failure due to + * high load, memory pressure, too many processes, etc); more permanent + * problems, like failure to connect to a database, are detected later in the + * worker and dealt with just by having the worker exit normally. The launcher + * will launch a new worker again later, per schedule. + * + * When the worker is done vacuuming it sends SIGUSR2 to the launcher. The + * launcher then wakes up and is able to launch another worker, if the schedule + * is so tight that a new worker is needed immediately. At this time the + * launcher can also balance the settings for the various remaining workers' + * cost-based vacuum delay feature. 
+ * + * Note that there can be more than one worker in a database concurrently. + * They will store the table they are currently vacuuming in shared memory, so + * that other workers avoid being blocked waiting for the vacuum lock for that + * table. They will also reload the pgstats data just before vacuuming each + * table, to avoid vacuuming a table that was just finished being vacuumed by + * another worker and thus is no longer noted in shared memory. However, + * there is a window (caused by pgstat delay) on which a worker may choose a + * table that was already vacuumed; this is a bug in the current design. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/postmaster/autovacuum.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <sys/time.h> +#include <unistd.h> + +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/multixact.h" +#include "access/reloptions.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/xact.h" +#include "catalog/dependency.h" +#include "catalog/namespace.h" +#include "catalog/pg_database.h" +#include "commands/dbcommands.h" +#include "commands/vacuum.h" +#include "lib/ilist.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "pgstat.h" +#include "postmaster/autovacuum.h" +#include "postmaster/fork_process.h" +#include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" +#include "storage/bufmgr.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/lmgr.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/sinvaladt.h" +#include "storage/smgr.h" +#include "tcop/tcopprot.h" +#include "utils/fmgroids.h" +#include "utils/fmgrprotos.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" + + +/* + * GUC parameters + */ +bool autovacuum_start_daemon = false; +int autovacuum_max_workers; +int autovacuum_work_mem = -1; +int autovacuum_naptime; +int autovacuum_vac_thresh; +double autovacuum_vac_scale; +int autovacuum_vac_ins_thresh; +double autovacuum_vac_ins_scale; +int autovacuum_anl_thresh; +double autovacuum_anl_scale; +int autovacuum_freeze_max_age; +int autovacuum_multixact_freeze_max_age; + +double autovacuum_vac_cost_delay; +int autovacuum_vac_cost_limit; + +int Log_autovacuum_min_duration = -1; + +/* how long to keep pgstat data in the launcher, in milliseconds */ +#define STATS_READ_DELAY 1000 + +/* the minimum allowed time between two awakenings of the launcher */ +#define MIN_AUTOVAC_SLEEPTIME 100.0 /* milliseconds */ +#define MAX_AUTOVAC_SLEEPTIME 300 /* seconds */ + +/* Flags to tell if we are in an autovacuum process */ +static bool am_autovacuum_launcher = false; +static bool am_autovacuum_worker = false; + +/* Flags set by signal handlers */ +static volatile sig_atomic_t got_SIGUSR2 = false; + +/* Comparison points for determining whether freeze_max_age is exceeded */ +static TransactionId recentXid; +static MultiXactId recentMulti; + +/* Default freeze ages to use for autovacuum (varies by database) */ +static int default_freeze_min_age; +static int 
default_freeze_table_age; +static int default_multixact_freeze_min_age; +static int default_multixact_freeze_table_age; + +/* Memory context for long-lived data */ +static MemoryContext AutovacMemCxt; + +/* struct to keep track of databases in launcher */ +typedef struct avl_dbase +{ + Oid adl_datid; /* hash key -- must be first */ + TimestampTz adl_next_worker; + int adl_score; + dlist_node adl_node; +} avl_dbase; + +/* struct to keep track of databases in worker */ +typedef struct avw_dbase +{ + Oid adw_datid; + char *adw_name; + TransactionId adw_frozenxid; + MultiXactId adw_minmulti; + PgStat_StatDBEntry *adw_entry; +} avw_dbase; + +/* struct to keep track of tables to vacuum and/or analyze, in 1st pass */ +typedef struct av_relation +{ + Oid ar_toastrelid; /* hash key - must be first */ + Oid ar_relid; + bool ar_hasrelopts; + AutoVacOpts ar_reloptions; /* copy of AutoVacOpts from the main table's + * reloptions, or NULL if none */ +} av_relation; + +/* struct to keep track of tables to vacuum and/or analyze, after rechecking */ +typedef struct autovac_table +{ + Oid at_relid; + VacuumParams at_params; + double at_vacuum_cost_delay; + int at_vacuum_cost_limit; + bool at_dobalance; + bool at_sharedrel; + char *at_relname; + char *at_nspname; + char *at_datname; +} autovac_table; + +/*------------- + * This struct holds information about a single worker's whereabouts. We keep + * an array of these in shared memory, sized according to + * autovacuum_max_workers. + * + * wi_links entry into free list or running list + * wi_dboid OID of the database this worker is supposed to work on + * wi_tableoid OID of the table currently being vacuumed, if any + * wi_sharedrel flag indicating whether table is marked relisshared + * wi_proc pointer to PGPROC of the running worker, NULL if not started + * wi_launchtime Time at which this worker was launched + * wi_cost_* Vacuum cost-based delay parameters current in this worker + * + * All fields are protected by AutovacuumLock, except for wi_tableoid and + * wi_sharedrel which are protected by AutovacuumScheduleLock (note these + * two fields are read-only for everyone except that worker itself). + *------------- + */ +typedef struct WorkerInfoData +{ + dlist_node wi_links; + Oid wi_dboid; + Oid wi_tableoid; + PGPROC *wi_proc; + TimestampTz wi_launchtime; + bool wi_dobalance; + bool wi_sharedrel; + double wi_cost_delay; + int wi_cost_limit; + int wi_cost_limit_base; +} WorkerInfoData; + +typedef struct WorkerInfoData *WorkerInfo; + +/* + * Possible signals received by the launcher from remote processes. These are + * stored atomically in shared memory so that other processes can set them + * without locking. + */ +typedef enum +{ + AutoVacForkFailed, /* failed trying to start a worker */ + AutoVacRebalance, /* rebalance the cost limits */ + AutoVacNumSignals /* must be last */ +} AutoVacuumSignal; + +/* + * Autovacuum workitem array, stored in AutoVacuumShmem->av_workItems. This + * list is mostly protected by AutovacuumLock, except that if an item is + * marked 'active' other processes must not modify the work-identifying + * members. + */ +typedef struct AutoVacuumWorkItem +{ + AutoVacuumWorkItemType avw_type; + bool avw_used; /* below data is valid */ + bool avw_active; /* being processed */ + Oid avw_database; + Oid avw_relation; + BlockNumber avw_blockNumber; +} AutoVacuumWorkItem; + +#define NUM_WORKITEMS 256 + +/*------------- + * The main autovacuum shmem struct. On shared memory we store this main + * struct and the array of WorkerInfo structs. 
This struct keeps: + * + * av_signal set by other processes to indicate various conditions + * av_launcherpid the PID of the autovacuum launcher + * av_freeWorkers the WorkerInfo freelist + * av_runningWorkers the WorkerInfo non-free queue + * av_startingWorker pointer to WorkerInfo currently being started (cleared by + * the worker itself as soon as it's up and running) + * av_workItems work item array + * + * This struct is protected by AutovacuumLock, except for av_signal and parts + * of the worker list (see above). + *------------- + */ +typedef struct +{ + sig_atomic_t av_signal[AutoVacNumSignals]; + pid_t av_launcherpid; + dlist_head av_freeWorkers; + dlist_head av_runningWorkers; + WorkerInfo av_startingWorker; + AutoVacuumWorkItem av_workItems[NUM_WORKITEMS]; +} AutoVacuumShmemStruct; + +static AutoVacuumShmemStruct *AutoVacuumShmem; + +/* + * the database list (of avl_dbase elements) in the launcher, and the context + * that contains it + */ +static dlist_head DatabaseList = DLIST_STATIC_INIT(DatabaseList); +static MemoryContext DatabaseListCxt = NULL; + +/* Pointer to my own WorkerInfo, valid on each worker */ +static WorkerInfo MyWorkerInfo = NULL; + +/* PID of launcher, valid only in worker while shutting down */ +int AutovacuumLauncherPid = 0; + +#ifdef EXEC_BACKEND +static pid_t avlauncher_forkexec(void); +static pid_t avworker_forkexec(void); +#endif +NON_EXEC_STATIC void AutoVacWorkerMain(int argc, char *argv[]) pg_attribute_noreturn(); +NON_EXEC_STATIC void AutoVacLauncherMain(int argc, char *argv[]) pg_attribute_noreturn(); + +static Oid do_start_worker(void); +static void HandleAutoVacLauncherInterrupts(void); +static void AutoVacLauncherShutdown(void) pg_attribute_noreturn(); +static void launcher_determine_sleep(bool canlaunch, bool recursing, + struct timeval *nap); +static void launch_worker(TimestampTz now); +static List *get_database_list(void); +static void rebuild_database_list(Oid newdb); +static int db_comparator(const void *a, const void *b); +static void autovac_balance_cost(void); + +static void do_autovacuum(void); +static void FreeWorkerInfo(int code, Datum arg); + +static autovac_table *table_recheck_autovac(Oid relid, HTAB *table_toast_map, + TupleDesc pg_class_desc, + int effective_multixact_freeze_max_age); +static void recheck_relation_needs_vacanalyze(Oid relid, AutoVacOpts *avopts, + Form_pg_class classForm, + int effective_multixact_freeze_max_age, + bool *dovacuum, bool *doanalyze, bool *wraparound); +static void relation_needs_vacanalyze(Oid relid, AutoVacOpts *relopts, + Form_pg_class classForm, + PgStat_StatTabEntry *tabentry, + int effective_multixact_freeze_max_age, + bool *dovacuum, bool *doanalyze, bool *wraparound); + +static void autovacuum_do_vac_analyze(autovac_table *tab, + BufferAccessStrategy bstrategy); +static AutoVacOpts *extract_autovac_opts(HeapTuple tup, + TupleDesc pg_class_desc); +static PgStat_StatTabEntry *get_pgstat_tabentry_relid(Oid relid, bool isshared, + PgStat_StatDBEntry *shared, + PgStat_StatDBEntry *dbentry); +static void perform_work_item(AutoVacuumWorkItem *workitem); +static void autovac_report_activity(autovac_table *tab); +static void autovac_report_workitem(AutoVacuumWorkItem *workitem, + const char *nspname, const char *relname); +static void avl_sigusr2_handler(SIGNAL_ARGS); +static void autovac_refresh_stats(void); + + + +/******************************************************************** + * AUTOVACUUM LAUNCHER CODE + ********************************************************************/ + +#ifdef 
EXEC_BACKEND +/* + * forkexec routine for the autovacuum launcher process. + * + * Format up the arglist, then fork and exec. + */ +static pid_t +avlauncher_forkexec(void) +{ + char *av[10]; + int ac = 0; + + av[ac++] = "postgres"; + av[ac++] = "--forkavlauncher"; + av[ac++] = NULL; /* filled in by postmaster_forkexec */ + av[ac] = NULL; + + Assert(ac < lengthof(av)); + + return postmaster_forkexec(ac, av); +} + +/* + * We need this set from the outside, before InitProcess is called + */ +void +AutovacuumLauncherIAm(void) +{ + am_autovacuum_launcher = true; +} +#endif + +/* + * Main entry point for autovacuum launcher process, to be called from the + * postmaster. + */ +int +StartAutoVacLauncher(void) +{ + pid_t AutoVacPID; + +#ifdef EXEC_BACKEND + switch ((AutoVacPID = avlauncher_forkexec())) +#else + switch ((AutoVacPID = fork_process())) +#endif + { + case -1: + ereport(LOG, + (errmsg("could not fork autovacuum launcher process: %m"))); + return 0; + +#ifndef EXEC_BACKEND + case 0: + /* in postmaster child ... */ + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(false); + + AutoVacLauncherMain(0, NULL); + break; +#endif + default: + return (int) AutoVacPID; + } + + /* shouldn't get here */ + return 0; +} + +/* + * Main loop for the autovacuum launcher process. + */ +NON_EXEC_STATIC void +AutoVacLauncherMain(int argc, char *argv[]) +{ + sigjmp_buf local_sigjmp_buf; + + am_autovacuum_launcher = true; + + MyBackendType = B_AUTOVAC_LAUNCHER; + init_ps_display(NULL); + + ereport(DEBUG1, + (errmsg_internal("autovacuum launcher started"))); + + if (PostAuthDelay) + pg_usleep(PostAuthDelay * 1000000L); + + SetProcessingMode(InitProcessing); + + /* + * Set up signal handlers. We operate on databases much like a regular + * backend, so we use the same signal handling. See equivalent code in + * tcop/postgres.c. + */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGINT, StatementCancelHandler); + pqsignal(SIGTERM, SignalHandlerForShutdownRequest); + /* SIGQUIT handler was already set up by InitPostmasterChild */ + + InitializeTimeouts(); /* establishes SIGALRM handler */ + + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, avl_sigusr2_handler); + pqsignal(SIGFPE, FloatExceptionHandler); + pqsignal(SIGCHLD, SIG_DFL); + + /* Early initialization */ + BaseInit(); + + /* + * Create a per-backend PGPROC struct in shared memory, except in the + * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do + * this before we can use LWLocks (and in the EXEC_BACKEND case we already + * had to do some stuff with LWLocks). + */ +#ifndef EXEC_BACKEND + InitProcess(); +#endif + + InitPostgres(NULL, InvalidOid, NULL, InvalidOid, NULL, false); + + SetProcessingMode(NormalProcessing); + + /* + * Create a memory context that we will do all our work in. We do this so + * that we can reset the context during error recovery and thereby avoid + * possible memory leaks. + */ + AutovacMemCxt = AllocSetContextCreate(TopMemoryContext, + "Autovacuum Launcher", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(AutovacMemCxt); + + /* + * If an exception is encountered, processing resumes here. + * + * This code is a stripped down version of PostgresMain error recovery. + * + * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask + * (to wit, BlockSig) will be restored when longjmp'ing to here. Thus, + * signals other than SIGQUIT will be blocked until we complete error + * recovery. 
It might seem that this policy makes the HOLD_INTERRUPTS() + * call redundant, but it is not since InterruptPending might be set + * already. + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevents interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* Forget any pending QueryCancel or timeout request */ + disable_all_timeouts(false); + QueryCancelPending = false; /* second to avoid race condition */ + + /* Report the error to the server log */ + EmitErrorReport(); + + /* Abort the current transaction in order to recover */ + AbortCurrentTransaction(); + + /* + * Release any other resources, for the case where we were not in a + * transaction. + */ + LWLockReleaseAll(); + pgstat_report_wait_end(); + AbortBufferIO(); + UnlockBuffers(); + /* this is probably dead code, but let's be safe: */ + if (AuxProcessResourceOwner) + ReleaseAuxProcessResources(false); + AtEOXact_Buffers(false); + AtEOXact_SMgr(); + AtEOXact_Files(false); + AtEOXact_HashTables(false); + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. + */ + MemoryContextSwitchTo(AutovacMemCxt); + FlushErrorState(); + + /* Flush any leaked data in the top-level context */ + MemoryContextResetAndDeleteChildren(AutovacMemCxt); + + /* don't leave dangling pointers to freed memory */ + DatabaseListCxt = NULL; + dlist_init(&DatabaseList); + + /* + * Make sure pgstat also considers our stat data as gone. Note: we + * mustn't use autovac_refresh_stats here. + */ + pgstat_clear_snapshot(); + + /* Now we can allow interrupts again */ + RESUME_INTERRUPTS(); + + /* if in shutdown mode, no need for anything further; just go away */ + if (ShutdownRequestPending) + AutoVacLauncherShutdown(); + + /* + * Sleep at least 1 second after any error. We don't want to be + * filling the error logs as fast as we can. + */ + pg_usleep(1000000L); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + /* must unblock signals before calling rebuild_database_list */ + PG_SETMASK(&UnBlockSig); + + /* + * Set always-secure search path. Launcher doesn't connect to a database, + * so this has no effect. + */ + SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Force zero_damaged_pages OFF in the autovac process, even if it is set + * in postgresql.conf. We don't really want such a dangerous option being + * applied non-interactively. + */ + SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Force settable timeouts off to avoid letting these settings prevent + * regular maintenance from being executed. + */ + SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE); + SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE); + SetConfigOption("idle_in_transaction_session_timeout", "0", + PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Force default_transaction_isolation to READ COMMITTED. We don't want + * to pay the overhead of serializable mode, nor add any risk of causing + * deadlocks or delaying other transactions. + */ + SetConfigOption("default_transaction_isolation", "read committed", + PGC_SUSET, PGC_S_OVERRIDE); + + /* + * In emergency mode, just start a worker (unless shutdown was requested) + * and go away. 
+ */ + if (!AutoVacuumingActive()) + { + if (!ShutdownRequestPending) + do_start_worker(); + proc_exit(0); /* done */ + } + + AutoVacuumShmem->av_launcherpid = MyProcPid; + + /* + * Create the initial database list. The invariant we want this list to + * keep is that it's ordered by decreasing next_time. As soon as an entry + * is updated to a higher time, it will be moved to the front (which is + * correct because the only operation is to add autovacuum_naptime to the + * entry, and time always increases). + */ + rebuild_database_list(InvalidOid); + + /* loop until shutdown request */ + while (!ShutdownRequestPending) + { + struct timeval nap; + TimestampTz current_time = 0; + bool can_launch; + + /* + * This loop is a bit different from the normal use of WaitLatch, + * because we'd like to sleep before the first launch of a child + * process. So it's WaitLatch, then ResetLatch, then check for + * wakening conditions. + */ + + launcher_determine_sleep(!dlist_is_empty(&AutoVacuumShmem->av_freeWorkers), + false, &nap); + + /* + * Wait until naptime expires or we get some type of signal (all the + * signal handlers will wake us by calling SetLatch). + */ + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L), + WAIT_EVENT_AUTOVACUUM_MAIN); + + ResetLatch(MyLatch); + + HandleAutoVacLauncherInterrupts(); + + /* + * a worker finished, or postmaster signaled failure to start a worker + */ + if (got_SIGUSR2) + { + got_SIGUSR2 = false; + + /* rebalance cost limits, if needed */ + if (AutoVacuumShmem->av_signal[AutoVacRebalance]) + { + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + AutoVacuumShmem->av_signal[AutoVacRebalance] = false; + autovac_balance_cost(); + LWLockRelease(AutovacuumLock); + } + + if (AutoVacuumShmem->av_signal[AutoVacForkFailed]) + { + /* + * If the postmaster failed to start a new worker, we sleep + * for a little while and resend the signal. The new worker's + * state is still in memory, so this is sufficient. After + * that, we restart the main loop. + * + * XXX should we put a limit to the number of times we retry? + * I don't think it makes much sense, because a future start + * of a worker will continue to fail in the same way. + */ + AutoVacuumShmem->av_signal[AutoVacForkFailed] = false; + pg_usleep(1000000L); /* 1s */ + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER); + continue; + } + } + + /* + * There are some conditions that we need to check before trying to + * start a worker. First, we need to make sure that there is a worker + * slot available. Second, we need to make sure that no other worker + * failed while starting up. + */ + + current_time = GetCurrentTimestamp(); + LWLockAcquire(AutovacuumLock, LW_SHARED); + + can_launch = !dlist_is_empty(&AutoVacuumShmem->av_freeWorkers); + + if (AutoVacuumShmem->av_startingWorker != NULL) + { + int waittime; + WorkerInfo worker = AutoVacuumShmem->av_startingWorker; + + /* + * We can't launch another worker when another one is still + * starting up (or failed while doing so), so just sleep for a bit + * more; that worker will wake us up again as soon as it's ready. + * We will only wait autovacuum_naptime seconds (up to a maximum + * of 60 seconds) for this to happen however. Note that failure + * to connect to a particular database is not a problem here, + * because the worker removes itself from the startingWorker + * pointer before trying to connect. 
Problems detected by the + * postmaster (like fork() failure) are also reported and handled + * differently. The only problems that may cause this code to + * fire are errors in the earlier sections of AutoVacWorkerMain, + * before the worker removes the WorkerInfo from the + * startingWorker pointer. + */ + waittime = Min(autovacuum_naptime, 60) * 1000; + if (TimestampDifferenceExceeds(worker->wi_launchtime, current_time, + waittime)) + { + LWLockRelease(AutovacuumLock); + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + + /* + * No other process can put a worker in starting mode, so if + * startingWorker is still INVALID after exchanging our lock, + * we assume it's the same one we saw above (so we don't + * recheck the launch time). + */ + if (AutoVacuumShmem->av_startingWorker != NULL) + { + worker = AutoVacuumShmem->av_startingWorker; + worker->wi_dboid = InvalidOid; + worker->wi_tableoid = InvalidOid; + worker->wi_sharedrel = false; + worker->wi_proc = NULL; + worker->wi_launchtime = 0; + dlist_push_head(&AutoVacuumShmem->av_freeWorkers, + &worker->wi_links); + AutoVacuumShmem->av_startingWorker = NULL; + elog(WARNING, "worker took too long to start; canceled"); + } + } + else + can_launch = false; + } + LWLockRelease(AutovacuumLock); /* either shared or exclusive */ + + /* if we can't do anything, just go back to sleep */ + if (!can_launch) + continue; + + /* We're OK to start a new worker */ + + if (dlist_is_empty(&DatabaseList)) + { + /* + * Special case when the list is empty: start a worker right away. + * This covers the initial case, when no database is in pgstats + * (thus the list is empty). Note that the constraints in + * launcher_determine_sleep keep us from starting workers too + * quickly (at most once every autovacuum_naptime when the list is + * empty). + */ + launch_worker(current_time); + } + else + { + /* + * because rebuild_database_list constructs a list with most + * distant adl_next_worker first, we obtain our database from the + * tail of the list. + */ + avl_dbase *avdb; + + avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList); + + /* + * launch a worker if next_worker is right now or it is in the + * past + */ + if (TimestampDifferenceExceeds(avdb->adl_next_worker, + current_time, 0)) + launch_worker(current_time); + } + } + + AutoVacLauncherShutdown(); +} + +/* + * Process any new interrupts. + */ +static void +HandleAutoVacLauncherInterrupts(void) +{ + /* the normal shutdown case */ + if (ShutdownRequestPending) + AutoVacLauncherShutdown(); + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + + /* shutdown requested in config file? */ + if (!AutoVacuumingActive()) + AutoVacLauncherShutdown(); + + /* rebalance in case the default cost parameters changed */ + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + autovac_balance_cost(); + LWLockRelease(AutovacuumLock); + + /* rebuild the list in case the naptime changed */ + rebuild_database_list(InvalidOid); + } + + /* Process barrier events */ + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); + + /* Perform logging of memory contexts of this process */ + if (LogMemoryContextPending) + ProcessLogMemoryContextInterrupt(); + + /* Process sinval catchup interrupts that happened while sleeping */ + ProcessCatchupInterrupt(); +} + +/* + * Perform a normal exit from the autovac launcher. 
+ */ +static void +AutoVacLauncherShutdown(void) +{ + ereport(DEBUG1, + (errmsg_internal("autovacuum launcher shutting down"))); + AutoVacuumShmem->av_launcherpid = 0; + + proc_exit(0); /* done */ +} + +/* + * Determine the time to sleep, based on the database list. + * + * The "canlaunch" parameter indicates whether we can start a worker right now, + * for example due to the workers being all busy. If this is false, we will + * cause a long sleep, which will be interrupted when a worker exits. + */ +static void +launcher_determine_sleep(bool canlaunch, bool recursing, struct timeval *nap) +{ + /* + * We sleep until the next scheduled vacuum. We trust that when the + * database list was built, care was taken so that no entries have times + * in the past; if the first entry has too close a next_worker value, or a + * time in the past, we will sleep a small nominal time. + */ + if (!canlaunch) + { + nap->tv_sec = autovacuum_naptime; + nap->tv_usec = 0; + } + else if (!dlist_is_empty(&DatabaseList)) + { + TimestampTz current_time = GetCurrentTimestamp(); + TimestampTz next_wakeup; + avl_dbase *avdb; + long secs; + int usecs; + + avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList); + + next_wakeup = avdb->adl_next_worker; + TimestampDifference(current_time, next_wakeup, &secs, &usecs); + + nap->tv_sec = secs; + nap->tv_usec = usecs; + } + else + { + /* list is empty, sleep for whole autovacuum_naptime seconds */ + nap->tv_sec = autovacuum_naptime; + nap->tv_usec = 0; + } + + /* + * If the result is exactly zero, it means a database had an entry with + * time in the past. Rebuild the list so that the databases are evenly + * distributed again, and recalculate the time to sleep. This can happen + * if there are more tables needing vacuum than workers, and they all take + * longer to vacuum than autovacuum_naptime. + * + * We only recurse once. rebuild_database_list should always return times + * in the future, but it seems best not to trust too much on that. + */ + if (nap->tv_sec == 0 && nap->tv_usec == 0 && !recursing) + { + rebuild_database_list(InvalidOid); + launcher_determine_sleep(canlaunch, true, nap); + return; + } + + /* The smallest time we'll allow the launcher to sleep. */ + if (nap->tv_sec <= 0 && nap->tv_usec <= MIN_AUTOVAC_SLEEPTIME * 1000) + { + nap->tv_sec = 0; + nap->tv_usec = MIN_AUTOVAC_SLEEPTIME * 1000; + } + + /* + * If the sleep time is too large, clamp it to an arbitrary maximum (plus + * any fractional seconds, for simplicity). This avoids an essentially + * infinite sleep in strange cases like the system clock going backwards a + * few years. + */ + if (nap->tv_sec > MAX_AUTOVAC_SLEEPTIME) + nap->tv_sec = MAX_AUTOVAC_SLEEPTIME; +} + +/* + * Build an updated DatabaseList. It must only contain databases that appear + * in pgstats, and must be sorted by next_worker from highest to lowest, + * distributed regularly across the next autovacuum_naptime interval. + * + * Receives the Oid of the database that made this list be generated (we call + * this the "new" database, because when the database was already present on + * the list, we expect that this function is not called at all). The + * preexisting list, if any, will be used to preserve the order of the + * databases in the autovacuum_naptime period. The new database is put at the + * end of the interval. The actual values are not saved, which should not be + * much of a problem. 
+ */ +static void +rebuild_database_list(Oid newdb) +{ + List *dblist; + ListCell *cell; + MemoryContext newcxt; + MemoryContext oldcxt; + MemoryContext tmpcxt; + HASHCTL hctl; + int score; + int nelems; + HTAB *dbhash; + dlist_iter iter; + + /* use fresh stats */ + autovac_refresh_stats(); + + newcxt = AllocSetContextCreate(AutovacMemCxt, + "AV dblist", + ALLOCSET_DEFAULT_SIZES); + tmpcxt = AllocSetContextCreate(newcxt, + "tmp AV dblist", + ALLOCSET_DEFAULT_SIZES); + oldcxt = MemoryContextSwitchTo(tmpcxt); + + /* + * Implementing this is not as simple as it sounds, because we need to put + * the new database at the end of the list; next the databases that were + * already on the list, and finally (at the tail of the list) all the + * other databases that are not on the existing list. + * + * To do this, we build an empty hash table of scored databases. We will + * start with the lowest score (zero) for the new database, then + * increasing scores for the databases in the existing list, in order, and + * lastly increasing scores for all databases gotten via + * get_database_list() that are not already on the hash. + * + * Then we will put all the hash elements into an array, sort the array by + * score, and finally put the array elements into the new doubly linked + * list. + */ + hctl.keysize = sizeof(Oid); + hctl.entrysize = sizeof(avl_dbase); + hctl.hcxt = tmpcxt; + dbhash = hash_create("db hash", 20, &hctl, /* magic number here FIXME */ + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* start by inserting the new database */ + score = 0; + if (OidIsValid(newdb)) + { + avl_dbase *db; + PgStat_StatDBEntry *entry; + + /* only consider this database if it has a pgstat entry */ + entry = pgstat_fetch_stat_dbentry(newdb); + if (entry != NULL) + { + /* we assume it isn't found because the hash was just created */ + db = hash_search(dbhash, &newdb, HASH_ENTER, NULL); + + /* hash_search already filled in the key */ + db->adl_score = score++; + /* next_worker is filled in later */ + } + } + + /* Now insert the databases from the existing list */ + dlist_foreach(iter, &DatabaseList) + { + avl_dbase *avdb = dlist_container(avl_dbase, adl_node, iter.cur); + avl_dbase *db; + bool found; + PgStat_StatDBEntry *entry; + + /* + * skip databases with no stat entries -- in particular, this gets rid + * of dropped databases + */ + entry = pgstat_fetch_stat_dbentry(avdb->adl_datid); + if (entry == NULL) + continue; + + db = hash_search(dbhash, &(avdb->adl_datid), HASH_ENTER, &found); + + if (!found) + { + /* hash_search already filled in the key */ + db->adl_score = score++; + /* next_worker is filled in later */ + } + } + + /* finally, insert all qualifying databases not previously inserted */ + dblist = get_database_list(); + foreach(cell, dblist) + { + avw_dbase *avdb = lfirst(cell); + avl_dbase *db; + bool found; + PgStat_StatDBEntry *entry; + + /* only consider databases with a pgstat entry */ + entry = pgstat_fetch_stat_dbentry(avdb->adw_datid); + if (entry == NULL) + continue; + + db = hash_search(dbhash, &(avdb->adw_datid), HASH_ENTER, &found); + /* only update the score if the database was not already on the hash */ + if (!found) + { + /* hash_search already filled in the key */ + db->adl_score = score++; + /* next_worker is filled in later */ + } + } + nelems = score; + + /* from here on, the allocated memory belongs to the new list */ + MemoryContextSwitchTo(newcxt); + dlist_init(&DatabaseList); + + if (nelems > 0) + { + TimestampTz current_time; + int millis_increment; + avl_dbase *dbary; + 
avl_dbase *db; + HASH_SEQ_STATUS seq; + int i; + + /* put all the hash elements into an array */ + dbary = palloc(nelems * sizeof(avl_dbase)); + + i = 0; + hash_seq_init(&seq, dbhash); + while ((db = hash_seq_search(&seq)) != NULL) + memcpy(&(dbary[i++]), db, sizeof(avl_dbase)); + + /* sort the array */ + qsort(dbary, nelems, sizeof(avl_dbase), db_comparator); + + /* + * Determine the time interval between databases in the schedule. If + * we see that the configured naptime would take us to sleep times + * lower than our min sleep time (which launcher_determine_sleep is + * coded not to allow), silently use a larger naptime (but don't touch + * the GUC variable). + */ + millis_increment = 1000.0 * autovacuum_naptime / nelems; + if (millis_increment <= MIN_AUTOVAC_SLEEPTIME) + millis_increment = MIN_AUTOVAC_SLEEPTIME * 1.1; + + current_time = GetCurrentTimestamp(); + + /* + * move the elements from the array into the dlist, setting the + * next_worker while walking the array + */ + for (i = 0; i < nelems; i++) + { + avl_dbase *db = &(dbary[i]); + + current_time = TimestampTzPlusMilliseconds(current_time, + millis_increment); + db->adl_next_worker = current_time; + + /* later elements should go closer to the head of the list */ + dlist_push_head(&DatabaseList, &db->adl_node); + } + } + + /* all done, clean up memory */ + if (DatabaseListCxt != NULL) + MemoryContextDelete(DatabaseListCxt); + MemoryContextDelete(tmpcxt); + DatabaseListCxt = newcxt; + MemoryContextSwitchTo(oldcxt); +} + +/* qsort comparator for avl_dbase, using adl_score */ +static int +db_comparator(const void *a, const void *b) +{ + if (((const avl_dbase *) a)->adl_score == ((const avl_dbase *) b)->adl_score) + return 0; + else + return (((const avl_dbase *) a)->adl_score < ((const avl_dbase *) b)->adl_score) ? 1 : -1; +} + +/* + * do_start_worker + * + * Bare-bones procedure for starting an autovacuum worker from the launcher. + * It determines what database to work on, sets up shared memory stuff and + * signals postmaster to start the worker. It fails gracefully if invoked when + * autovacuum_workers are already active. + * + * Return value is the OID of the database that the worker is going to process, + * or InvalidOid if no worker was actually started. + */ +static Oid +do_start_worker(void) +{ + List *dblist; + ListCell *cell; + TransactionId xidForceLimit; + MultiXactId multiForceLimit; + bool for_xid_wrap; + bool for_multi_wrap; + avw_dbase *avdb; + TimestampTz current_time; + bool skipit = false; + Oid retval = InvalidOid; + MemoryContext tmpcxt, + oldcxt; + + /* return quickly when there are no free workers */ + LWLockAcquire(AutovacuumLock, LW_SHARED); + if (dlist_is_empty(&AutoVacuumShmem->av_freeWorkers)) + { + LWLockRelease(AutovacuumLock); + return InvalidOid; + } + LWLockRelease(AutovacuumLock); + + /* + * Create and switch to a temporary context to avoid leaking the memory + * allocated for the database list. + */ + tmpcxt = AllocSetContextCreate(CurrentMemoryContext, + "Start worker tmp cxt", + ALLOCSET_DEFAULT_SIZES); + oldcxt = MemoryContextSwitchTo(tmpcxt); + + /* use fresh stats */ + autovac_refresh_stats(); + + /* Get a list of databases */ + dblist = get_database_list(); + + /* + * Determine the oldest datfrozenxid/relfrozenxid that we will allow to + * pass without forcing a vacuum. (This limit can be tightened for + * particular tables, but not loosened.) 
+ */ + recentXid = ReadNextTransactionId(); + xidForceLimit = recentXid - autovacuum_freeze_max_age; + /* ensure it's a "normal" XID, else TransactionIdPrecedes misbehaves */ + /* this can cause the limit to go backwards by 3, but that's OK */ + if (xidForceLimit < FirstNormalTransactionId) + xidForceLimit -= FirstNormalTransactionId; + + /* Also determine the oldest datminmxid we will consider. */ + recentMulti = ReadNextMultiXactId(); + multiForceLimit = recentMulti - MultiXactMemberFreezeThreshold(); + if (multiForceLimit < FirstMultiXactId) + multiForceLimit -= FirstMultiXactId; + + /* + * Choose a database to connect to. We pick the database that was least + * recently auto-vacuumed, or one that needs vacuuming to prevent Xid + * wraparound-related data loss. If any db at risk of Xid wraparound is + * found, we pick the one with oldest datfrozenxid, independently of + * autovacuum times; similarly we pick the one with the oldest datminmxid + * if any is in MultiXactId wraparound. Note that those in Xid wraparound + * danger are given more priority than those in multi wraparound danger. + * + * Note that a database with no stats entry is not considered, except for + * Xid wraparound purposes. The theory is that if no one has ever + * connected to it since the stats were last initialized, it doesn't need + * vacuuming. + * + * XXX This could be improved if we had more info about whether it needs + * vacuuming before connecting to it. Perhaps look through the pgstats + * data for the database's tables? One idea is to keep track of the + * number of new and dead tuples per database in pgstats. However it + * isn't clear how to construct a metric that measures that and not cause + * starvation for less busy databases. + */ + avdb = NULL; + for_xid_wrap = false; + for_multi_wrap = false; + current_time = GetCurrentTimestamp(); + foreach(cell, dblist) + { + avw_dbase *tmp = lfirst(cell); + dlist_iter iter; + + /* Check to see if this one is at risk of wraparound */ + if (TransactionIdPrecedes(tmp->adw_frozenxid, xidForceLimit)) + { + if (avdb == NULL || + TransactionIdPrecedes(tmp->adw_frozenxid, + avdb->adw_frozenxid)) + avdb = tmp; + for_xid_wrap = true; + continue; + } + else if (for_xid_wrap) + continue; /* ignore not-at-risk DBs */ + else if (MultiXactIdPrecedes(tmp->adw_minmulti, multiForceLimit)) + { + if (avdb == NULL || + MultiXactIdPrecedes(tmp->adw_minmulti, avdb->adw_minmulti)) + avdb = tmp; + for_multi_wrap = true; + continue; + } + else if (for_multi_wrap) + continue; /* ignore not-at-risk DBs */ + + /* Find pgstat entry if any */ + tmp->adw_entry = pgstat_fetch_stat_dbentry(tmp->adw_datid); + + /* + * Skip a database with no pgstat entry; it means it hasn't seen any + * activity. + */ + if (!tmp->adw_entry) + continue; + + /* + * Also, skip a database that appears on the database list as having + * been processed recently (less than autovacuum_naptime seconds ago). + * We do this so that we don't select a database which we just + * selected, but that pgstat hasn't gotten around to updating the last + * autovacuum time yet. + */ + skipit = false; + + dlist_reverse_foreach(iter, &DatabaseList) + { + avl_dbase *dbp = dlist_container(avl_dbase, adl_node, iter.cur); + + if (dbp->adl_datid == tmp->adw_datid) + { + /* + * Skip this database if its next_worker value falls between + * the current time and the current time plus naptime. 
+ */ + if (!TimestampDifferenceExceeds(dbp->adl_next_worker, + current_time, 0) && + !TimestampDifferenceExceeds(current_time, + dbp->adl_next_worker, + autovacuum_naptime * 1000)) + skipit = true; + + break; + } + } + if (skipit) + continue; + + /* + * Remember the db with oldest autovac time. (If we are here, both + * tmp->entry and db->entry must be non-null.) + */ + if (avdb == NULL || + tmp->adw_entry->last_autovac_time < avdb->adw_entry->last_autovac_time) + avdb = tmp; + } + + /* Found a database -- process it */ + if (avdb != NULL) + { + WorkerInfo worker; + dlist_node *wptr; + + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + + /* + * Get a worker entry from the freelist. We checked above, so there + * really should be a free slot. + */ + wptr = dlist_pop_head_node(&AutoVacuumShmem->av_freeWorkers); + + worker = dlist_container(WorkerInfoData, wi_links, wptr); + worker->wi_dboid = avdb->adw_datid; + worker->wi_proc = NULL; + worker->wi_launchtime = GetCurrentTimestamp(); + + AutoVacuumShmem->av_startingWorker = worker; + + LWLockRelease(AutovacuumLock); + + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER); + + retval = avdb->adw_datid; + } + else if (skipit) + { + /* + * If we skipped all databases on the list, rebuild it, because it + * probably contains a dropped database. + */ + rebuild_database_list(InvalidOid); + } + + MemoryContextSwitchTo(oldcxt); + MemoryContextDelete(tmpcxt); + + return retval; +} + +/* + * launch_worker + * + * Wrapper for starting a worker from the launcher. Besides actually starting + * it, update the database list to reflect the next time that another one will + * need to be started on the selected database. The actual database choice is + * left to do_start_worker. + * + * This routine is also expected to insert an entry into the database list if + * the selected database was previously absent from the list. + */ +static void +launch_worker(TimestampTz now) +{ + Oid dbid; + dlist_iter iter; + + dbid = do_start_worker(); + if (OidIsValid(dbid)) + { + bool found = false; + + /* + * Walk the database list and update the corresponding entry. If the + * database is not on the list, we'll recreate the list. + */ + dlist_foreach(iter, &DatabaseList) + { + avl_dbase *avdb = dlist_container(avl_dbase, adl_node, iter.cur); + + if (avdb->adl_datid == dbid) + { + found = true; + + /* + * add autovacuum_naptime seconds to the current time, and use + * that as the new "next_worker" field for this database. + */ + avdb->adl_next_worker = + TimestampTzPlusMilliseconds(now, autovacuum_naptime * 1000); + + dlist_move_head(&DatabaseList, iter.cur); + break; + } + } + + /* + * If the database was not present in the database list, we rebuild + * the list. It's possible that the database does not get into the + * list anyway, for example if it's a database that doesn't have a + * pgstat entry, but this is not a problem because we don't want to + * schedule workers regularly into those in any case. + */ + if (!found) + rebuild_database_list(dbid); + } +} + +/* + * Called from postmaster to signal a failure to fork a process to become + * worker. The postmaster should kill(SIGUSR2) the launcher shortly + * after calling this function. 
+ */ +void +AutoVacWorkerFailed(void) +{ + AutoVacuumShmem->av_signal[AutoVacForkFailed] = true; +} + +/* SIGUSR2: a worker is up and running, or just finished, or failed to fork */ +static void +avl_sigusr2_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + got_SIGUSR2 = true; + SetLatch(MyLatch); + + errno = save_errno; +} + + +/******************************************************************** + * AUTOVACUUM WORKER CODE + ********************************************************************/ + +#ifdef EXEC_BACKEND +/* + * forkexec routines for the autovacuum worker. + * + * Format up the arglist, then fork and exec. + */ +static pid_t +avworker_forkexec(void) +{ + char *av[10]; + int ac = 0; + + av[ac++] = "postgres"; + av[ac++] = "--forkavworker"; + av[ac++] = NULL; /* filled in by postmaster_forkexec */ + av[ac] = NULL; + + Assert(ac < lengthof(av)); + + return postmaster_forkexec(ac, av); +} + +/* + * We need this set from the outside, before InitProcess is called + */ +void +AutovacuumWorkerIAm(void) +{ + am_autovacuum_worker = true; +} +#endif + +/* + * Main entry point for autovacuum worker process. + * + * This code is heavily based on pgarch.c, q.v. + */ +int +StartAutoVacWorker(void) +{ + pid_t worker_pid; + +#ifdef EXEC_BACKEND + switch ((worker_pid = avworker_forkexec())) +#else + switch ((worker_pid = fork_process())) +#endif + { + case -1: + ereport(LOG, + (errmsg("could not fork autovacuum worker process: %m"))); + return 0; + +#ifndef EXEC_BACKEND + case 0: + /* in postmaster child ... */ + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(false); + + AutoVacWorkerMain(0, NULL); + break; +#endif + default: + return (int) worker_pid; + } + + /* shouldn't get here */ + return 0; +} + +/* + * AutoVacWorkerMain + */ +NON_EXEC_STATIC void +AutoVacWorkerMain(int argc, char *argv[]) +{ + sigjmp_buf local_sigjmp_buf; + Oid dbid; + + am_autovacuum_worker = true; + + MyBackendType = B_AUTOVAC_WORKER; + init_ps_display(NULL); + + SetProcessingMode(InitProcessing); + + /* + * Set up signal handlers. We operate on databases much like a regular + * backend, so we use the same signal handling. See equivalent code in + * tcop/postgres.c. + */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + + /* + * SIGINT is used to signal canceling the current table's vacuum; SIGTERM + * means abort and exit cleanly, and SIGQUIT means abandon ship. + */ + pqsignal(SIGINT, StatementCancelHandler); + pqsignal(SIGTERM, die); + /* SIGQUIT handler was already set up by InitPostmasterChild */ + + InitializeTimeouts(); /* establishes SIGALRM handler */ + + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, SIG_IGN); + pqsignal(SIGFPE, FloatExceptionHandler); + pqsignal(SIGCHLD, SIG_DFL); + + /* Early initialization */ + BaseInit(); + + /* + * Create a per-backend PGPROC struct in shared memory, except in the + * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do + * this before we can use LWLocks (and in the EXEC_BACKEND case we already + * had to do some stuff with LWLocks). + */ +#ifndef EXEC_BACKEND + InitProcess(); +#endif + + /* + * If an exception is encountered, processing resumes here. + * + * Unlike most auxiliary processes, we don't attempt to continue + * processing after an error; we just clean up and exit. The autovac + * launcher is responsible for spawning another worker later. 
+ * + * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask + * (to wit, BlockSig) will be restored when longjmp'ing to here. Thus, + * signals other than SIGQUIT will be blocked until we exit. It might + * seem that this policy makes the HOLD_INTERRUPTS() call redundant, but + * it is not since InterruptPending might be set already. + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevents interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* Report the error to the server log */ + EmitErrorReport(); + + /* + * We can now go away. Note that because we called InitProcess, a + * callback was registered to do ProcKill, which will clean up + * necessary state. + */ + proc_exit(0); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + PG_SETMASK(&UnBlockSig); + + /* + * Set always-secure search path, so malicious users can't redirect user + * code (e.g. pg_index.indexprs). (That code runs in a + * SECURITY_RESTRICTED_OPERATION sandbox, so malicious users could not + * take control of the entire autovacuum worker in any case.) + */ + SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Force zero_damaged_pages OFF in the autovac process, even if it is set + * in postgresql.conf. We don't really want such a dangerous option being + * applied non-interactively. + */ + SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Force settable timeouts off to avoid letting these settings prevent + * regular maintenance from being executed. + */ + SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE); + SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE); + SetConfigOption("idle_in_transaction_session_timeout", "0", + PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Force default_transaction_isolation to READ COMMITTED. We don't want + * to pay the overhead of serializable mode, nor add any risk of causing + * deadlocks or delaying other transactions. + */ + SetConfigOption("default_transaction_isolation", "read committed", + PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Force synchronous replication off to allow regular maintenance even if + * we are waiting for standbys to connect. This is important to ensure we + * aren't blocked from performing anti-wraparound tasks. + */ + if (synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH) + SetConfigOption("synchronous_commit", "local", + PGC_SUSET, PGC_S_OVERRIDE); + + /* + * Get the info about the database we're going to work on. + */ + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + + /* + * beware of startingWorker being INVALID; this should normally not + * happen, but if a worker fails after forking and before this, the + * launcher might have decided to remove it from the queue and start + * again. 
+ */ + if (AutoVacuumShmem->av_startingWorker != NULL) + { + MyWorkerInfo = AutoVacuumShmem->av_startingWorker; + dbid = MyWorkerInfo->wi_dboid; + MyWorkerInfo->wi_proc = MyProc; + + /* insert into the running list */ + dlist_push_head(&AutoVacuumShmem->av_runningWorkers, + &MyWorkerInfo->wi_links); + + /* + * remove from the "starting" pointer, so that the launcher can start + * a new worker if required + */ + AutoVacuumShmem->av_startingWorker = NULL; + LWLockRelease(AutovacuumLock); + + on_shmem_exit(FreeWorkerInfo, 0); + + /* wake up the launcher */ + if (AutoVacuumShmem->av_launcherpid != 0) + kill(AutoVacuumShmem->av_launcherpid, SIGUSR2); + } + else + { + /* no worker entry for me, go away */ + elog(WARNING, "autovacuum worker started without a worker entry"); + dbid = InvalidOid; + LWLockRelease(AutovacuumLock); + } + + if (OidIsValid(dbid)) + { + char dbname[NAMEDATALEN]; + + /* + * Report autovac startup to the stats collector. We deliberately do + * this before InitPostgres, so that the last_autovac_time will get + * updated even if the connection attempt fails. This is to prevent + * autovac from getting "stuck" repeatedly selecting an unopenable + * database, rather than making any progress on stuff it can connect + * to. + */ + pgstat_report_autovac(dbid); + + /* + * Connect to the selected database + * + * Note: if we have selected a just-deleted database (due to using + * stale stats info), we'll fail and exit here. + */ + InitPostgres(NULL, dbid, NULL, InvalidOid, dbname, false); + SetProcessingMode(NormalProcessing); + set_ps_display(dbname); + ereport(DEBUG1, + (errmsg_internal("autovacuum: processing database \"%s\"", dbname))); + + if (PostAuthDelay) + pg_usleep(PostAuthDelay * 1000000L); + + /* And do an appropriate amount of work */ + recentXid = ReadNextTransactionId(); + recentMulti = ReadNextMultiXactId(); + do_autovacuum(); + } + + /* + * The launcher will be notified of my death in ProcKill, *if* we managed + * to get a worker slot at all + */ + + /* All done, go away */ + proc_exit(0); +} + +/* + * Return a WorkerInfo to the free list + */ +static void +FreeWorkerInfo(int code, Datum arg) +{ + if (MyWorkerInfo != NULL) + { + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + + /* + * Wake the launcher up so that he can launch a new worker immediately + * if required. We only save the launcher's PID in local memory here; + * the actual signal will be sent when the PGPROC is recycled. Note + * that we always do this, so that the launcher can rebalance the cost + * limit setting of the remaining workers. + * + * We somewhat ignore the risk that the launcher changes its PID + * between us reading it and the actual kill; we expect ProcKill to be + * called shortly after us, and we assume that PIDs are not reused too + * quickly after a process exits. 
+ */ + AutovacuumLauncherPid = AutoVacuumShmem->av_launcherpid; + + dlist_delete(&MyWorkerInfo->wi_links); + MyWorkerInfo->wi_dboid = InvalidOid; + MyWorkerInfo->wi_tableoid = InvalidOid; + MyWorkerInfo->wi_sharedrel = false; + MyWorkerInfo->wi_proc = NULL; + MyWorkerInfo->wi_launchtime = 0; + MyWorkerInfo->wi_dobalance = false; + MyWorkerInfo->wi_cost_delay = 0; + MyWorkerInfo->wi_cost_limit = 0; + MyWorkerInfo->wi_cost_limit_base = 0; + dlist_push_head(&AutoVacuumShmem->av_freeWorkers, + &MyWorkerInfo->wi_links); + /* not mine anymore */ + MyWorkerInfo = NULL; + + /* + * now that we're inactive, cause a rebalancing of the surviving + * workers + */ + AutoVacuumShmem->av_signal[AutoVacRebalance] = true; + LWLockRelease(AutovacuumLock); + } +} + +/* + * Update the cost-based delay parameters, so that multiple workers consume + * each a fraction of the total available I/O. + */ +void +AutoVacuumUpdateDelay(void) +{ + if (MyWorkerInfo) + { + VacuumCostDelay = MyWorkerInfo->wi_cost_delay; + VacuumCostLimit = MyWorkerInfo->wi_cost_limit; + } +} + +/* + * autovac_balance_cost + * Recalculate the cost limit setting for each active worker. + * + * Caller must hold the AutovacuumLock in exclusive mode. + */ +static void +autovac_balance_cost(void) +{ + /* + * The idea here is that we ration out I/O equally. The amount of I/O + * that a worker can consume is determined by cost_limit/cost_delay, so we + * try to equalize those ratios rather than the raw limit settings. + * + * note: in cost_limit, zero also means use value from elsewhere, because + * zero is not a valid value. + */ + int vac_cost_limit = (autovacuum_vac_cost_limit > 0 ? + autovacuum_vac_cost_limit : VacuumCostLimit); + double vac_cost_delay = (autovacuum_vac_cost_delay >= 0 ? + autovacuum_vac_cost_delay : VacuumCostDelay); + double cost_total; + double cost_avail; + dlist_iter iter; + + /* not set? nothing to do */ + if (vac_cost_limit <= 0 || vac_cost_delay <= 0) + return; + + /* calculate the total base cost limit of participating active workers */ + cost_total = 0.0; + dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers) + { + WorkerInfo worker = dlist_container(WorkerInfoData, wi_links, iter.cur); + + if (worker->wi_proc != NULL && + worker->wi_dobalance && + worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0) + cost_total += + (double) worker->wi_cost_limit_base / worker->wi_cost_delay; + } + + /* there are no cost limits -- nothing to do */ + if (cost_total <= 0) + return; + + /* + * Adjust cost limit of each active worker to balance the total of cost + * limit to autovacuum_vacuum_cost_limit. + */ + cost_avail = (double) vac_cost_limit / vac_cost_delay; + dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers) + { + WorkerInfo worker = dlist_container(WorkerInfoData, wi_links, iter.cur); + + if (worker->wi_proc != NULL && + worker->wi_dobalance && + worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0) + { + int limit = (int) + (cost_avail * worker->wi_cost_limit_base / cost_total); + + /* + * We put a lower bound of 1 on the cost_limit, to avoid division- + * by-zero in the vacuum code. Also, in case of roundoff trouble + * in these calculations, let's be sure we don't ever set + * cost_limit to more than the base value. 
+ */ + worker->wi_cost_limit = Max(Min(limit, + worker->wi_cost_limit_base), + 1); + } + + if (worker->wi_proc != NULL) + elog(DEBUG2, "autovac_balance_cost(pid=%d db=%u, rel=%u, dobalance=%s cost_limit=%d, cost_limit_base=%d, cost_delay=%g)", + worker->wi_proc->pid, worker->wi_dboid, worker->wi_tableoid, + worker->wi_dobalance ? "yes" : "no", + worker->wi_cost_limit, worker->wi_cost_limit_base, + worker->wi_cost_delay); + } +} + +/* + * get_database_list + * Return a list of all databases found in pg_database. + * + * The list and associated data is allocated in the caller's memory context, + * which is in charge of ensuring that it's properly cleaned up afterwards. + * + * Note: this is the only function in which the autovacuum launcher uses a + * transaction. Although we aren't attached to any particular database and + * therefore can't access most catalogs, we do have enough infrastructure + * to do a seqscan on pg_database. + */ +static List * +get_database_list(void) +{ + List *dblist = NIL; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + MemoryContext resultcxt; + + /* This is the context that we will allocate our output data in */ + resultcxt = CurrentMemoryContext; + + /* + * Start a transaction so we can access pg_database, and get a snapshot. + * We don't have a use for the snapshot itself, but we're interested in + * the secondary effect that it sets RecentGlobalXmin. (This is critical + * for anything that reads heap pages, because HOT may decide to prune + * them even if the process doesn't attempt to modify any tuples.) + * + * FIXME: This comment is inaccurate / the code buggy. A snapshot that is + * not pushed/active does not reliably prevent HOT pruning (->xmin could + * e.g. be cleared when cache invalidations are processed). + */ + StartTransactionCommand(); + (void) GetTransactionSnapshot(); + + rel = table_open(DatabaseRelationId, AccessShareLock); + scan = table_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_database pgdatabase = (Form_pg_database) GETSTRUCT(tup); + avw_dbase *avdb; + MemoryContext oldcxt; + + /* + * Allocate our results in the caller's context, not the + * transaction's. We do this inside the loop, and restore the original + * context at the end, so that leaky things like heap_getnext() are + * not called in a potentially long-lived context. + */ + oldcxt = MemoryContextSwitchTo(resultcxt); + + avdb = (avw_dbase *) palloc(sizeof(avw_dbase)); + + avdb->adw_datid = pgdatabase->oid; + avdb->adw_name = pstrdup(NameStr(pgdatabase->datname)); + avdb->adw_frozenxid = pgdatabase->datfrozenxid; + avdb->adw_minmulti = pgdatabase->datminmxid; + /* this gets set later: */ + avdb->adw_entry = NULL; + + dblist = lappend(dblist, avdb); + MemoryContextSwitchTo(oldcxt); + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return dblist; +} + +/* + * Process a database table-by-table + * + * Note that CHECK_FOR_INTERRUPTS is supposed to be used in certain spots in + * order not to ignore shutdown commands for too long. 
+ */ +static void +do_autovacuum(void) +{ + Relation classRel; + HeapTuple tuple; + TableScanDesc relScan; + Form_pg_database dbForm; + List *table_oids = NIL; + List *orphan_oids = NIL; + HASHCTL ctl; + HTAB *table_toast_map; + ListCell *volatile cell; + PgStat_StatDBEntry *shared; + PgStat_StatDBEntry *dbentry; + BufferAccessStrategy bstrategy; + ScanKeyData key; + TupleDesc pg_class_desc; + int effective_multixact_freeze_max_age; + bool did_vacuum = false; + bool found_concurrent_worker = false; + int i; + + /* + * StartTransactionCommand and CommitTransactionCommand will automatically + * switch to other contexts. We need this one to keep the list of + * relations to vacuum/analyze across transactions. + */ + AutovacMemCxt = AllocSetContextCreate(TopMemoryContext, + "AV worker", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(AutovacMemCxt); + + /* + * may be NULL if we couldn't find an entry (only happens if we are + * forcing a vacuum for anti-wrap purposes). + */ + dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId); + + /* Start a transaction so our commands have one to play into. */ + StartTransactionCommand(); + + /* + * Clean up any dead statistics collector entries for this DB. We always + * want to do this exactly once per DB-processing cycle, even if we find + * nothing worth vacuuming in the database. + */ + pgstat_vacuum_stat(); + + /* + * Compute the multixact age for which freezing is urgent. This is + * normally autovacuum_multixact_freeze_max_age, but may be less if we are + * short of multixact member space. + */ + effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold(); + + /* + * Find the pg_database entry and select the default freeze ages. We use + * zero in template and nonconnectable databases, else the system-wide + * default. + */ + tuple = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for database %u", MyDatabaseId); + dbForm = (Form_pg_database) GETSTRUCT(tuple); + + if (dbForm->datistemplate || !dbForm->datallowconn) + { + default_freeze_min_age = 0; + default_freeze_table_age = 0; + default_multixact_freeze_min_age = 0; + default_multixact_freeze_table_age = 0; + } + else + { + default_freeze_min_age = vacuum_freeze_min_age; + default_freeze_table_age = vacuum_freeze_table_age; + default_multixact_freeze_min_age = vacuum_multixact_freeze_min_age; + default_multixact_freeze_table_age = vacuum_multixact_freeze_table_age; + } + + ReleaseSysCache(tuple); + + /* StartTransactionCommand changed elsewhere */ + MemoryContextSwitchTo(AutovacMemCxt); + + /* The database hash where pgstat keeps shared relations */ + shared = pgstat_fetch_stat_dbentry(InvalidOid); + + classRel = table_open(RelationRelationId, AccessShareLock); + + /* create a copy so we can use it after closing pg_class */ + pg_class_desc = CreateTupleDescCopy(RelationGetDescr(classRel)); + + /* create hash table for toast <-> main relid mapping */ + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(av_relation); + + table_toast_map = hash_create("TOAST to main relid map", + 100, + &ctl, + HASH_ELEM | HASH_BLOBS); + + /* + * Scan pg_class to determine which tables to vacuum. + * + * We do this in two passes: on the first one we collect the list of plain + * relations and materialized views, and on the second one we collect + * TOAST tables. 
The reason for doing the second pass is that during it we + * want to use the main relation's pg_class.reloptions entry if the TOAST + * table does not have any, and we cannot obtain it unless we know + * beforehand what's the main table OID. + * + * We need to check TOAST tables separately because in cases with short, + * wide tables there might be proportionally much more activity in the + * TOAST table than in its parent. + */ + relScan = table_beginscan_catalog(classRel, 0, NULL); + + /* + * On the first pass, we collect main tables to vacuum, and also the main + * table relid to TOAST relid mapping. + */ + while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL) + { + Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple); + PgStat_StatTabEntry *tabentry; + AutoVacOpts *relopts; + Oid relid; + bool dovacuum; + bool doanalyze; + bool wraparound; + + if (classForm->relkind != RELKIND_RELATION && + classForm->relkind != RELKIND_MATVIEW) + continue; + + relid = classForm->oid; + + /* + * Check if it is a temp table (presumably, of some other backend's). + * We cannot safely process other backends' temp tables. + */ + if (classForm->relpersistence == RELPERSISTENCE_TEMP) + { + /* + * We just ignore it if the owning backend is still active and + * using the temporary schema. Also, for safety, ignore it if the + * namespace doesn't exist or isn't a temp namespace after all. + */ + if (checkTempNamespaceStatus(classForm->relnamespace) == TEMP_NAMESPACE_IDLE) + { + /* + * The table seems to be orphaned -- although it might be that + * the owning backend has already deleted it and exited; our + * pg_class scan snapshot is not necessarily up-to-date + * anymore, so we could be looking at a committed-dead entry. + * Remember it so we can try to delete it later. + */ + orphan_oids = lappend_oid(orphan_oids, relid); + } + continue; + } + + /* Fetch reloptions and the pgstat entry for this table */ + relopts = extract_autovac_opts(tuple, pg_class_desc); + tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared, + shared, dbentry); + + /* Check if it needs vacuum or analyze */ + relation_needs_vacanalyze(relid, relopts, classForm, tabentry, + effective_multixact_freeze_max_age, + &dovacuum, &doanalyze, &wraparound); + + /* Relations that need work are added to table_oids */ + if (dovacuum || doanalyze) + table_oids = lappend_oid(table_oids, relid); + + /* + * Remember TOAST associations for the second pass. Note: we must do + * this whether or not the table is going to be vacuumed, because we + * don't automatically vacuum toast tables along the parent table. 
+ */ + if (OidIsValid(classForm->reltoastrelid)) + { + av_relation *hentry; + bool found; + + hentry = hash_search(table_toast_map, + &classForm->reltoastrelid, + HASH_ENTER, &found); + + if (!found) + { + /* hash_search already filled in the key */ + hentry->ar_relid = relid; + hentry->ar_hasrelopts = false; + if (relopts != NULL) + { + hentry->ar_hasrelopts = true; + memcpy(&hentry->ar_reloptions, relopts, + sizeof(AutoVacOpts)); + } + } + } + } + + table_endscan(relScan); + + /* second pass: check TOAST tables */ + ScanKeyInit(&key, + Anum_pg_class_relkind, + BTEqualStrategyNumber, F_CHAREQ, + CharGetDatum(RELKIND_TOASTVALUE)); + + relScan = table_beginscan_catalog(classRel, 1, &key); + while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL) + { + Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple); + PgStat_StatTabEntry *tabentry; + Oid relid; + AutoVacOpts *relopts = NULL; + bool dovacuum; + bool doanalyze; + bool wraparound; + + /* + * We cannot safely process other backends' temp tables, so skip 'em. + */ + if (classForm->relpersistence == RELPERSISTENCE_TEMP) + continue; + + relid = classForm->oid; + + /* + * fetch reloptions -- if this toast table does not have them, try the + * main rel + */ + relopts = extract_autovac_opts(tuple, pg_class_desc); + if (relopts == NULL) + { + av_relation *hentry; + bool found; + + hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found); + if (found && hentry->ar_hasrelopts) + relopts = &hentry->ar_reloptions; + } + + /* Fetch the pgstat entry for this table */ + tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared, + shared, dbentry); + + relation_needs_vacanalyze(relid, relopts, classForm, tabentry, + effective_multixact_freeze_max_age, + &dovacuum, &doanalyze, &wraparound); + + /* ignore analyze for toast tables */ + if (dovacuum) + table_oids = lappend_oid(table_oids, relid); + } + + table_endscan(relScan); + table_close(classRel, AccessShareLock); + + /* + * Recheck orphan temporary tables, and if they still seem orphaned, drop + * them. We'll eat a transaction per dropped table, which might seem + * excessive, but we should only need to do anything as a result of a + * previous backend crash, so this should not happen often enough to + * justify "optimizing". Using separate transactions ensures that we + * don't bloat the lock table if there are many temp tables to be dropped, + * and it ensures that we don't lose work if a deletion attempt fails. + */ + foreach(cell, orphan_oids) + { + Oid relid = lfirst_oid(cell); + Form_pg_class classForm; + ObjectAddress object; + + /* + * Check for user-requested abort. + */ + CHECK_FOR_INTERRUPTS(); + + /* + * Try to lock the table. If we can't get the lock immediately, + * somebody else is using (or dropping) the table, so it's not our + * concern anymore. Having the lock prevents race conditions below. + */ + if (!ConditionalLockRelationOid(relid, AccessExclusiveLock)) + continue; + + /* + * Re-fetch the pg_class tuple and re-check whether it still seems to + * be an orphaned temp table. If it's not there or no longer the same + * relation, ignore it. + */ + tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tuple)) + { + /* be sure to drop useless lock so we don't bloat lock table */ + UnlockRelationOid(relid, AccessExclusiveLock); + continue; + } + classForm = (Form_pg_class) GETSTRUCT(tuple); + + /* + * Make all the same tests made in the loop above. 
In event of OID + * counter wraparound, the pg_class entry we have now might be + * completely unrelated to the one we saw before. + */ + if (!((classForm->relkind == RELKIND_RELATION || + classForm->relkind == RELKIND_MATVIEW) && + classForm->relpersistence == RELPERSISTENCE_TEMP)) + { + UnlockRelationOid(relid, AccessExclusiveLock); + continue; + } + + if (checkTempNamespaceStatus(classForm->relnamespace) != TEMP_NAMESPACE_IDLE) + { + UnlockRelationOid(relid, AccessExclusiveLock); + continue; + } + + /* OK, let's delete it */ + ereport(LOG, + (errmsg("autovacuum: dropping orphan temp table \"%s.%s.%s\"", + get_database_name(MyDatabaseId), + get_namespace_name(classForm->relnamespace), + NameStr(classForm->relname)))); + + object.classId = RelationRelationId; + object.objectId = relid; + object.objectSubId = 0; + performDeletion(&object, DROP_CASCADE, + PERFORM_DELETION_INTERNAL | + PERFORM_DELETION_QUIETLY | + PERFORM_DELETION_SKIP_EXTENSIONS); + + /* + * To commit the deletion, end current transaction and start a new + * one. Note this also releases the lock we took. + */ + CommitTransactionCommand(); + StartTransactionCommand(); + + /* StartTransactionCommand changed current memory context */ + MemoryContextSwitchTo(AutovacMemCxt); + } + + /* + * Create a buffer access strategy object for VACUUM to use. We want to + * use the same one across all the vacuum operations we perform, since the + * point is for VACUUM not to blow out the shared cache. + */ + bstrategy = GetAccessStrategy(BAS_VACUUM); + + /* + * create a memory context to act as fake PortalContext, so that the + * contexts created in the vacuum code are cleaned up for each table. + */ + PortalContext = AllocSetContextCreate(AutovacMemCxt, + "Autovacuum Portal", + ALLOCSET_DEFAULT_SIZES); + + /* + * Perform operations on collected tables. + */ + foreach(cell, table_oids) + { + Oid relid = lfirst_oid(cell); + HeapTuple classTup; + autovac_table *tab; + bool isshared; + bool skipit; + double stdVacuumCostDelay; + int stdVacuumCostLimit; + dlist_iter iter; + + CHECK_FOR_INTERRUPTS(); + + /* + * Check for config changes before processing each collected table. + */ + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + + /* + * You might be tempted to bail out if we see autovacuum is now + * disabled. Must resist that temptation -- this might be a + * for-wraparound emergency worker, in which case that would be + * entirely inappropriate. + */ + } + + /* + * Find out whether the table is shared or not. (It's slightly + * annoying to fetch the syscache entry just for this, but in typical + * cases it adds little cost because table_recheck_autovac would + * refetch the entry anyway. We could buy that back by copying the + * tuple here and passing it to table_recheck_autovac, but that + * increases the odds of that function working with stale data.) + */ + classTup = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(classTup)) + continue; /* somebody deleted the rel, forget it */ + isshared = ((Form_pg_class) GETSTRUCT(classTup))->relisshared; + ReleaseSysCache(classTup); + + /* + * Hold schedule lock from here until we've claimed the table. We + * also need the AutovacuumLock to walk the worker array, but that one + * can just be a shared lock. + */ + LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE); + LWLockAcquire(AutovacuumLock, LW_SHARED); + + /* + * Check whether the table is being vacuumed concurrently by another + * worker. 
+ */ + skipit = false; + dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers) + { + WorkerInfo worker = dlist_container(WorkerInfoData, wi_links, iter.cur); + + /* ignore myself */ + if (worker == MyWorkerInfo) + continue; + + /* ignore workers in other databases (unless table is shared) */ + if (!worker->wi_sharedrel && worker->wi_dboid != MyDatabaseId) + continue; + + if (worker->wi_tableoid == relid) + { + skipit = true; + found_concurrent_worker = true; + break; + } + } + LWLockRelease(AutovacuumLock); + if (skipit) + { + LWLockRelease(AutovacuumScheduleLock); + continue; + } + + /* + * Store the table's OID in shared memory before releasing the + * schedule lock, so that other workers don't try to vacuum it + * concurrently. (We claim it here so as not to hold + * AutovacuumScheduleLock while rechecking the stats.) + */ + MyWorkerInfo->wi_tableoid = relid; + MyWorkerInfo->wi_sharedrel = isshared; + LWLockRelease(AutovacuumScheduleLock); + + /* + * Check whether pgstat data still says we need to vacuum this table. + * It could have changed if something else processed the table while + * we weren't looking. + * + * Note: we have a special case in pgstat code to ensure that the + * stats we read are as up-to-date as possible, to avoid the problem + * that somebody just finished vacuuming this table. The window to + * the race condition is not closed but it is very small. + */ + MemoryContextSwitchTo(AutovacMemCxt); + tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc, + effective_multixact_freeze_max_age); + if (tab == NULL) + { + /* someone else vacuumed the table, or it went away */ + LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE); + MyWorkerInfo->wi_tableoid = InvalidOid; + MyWorkerInfo->wi_sharedrel = false; + LWLockRelease(AutovacuumScheduleLock); + continue; + } + + /* + * Remember the prevailing values of the vacuum cost GUCs. We have to + * restore these at the bottom of the loop, else we'll compute wrong + * values in the next iteration of autovac_balance_cost(). + */ + stdVacuumCostDelay = VacuumCostDelay; + stdVacuumCostLimit = VacuumCostLimit; + + /* Must hold AutovacuumLock while mucking with cost balance info */ + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + + /* advertise my cost delay parameters for the balancing algorithm */ + MyWorkerInfo->wi_dobalance = tab->at_dobalance; + MyWorkerInfo->wi_cost_delay = tab->at_vacuum_cost_delay; + MyWorkerInfo->wi_cost_limit = tab->at_vacuum_cost_limit; + MyWorkerInfo->wi_cost_limit_base = tab->at_vacuum_cost_limit; + + /* do a balance */ + autovac_balance_cost(); + + /* set the active cost parameters from the result of that */ + AutoVacuumUpdateDelay(); + + /* done */ + LWLockRelease(AutovacuumLock); + + /* clean up memory before each iteration */ + MemoryContextResetAndDeleteChildren(PortalContext); + + /* + * Save the relation name for a possible error message, to avoid a + * catalog lookup in case of an error. If any of these return NULL, + * then the relation has been dropped since last we checked; skip it. + * Note: they must live in a long-lived memory context because we call + * vacuum and analyze in different transactions. 
+ */ + + tab->at_relname = get_rel_name(tab->at_relid); + tab->at_nspname = get_namespace_name(get_rel_namespace(tab->at_relid)); + tab->at_datname = get_database_name(MyDatabaseId); + if (!tab->at_relname || !tab->at_nspname || !tab->at_datname) + goto deleted; + + /* + * We will abort vacuuming the current table if something errors out, + * and continue with the next one in schedule; in particular, this + * happens if we are interrupted with SIGINT. + */ + PG_TRY(); + { + /* Use PortalContext for any per-table allocations */ + MemoryContextSwitchTo(PortalContext); + + /* have at it */ + autovacuum_do_vac_analyze(tab, bstrategy); + + /* + * Clear a possible query-cancel signal, to avoid a late reaction + * to an automatically-sent signal because of vacuuming the + * current table (we're done with it, so it would make no sense to + * cancel at this point.) + */ + QueryCancelPending = false; + } + PG_CATCH(); + { + /* + * Abort the transaction, start a new one, and proceed with the + * next table in our list. + */ + HOLD_INTERRUPTS(); + if (tab->at_params.options & VACOPT_VACUUM) + errcontext("automatic vacuum of table \"%s.%s.%s\"", + tab->at_datname, tab->at_nspname, tab->at_relname); + else + errcontext("automatic analyze of table \"%s.%s.%s\"", + tab->at_datname, tab->at_nspname, tab->at_relname); + EmitErrorReport(); + + /* this resets ProcGlobal->statusFlags[i] too */ + AbortOutOfAnyTransaction(); + FlushErrorState(); + MemoryContextResetAndDeleteChildren(PortalContext); + + /* restart our transaction for the following operations */ + StartTransactionCommand(); + RESUME_INTERRUPTS(); + } + PG_END_TRY(); + + /* Make sure we're back in AutovacMemCxt */ + MemoryContextSwitchTo(AutovacMemCxt); + + did_vacuum = true; + + /* ProcGlobal->statusFlags[i] are reset at the next end of xact */ + + /* be tidy */ +deleted: + if (tab->at_datname != NULL) + pfree(tab->at_datname); + if (tab->at_nspname != NULL) + pfree(tab->at_nspname); + if (tab->at_relname != NULL) + pfree(tab->at_relname); + pfree(tab); + + /* + * Remove my info from shared memory. We could, but intentionally + * don't, clear wi_cost_limit and friends --- this is on the + * assumption that we probably have more to do with similar cost + * settings, so we don't want to give up our share of I/O for a very + * short interval and thereby thrash the global balance. + */ + LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE); + MyWorkerInfo->wi_tableoid = InvalidOid; + MyWorkerInfo->wi_sharedrel = false; + LWLockRelease(AutovacuumScheduleLock); + + /* restore vacuum cost GUCs for the next iteration */ + VacuumCostDelay = stdVacuumCostDelay; + VacuumCostLimit = stdVacuumCostLimit; + } + + /* + * Perform additional work items, as requested by backends. + */ + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + for (i = 0; i < NUM_WORKITEMS; i++) + { + AutoVacuumWorkItem *workitem = &AutoVacuumShmem->av_workItems[i]; + + if (!workitem->avw_used) + continue; + if (workitem->avw_active) + continue; + if (workitem->avw_database != MyDatabaseId) + continue; + + /* claim this one, and release lock while performing it */ + workitem->avw_active = true; + LWLockRelease(AutovacuumLock); + + perform_work_item(workitem); + + /* + * Check for config changes before acquiring lock for further jobs. 
+ */ + CHECK_FOR_INTERRUPTS(); + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + + /* and mark it done */ + workitem->avw_active = false; + workitem->avw_used = false; + } + LWLockRelease(AutovacuumLock); + + /* + * We leak table_toast_map here (among other things), but since we're + * going away soon, it's not a problem. + */ + + /* + * Update pg_database.datfrozenxid, and truncate pg_xact if possible. We + * only need to do this once, not after each table. + * + * Even if we didn't vacuum anything, it may still be important to do + * this, because one indirect effect of vac_update_datfrozenxid() is to + * update ShmemVariableCache->xidVacLimit. That might need to be done + * even if we haven't vacuumed anything, because relations with older + * relfrozenxid values or other databases with older datfrozenxid values + * might have been dropped, allowing xidVacLimit to advance. + * + * However, it's also important not to do this blindly in all cases, + * because when autovacuum=off this will restart the autovacuum launcher. + * If we're not careful, an infinite loop can result, where workers find + * no work to do and restart the launcher, which starts another worker in + * the same database that finds no work to do. To prevent that, we skip + * this if (1) we found no work to do and (2) we skipped at least one + * table due to concurrent autovacuum activity. In that case, the other + * worker has already done it, or will do so when it finishes. + */ + if (did_vacuum || !found_concurrent_worker) + vac_update_datfrozenxid(); + + /* Finally close out the last transaction. */ + CommitTransactionCommand(); +} + +/* + * Execute a previously registered work item. + */ +static void +perform_work_item(AutoVacuumWorkItem *workitem) +{ + char *cur_datname = NULL; + char *cur_nspname = NULL; + char *cur_relname = NULL; + + /* + * Note we do not store table info in MyWorkerInfo, since this is not + * vacuuming proper. + */ + + /* + * Save the relation name for a possible error message, to avoid a catalog + * lookup in case of an error. If any of these return NULL, then the + * relation has been dropped since last we checked; skip it. + */ + Assert(CurrentMemoryContext == AutovacMemCxt); + + cur_relname = get_rel_name(workitem->avw_relation); + cur_nspname = get_namespace_name(get_rel_namespace(workitem->avw_relation)); + cur_datname = get_database_name(MyDatabaseId); + if (!cur_relname || !cur_nspname || !cur_datname) + goto deleted2; + + autovac_report_workitem(workitem, cur_nspname, cur_relname); + + /* clean up memory before each work item */ + MemoryContextResetAndDeleteChildren(PortalContext); + + /* + * We will abort the current work item if something errors out, and + * continue with the next one; in particular, this happens if we are + * interrupted with SIGINT. Note that this means that the work item list + * can be lossy. 
+ */ + PG_TRY(); + { + /* Use PortalContext for any per-work-item allocations */ + MemoryContextSwitchTo(PortalContext); + + /* have at it */ + switch (workitem->avw_type) + { + case AVW_BRINSummarizeRange: + DirectFunctionCall2(brin_summarize_range, + ObjectIdGetDatum(workitem->avw_relation), + Int64GetDatum((int64) workitem->avw_blockNumber)); + break; + default: + elog(WARNING, "unrecognized work item found: type %d", + workitem->avw_type); + break; + } + + /* + * Clear a possible query-cancel signal, to avoid a late reaction to + * an automatically-sent signal because of vacuuming the current table + * (we're done with it, so it would make no sense to cancel at this + * point.) + */ + QueryCancelPending = false; + } + PG_CATCH(); + { + /* + * Abort the transaction, start a new one, and proceed with the next + * table in our list. + */ + HOLD_INTERRUPTS(); + errcontext("processing work entry for relation \"%s.%s.%s\"", + cur_datname, cur_nspname, cur_relname); + EmitErrorReport(); + + /* this resets ProcGlobal->statusFlags[i] too */ + AbortOutOfAnyTransaction(); + FlushErrorState(); + MemoryContextResetAndDeleteChildren(PortalContext); + + /* restart our transaction for the following operations */ + StartTransactionCommand(); + RESUME_INTERRUPTS(); + } + PG_END_TRY(); + + /* Make sure we're back in AutovacMemCxt */ + MemoryContextSwitchTo(AutovacMemCxt); + + /* We intentionally do not set did_vacuum here */ + + /* be tidy */ +deleted2: + if (cur_datname) + pfree(cur_datname); + if (cur_nspname) + pfree(cur_nspname); + if (cur_relname) + pfree(cur_relname); +} + +/* + * extract_autovac_opts + * + * Given a relation's pg_class tuple, return the AutoVacOpts portion of + * reloptions, if set; otherwise, return NULL. + * + * Note: callers do not have a relation lock on the table at this point, + * so the table could have been dropped, and its catalog rows gone, after + * we acquired the pg_class row. If pg_class had a TOAST table, this would + * be a risk; fortunately, it doesn't. + */ +static AutoVacOpts * +extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc) +{ + bytea *relopts; + AutoVacOpts *av; + + Assert(((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_RELATION || + ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_MATVIEW || + ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_TOASTVALUE); + + relopts = extractRelOptions(tup, pg_class_desc, NULL); + if (relopts == NULL) + return NULL; + + av = palloc(sizeof(AutoVacOpts)); + memcpy(av, &(((StdRdOptions *) relopts)->autovacuum), sizeof(AutoVacOpts)); + pfree(relopts); + + return av; +} + +/* + * get_pgstat_tabentry_relid + * + * Fetch the pgstat entry of a table, either local to a database or shared. + */ +static PgStat_StatTabEntry * +get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared, + PgStat_StatDBEntry *dbentry) +{ + PgStat_StatTabEntry *tabentry = NULL; + + if (isshared) + { + if (PointerIsValid(shared)) + tabentry = hash_search(shared->tables, &relid, + HASH_FIND, NULL); + } + else if (PointerIsValid(dbentry)) + tabentry = hash_search(dbentry->tables, &relid, + HASH_FIND, NULL); + + return tabentry; +} + +/* + * table_recheck_autovac + * + * Recheck whether a table still needs vacuum or analyze. Return value is a + * valid autovac_table pointer if it does, NULL otherwise. + * + * Note that the returned autovac_table does not have the name fields set. 
+ */ +static autovac_table * +table_recheck_autovac(Oid relid, HTAB *table_toast_map, + TupleDesc pg_class_desc, + int effective_multixact_freeze_max_age) +{ + Form_pg_class classForm; + HeapTuple classTup; + bool dovacuum; + bool doanalyze; + autovac_table *tab = NULL; + bool wraparound; + AutoVacOpts *avopts; + static bool reuse_stats = false; + + /* fetch the relation's relcache entry */ + classTup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(classTup)) + return NULL; + classForm = (Form_pg_class) GETSTRUCT(classTup); + + /* + * Get the applicable reloptions. If it is a TOAST table, try to get the + * main table reloptions if the toast table itself doesn't have. + */ + avopts = extract_autovac_opts(classTup, pg_class_desc); + if (classForm->relkind == RELKIND_TOASTVALUE && + avopts == NULL && table_toast_map != NULL) + { + av_relation *hentry; + bool found; + + hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found); + if (found && hentry->ar_hasrelopts) + avopts = &hentry->ar_reloptions; + } + + /* + * Reuse the stats to recheck whether a relation needs to be vacuumed or + * analyzed if it was reloaded before and has not been cleared yet. This + * is necessary to avoid frequent refresh of stats, especially when there + * are very large number of relations and the refresh can cause lots of + * overhead. + * + * If we determined that a relation needs to be vacuumed or analyzed, + * based on the old stats, we refresh stats and recheck the necessity + * again. Because a relation may have already been vacuumed or analyzed by + * someone since the last reload of stats. + */ + if (reuse_stats) + { + recheck_relation_needs_vacanalyze(relid, avopts, classForm, + effective_multixact_freeze_max_age, + &dovacuum, &doanalyze, &wraparound); + + /* Quick exit if a relation doesn't need to be vacuumed or analyzed */ + if (!doanalyze && !dovacuum) + { + heap_freetuple(classTup); + return NULL; + } + } + + /* Use fresh stats and recheck again */ + autovac_refresh_stats(); + + recheck_relation_needs_vacanalyze(relid, avopts, classForm, + effective_multixact_freeze_max_age, + &dovacuum, &doanalyze, &wraparound); + + /* OK, it needs something done */ + if (doanalyze || dovacuum) + { + int freeze_min_age; + int freeze_table_age; + int multixact_freeze_min_age; + int multixact_freeze_table_age; + int vac_cost_limit; + double vac_cost_delay; + int log_min_duration; + + /* + * Calculate the vacuum cost parameters and the freeze ages. If there + * are options set in pg_class.reloptions, use them; in the case of a + * toast table, try the main table too. Otherwise use the GUC + * defaults, autovacuum's own first and plain vacuum second. + */ + + /* -1 in autovac setting means use plain vacuum_cost_delay */ + vac_cost_delay = (avopts && avopts->vacuum_cost_delay >= 0) + ? avopts->vacuum_cost_delay + : (autovacuum_vac_cost_delay >= 0) + ? autovacuum_vac_cost_delay + : VacuumCostDelay; + + /* 0 or -1 in autovac setting means use plain vacuum_cost_limit */ + vac_cost_limit = (avopts && avopts->vacuum_cost_limit > 0) + ? avopts->vacuum_cost_limit + : (autovacuum_vac_cost_limit > 0) + ? autovacuum_vac_cost_limit + : VacuumCostLimit; + + /* -1 in autovac setting means use log_autovacuum_min_duration */ + log_min_duration = (avopts && avopts->log_min_duration >= 0) + ? avopts->log_min_duration + : Log_autovacuum_min_duration; + + /* these do not have autovacuum-specific settings */ + freeze_min_age = (avopts && avopts->freeze_min_age >= 0) + ? 
avopts->freeze_min_age + : default_freeze_min_age; + + freeze_table_age = (avopts && avopts->freeze_table_age >= 0) + ? avopts->freeze_table_age + : default_freeze_table_age; + + multixact_freeze_min_age = (avopts && + avopts->multixact_freeze_min_age >= 0) + ? avopts->multixact_freeze_min_age + : default_multixact_freeze_min_age; + + multixact_freeze_table_age = (avopts && + avopts->multixact_freeze_table_age >= 0) + ? avopts->multixact_freeze_table_age + : default_multixact_freeze_table_age; + + tab = palloc(sizeof(autovac_table)); + tab->at_relid = relid; + tab->at_sharedrel = classForm->relisshared; + + /* Note that this skips toast relations */ + tab->at_params.options = (dovacuum ? VACOPT_VACUUM : 0) | + (doanalyze ? VACOPT_ANALYZE : 0) | + (!wraparound ? VACOPT_SKIP_LOCKED : 0); + + /* + * index_cleanup and truncate are unspecified at first in autovacuum. + * They will be filled in with usable values using their reloptions + * (or reloption defaults) later. + */ + tab->at_params.index_cleanup = VACOPTVALUE_UNSPECIFIED; + tab->at_params.truncate = VACOPTVALUE_UNSPECIFIED; + /* As of now, we don't support parallel vacuum for autovacuum */ + tab->at_params.nworkers = -1; + tab->at_params.freeze_min_age = freeze_min_age; + tab->at_params.freeze_table_age = freeze_table_age; + tab->at_params.multixact_freeze_min_age = multixact_freeze_min_age; + tab->at_params.multixact_freeze_table_age = multixact_freeze_table_age; + tab->at_params.is_wraparound = wraparound; + tab->at_params.log_min_duration = log_min_duration; + tab->at_vacuum_cost_limit = vac_cost_limit; + tab->at_vacuum_cost_delay = vac_cost_delay; + tab->at_relname = NULL; + tab->at_nspname = NULL; + tab->at_datname = NULL; + + /* + * If any of the cost delay parameters has been set individually for + * this table, disable the balancing algorithm. + */ + tab->at_dobalance = + !(avopts && (avopts->vacuum_cost_limit > 0 || + avopts->vacuum_cost_delay > 0)); + + /* + * When we decide to do vacuum or analyze, the existing stats cannot + * be reused in the next cycle because it's cleared at the end of + * vacuum or analyze (by AtEOXact_PgStat()). + */ + reuse_stats = false; + } + else + { + /* + * If neither vacuum nor analyze is necessary, the existing stats is + * not cleared and can be reused in the next cycle. + */ + reuse_stats = true; + } + + heap_freetuple(classTup); + return tab; +} + +/* + * recheck_relation_needs_vacanalyze + * + * Subroutine for table_recheck_autovac. + * + * Fetch the pgstat of a relation and recheck whether a relation + * needs to be vacuumed or analyzed. 
+ */ +static void +recheck_relation_needs_vacanalyze(Oid relid, + AutoVacOpts *avopts, + Form_pg_class classForm, + int effective_multixact_freeze_max_age, + bool *dovacuum, + bool *doanalyze, + bool *wraparound) +{ + PgStat_StatTabEntry *tabentry; + PgStat_StatDBEntry *shared = NULL; + PgStat_StatDBEntry *dbentry = NULL; + + if (classForm->relisshared) + shared = pgstat_fetch_stat_dbentry(InvalidOid); + else + dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId); + + /* fetch the pgstat table entry */ + tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared, + shared, dbentry); + + relation_needs_vacanalyze(relid, avopts, classForm, tabentry, + effective_multixact_freeze_max_age, + dovacuum, doanalyze, wraparound); + + /* ignore ANALYZE for toast tables */ + if (classForm->relkind == RELKIND_TOASTVALUE) + *doanalyze = false; +} + +/* + * relation_needs_vacanalyze + * + * Check whether a relation needs to be vacuumed or analyzed; return each into + * "dovacuum" and "doanalyze", respectively. Also return whether the vacuum is + * being forced because of Xid or multixact wraparound. + * + * relopts is a pointer to the AutoVacOpts options (either for itself in the + * case of a plain table, or for either itself or its parent table in the case + * of a TOAST table), NULL if none; tabentry is the pgstats entry, which can be + * NULL. + * + * A table needs to be vacuumed if the number of dead tuples exceeds a + * threshold. This threshold is calculated as + * + * threshold = vac_base_thresh + vac_scale_factor * reltuples + * + * For analyze, the analysis done is that the number of tuples inserted, + * deleted and updated since the last analyze exceeds a threshold calculated + * in the same fashion as above. Note that the collector actually stores + * the number of tuples (both live and dead) that there were as of the last + * analyze. This is asymmetric to the VACUUM case. + * + * We also force vacuum if the table's relfrozenxid is more than freeze_max_age + * transactions back, and if its relminmxid is more than + * multixact_freeze_max_age multixacts back. + * + * A table whose autovacuum_enabled option is false is + * automatically skipped (unless we have to vacuum it due to freeze_max_age). + * Thus autovacuum can be disabled for specific tables. Also, when the stats + * collector does not have data about a table, it will be skipped. + * + * A table whose vac_base_thresh value is < 0 takes the base value from the + * autovacuum_vacuum_threshold GUC variable. Similarly, a vac_scale_factor + * value < 0 is substituted with the value of + * autovacuum_vacuum_scale_factor GUC variable. Ditto for analyze. + */ +static void +relation_needs_vacanalyze(Oid relid, + AutoVacOpts *relopts, + Form_pg_class classForm, + PgStat_StatTabEntry *tabentry, + int effective_multixact_freeze_max_age, + /* output params below */ + bool *dovacuum, + bool *doanalyze, + bool *wraparound) +{ + bool force_vacuum; + bool av_enabled; + float4 reltuples; /* pg_class.reltuples */ + + /* constants from reloptions or GUC variables */ + int vac_base_thresh, + vac_ins_base_thresh, + anl_base_thresh; + float4 vac_scale_factor, + vac_ins_scale_factor, + anl_scale_factor; + + /* thresholds calculated from above constants */ + float4 vacthresh, + vacinsthresh, + anlthresh; + + /* number of vacuum (resp. 
analyze) tuples at this time */ + float4 vactuples, + instuples, + anltuples; + + /* freeze parameters */ + int freeze_max_age; + int multixact_freeze_max_age; + TransactionId xidForceLimit; + MultiXactId multiForceLimit; + + AssertArg(classForm != NULL); + AssertArg(OidIsValid(relid)); + + /* + * Determine vacuum/analyze equation parameters. We have two possible + * sources: the passed reloptions (which could be a main table or a toast + * table), or the autovacuum GUC variables. + */ + + /* -1 in autovac setting means use plain vacuum_scale_factor */ + vac_scale_factor = (relopts && relopts->vacuum_scale_factor >= 0) + ? relopts->vacuum_scale_factor + : autovacuum_vac_scale; + + vac_base_thresh = (relopts && relopts->vacuum_threshold >= 0) + ? relopts->vacuum_threshold + : autovacuum_vac_thresh; + + vac_ins_scale_factor = (relopts && relopts->vacuum_ins_scale_factor >= 0) + ? relopts->vacuum_ins_scale_factor + : autovacuum_vac_ins_scale; + + /* -1 is used to disable insert vacuums */ + vac_ins_base_thresh = (relopts && relopts->vacuum_ins_threshold >= -1) + ? relopts->vacuum_ins_threshold + : autovacuum_vac_ins_thresh; + + anl_scale_factor = (relopts && relopts->analyze_scale_factor >= 0) + ? relopts->analyze_scale_factor + : autovacuum_anl_scale; + + anl_base_thresh = (relopts && relopts->analyze_threshold >= 0) + ? relopts->analyze_threshold + : autovacuum_anl_thresh; + + freeze_max_age = (relopts && relopts->freeze_max_age >= 0) + ? Min(relopts->freeze_max_age, autovacuum_freeze_max_age) + : autovacuum_freeze_max_age; + + multixact_freeze_max_age = (relopts && relopts->multixact_freeze_max_age >= 0) + ? Min(relopts->multixact_freeze_max_age, effective_multixact_freeze_max_age) + : effective_multixact_freeze_max_age; + + av_enabled = (relopts ? relopts->enabled : true); + + /* Force vacuum if table is at risk of wraparound */ + xidForceLimit = recentXid - freeze_max_age; + if (xidForceLimit < FirstNormalTransactionId) + xidForceLimit -= FirstNormalTransactionId; + force_vacuum = (TransactionIdIsNormal(classForm->relfrozenxid) && + TransactionIdPrecedes(classForm->relfrozenxid, + xidForceLimit)); + if (!force_vacuum) + { + multiForceLimit = recentMulti - multixact_freeze_max_age; + if (multiForceLimit < FirstMultiXactId) + multiForceLimit -= FirstMultiXactId; + force_vacuum = MultiXactIdIsValid(classForm->relminmxid) && + MultiXactIdPrecedes(classForm->relminmxid, multiForceLimit); + } + *wraparound = force_vacuum; + + /* User disabled it in pg_class.reloptions? (But ignore if at risk) */ + if (!av_enabled && !force_vacuum) + { + *doanalyze = false; + *dovacuum = false; + return; + } + + /* + * If we found the table in the stats hash, and autovacuum is currently + * enabled, make a threshold-based decision whether to vacuum and/or + * analyze. If autovacuum is currently disabled, we must be here for + * anti-wraparound vacuuming only, so don't vacuum (or analyze) anything + * that's not being forced. 
+ */ + if (PointerIsValid(tabentry) && AutoVacuumingActive()) + { + reltuples = classForm->reltuples; + vactuples = tabentry->n_dead_tuples; + instuples = tabentry->inserts_since_vacuum; + anltuples = tabentry->changes_since_analyze; + + /* If the table hasn't yet been vacuumed, take reltuples as zero */ + if (reltuples < 0) + reltuples = 0; + + vacthresh = (float4) vac_base_thresh + vac_scale_factor * reltuples; + vacinsthresh = (float4) vac_ins_base_thresh + vac_ins_scale_factor * reltuples; + anlthresh = (float4) anl_base_thresh + anl_scale_factor * reltuples; + + /* + * Note that we don't need to take special consideration for stat + * reset, because if that happens, the last vacuum and analyze counts + * will be reset too. + */ + if (vac_ins_base_thresh >= 0) + elog(DEBUG3, "%s: vac: %.0f (threshold %.0f), ins: %.0f (threshold %.0f), anl: %.0f (threshold %.0f)", + NameStr(classForm->relname), + vactuples, vacthresh, instuples, vacinsthresh, anltuples, anlthresh); + else + elog(DEBUG3, "%s: vac: %.0f (threshold %.0f), ins: (disabled), anl: %.0f (threshold %.0f)", + NameStr(classForm->relname), + vactuples, vacthresh, anltuples, anlthresh); + + /* Determine if this table needs vacuum or analyze. */ + *dovacuum = force_vacuum || (vactuples > vacthresh) || + (vac_ins_base_thresh >= 0 && instuples > vacinsthresh); + *doanalyze = (anltuples > anlthresh); + } + else + { + /* + * Skip a table not found in stat hash, unless we have to force vacuum + * for anti-wrap purposes. If it's not acted upon, there's no need to + * vacuum it. + */ + *dovacuum = force_vacuum; + *doanalyze = false; + } + + /* ANALYZE refuses to work with pg_statistic */ + if (relid == StatisticRelationId) + *doanalyze = false; +} + +/* + * autovacuum_do_vac_analyze + * Vacuum and/or analyze the specified table + */ +static void +autovacuum_do_vac_analyze(autovac_table *tab, BufferAccessStrategy bstrategy) +{ + RangeVar *rangevar; + VacuumRelation *rel; + List *rel_list; + + /* Let pgstat know what we're doing */ + autovac_report_activity(tab); + + /* Set up one VacuumRelation target, identified by OID, for vacuum() */ + rangevar = makeRangeVar(tab->at_nspname, tab->at_relname, -1); + rel = makeVacuumRelation(rangevar, tab->at_relid, NIL); + rel_list = list_make1(rel); + + vacuum(rel_list, &tab->at_params, bstrategy, true); +} + +/* + * autovac_report_activity + * Report to pgstat what autovacuum is doing + * + * We send a SQL string corresponding to what the user would see if the + * equivalent command was to be issued manually. + * + * Note we assume that we are going to report the next command as soon as we're + * done with the current one, and exit right after the last one, so we don't + * bother to report "<IDLE>" or some such. + */ +static void +autovac_report_activity(autovac_table *tab) +{ +#define MAX_AUTOVAC_ACTIV_LEN (NAMEDATALEN * 2 + 56) + char activity[MAX_AUTOVAC_ACTIV_LEN]; + int len; + + /* Report the command and possible options */ + if (tab->at_params.options & VACOPT_VACUUM) + snprintf(activity, MAX_AUTOVAC_ACTIV_LEN, + "autovacuum: VACUUM%s", + tab->at_params.options & VACOPT_ANALYZE ? " ANALYZE" : ""); + else + snprintf(activity, MAX_AUTOVAC_ACTIV_LEN, + "autovacuum: ANALYZE"); + + /* + * Report the qualified name of the relation. + */ + len = strlen(activity); + + snprintf(activity + len, MAX_AUTOVAC_ACTIV_LEN - len, + " %s.%s%s", tab->at_nspname, tab->at_relname, + tab->at_params.is_wraparound ? 
" (to prevent wraparound)" : ""); + + /* Set statement_timestamp() to current time for pg_stat_activity */ + SetCurrentStatementStartTimestamp(); + + pgstat_report_activity(STATE_RUNNING, activity); +} + +/* + * autovac_report_workitem + * Report to pgstat that autovacuum is processing a work item + */ +static void +autovac_report_workitem(AutoVacuumWorkItem *workitem, + const char *nspname, const char *relname) +{ + char activity[MAX_AUTOVAC_ACTIV_LEN + 12 + 2]; + char blk[12 + 2]; + int len; + + switch (workitem->avw_type) + { + case AVW_BRINSummarizeRange: + snprintf(activity, MAX_AUTOVAC_ACTIV_LEN, + "autovacuum: BRIN summarize"); + break; + } + + /* + * Report the qualified name of the relation, and the block number if any + */ + len = strlen(activity); + + if (BlockNumberIsValid(workitem->avw_blockNumber)) + snprintf(blk, sizeof(blk), " %u", workitem->avw_blockNumber); + else + blk[0] = '\0'; + + snprintf(activity + len, MAX_AUTOVAC_ACTIV_LEN - len, + " %s.%s%s", nspname, relname, blk); + + /* Set statement_timestamp() to current time for pg_stat_activity */ + SetCurrentStatementStartTimestamp(); + + pgstat_report_activity(STATE_RUNNING, activity); +} + +/* + * AutoVacuumingActive + * Check GUC vars and report whether the autovacuum process should be + * running. + */ +bool +AutoVacuumingActive(void) +{ + if (!autovacuum_start_daemon || !pgstat_track_counts) + return false; + return true; +} + +/* + * Request one work item to the next autovacuum run processing our database. + * Return false if the request can't be recorded. + */ +bool +AutoVacuumRequestWork(AutoVacuumWorkItemType type, Oid relationId, + BlockNumber blkno) +{ + int i; + bool result = false; + + LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE); + + /* + * Locate an unused work item and fill it with the given data. + */ + for (i = 0; i < NUM_WORKITEMS; i++) + { + AutoVacuumWorkItem *workitem = &AutoVacuumShmem->av_workItems[i]; + + if (workitem->avw_used) + continue; + + workitem->avw_used = true; + workitem->avw_active = false; + workitem->avw_type = type; + workitem->avw_database = MyDatabaseId; + workitem->avw_relation = relationId; + workitem->avw_blockNumber = blkno; + result = true; + + /* done */ + break; + } + + LWLockRelease(AutovacuumLock); + + return result; +} + +/* + * autovac_init + * This is called at postmaster initialization. + * + * All we do here is annoy the user if he got it wrong. + */ +void +autovac_init(void) +{ + if (autovacuum_start_daemon && !pgstat_track_counts) + ereport(WARNING, + (errmsg("autovacuum not started because of misconfiguration"), + errhint("Enable the \"track_counts\" option."))); +} + +/* + * IsAutoVacuum functions + * Return whether this is either a launcher autovacuum process or a worker + * process. + */ +bool +IsAutoVacuumLauncherProcess(void) +{ + return am_autovacuum_launcher; +} + +bool +IsAutoVacuumWorkerProcess(void) +{ + return am_autovacuum_worker; +} + + +/* + * AutoVacuumShmemSize + * Compute space needed for autovacuum-related shared memory + */ +Size +AutoVacuumShmemSize(void) +{ + Size size; + + /* + * Need the fixed struct and the array of WorkerInfoData. 
+ */ + size = sizeof(AutoVacuumShmemStruct); + size = MAXALIGN(size); + size = add_size(size, mul_size(autovacuum_max_workers, + sizeof(WorkerInfoData))); + return size; +} + +/* + * AutoVacuumShmemInit + * Allocate and initialize autovacuum-related shared memory + */ +void +AutoVacuumShmemInit(void) +{ + bool found; + + AutoVacuumShmem = (AutoVacuumShmemStruct *) + ShmemInitStruct("AutoVacuum Data", + AutoVacuumShmemSize(), + &found); + + if (!IsUnderPostmaster) + { + WorkerInfo worker; + int i; + + Assert(!found); + + AutoVacuumShmem->av_launcherpid = 0; + dlist_init(&AutoVacuumShmem->av_freeWorkers); + dlist_init(&AutoVacuumShmem->av_runningWorkers); + AutoVacuumShmem->av_startingWorker = NULL; + memset(AutoVacuumShmem->av_workItems, 0, + sizeof(AutoVacuumWorkItem) * NUM_WORKITEMS); + + worker = (WorkerInfo) ((char *) AutoVacuumShmem + + MAXALIGN(sizeof(AutoVacuumShmemStruct))); + + /* initialize the WorkerInfo free list */ + for (i = 0; i < autovacuum_max_workers; i++) + dlist_push_head(&AutoVacuumShmem->av_freeWorkers, + &worker[i].wi_links); + } + else + Assert(found); +} + +/* + * autovac_refresh_stats + * Refresh pgstats data for an autovacuum process + * + * Cause the next pgstats read operation to obtain fresh data, but throttle + * such refreshing in the autovacuum launcher. This is mostly to avoid + * rereading the pgstats files too many times in quick succession when there + * are many databases. + * + * Note: we avoid throttling in the autovac worker, as it would be + * counterproductive in the recheck logic. + */ +static void +autovac_refresh_stats(void) +{ + if (IsAutoVacuumLauncherProcess()) + { + static TimestampTz last_read = 0; + TimestampTz current_time; + + current_time = GetCurrentTimestamp(); + + if (!TimestampDifferenceExceeds(last_read, current_time, + STATS_READ_DELAY)) + return; + + last_read = current_time; + } + + pgstat_clear_snapshot(); +} diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c new file mode 100644 index 0000000..c40410d --- /dev/null +++ b/src/backend/postmaster/bgworker.c @@ -0,0 +1,1325 @@ +/*-------------------------------------------------------------------- + * bgworker.c + * POSTGRES pluggable background workers implementation + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/postmaster/bgworker.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/parallel.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "port/atomics.h" +#include "postmaster/bgworker_internals.h" +#include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" +#include "replication/logicallauncher.h" +#include "replication/logicalworker.h" +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/lwlock.h" +#include "storage/pg_shmem.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" +#include "tcop/tcopprot.h" +#include "utils/ascii.h" +#include "utils/ps_status.h" +#include "utils/timeout.h" + +/* + * The postmaster's list of registered background workers, in private memory. + */ +slist_head BackgroundWorkerList = SLIST_STATIC_INIT(BackgroundWorkerList); + +/* + * BackgroundWorkerSlots exist in shared memory and can be accessed (via + * the BackgroundWorkerArray) by both the postmaster and by regular backends. 
+ * However, the postmaster cannot take locks, even spinlocks, because this + * might allow it to crash or become wedged if shared memory gets corrupted. + * Such an outcome is intolerable. Therefore, we need a lockless protocol + * for coordinating access to this data. + * + * The 'in_use' flag is used to hand off responsibility for the slot between + * the postmaster and the rest of the system. When 'in_use' is false, + * the postmaster will ignore the slot entirely, except for the 'in_use' flag + * itself, which it may read. In this state, regular backends may modify the + * slot. Once a backend sets 'in_use' to true, the slot becomes the + * responsibility of the postmaster. Regular backends may no longer modify it, + * but the postmaster may examine it. Thus, a backend initializing a slot + * must fully initialize the slot - and insert a write memory barrier - before + * marking it as in use. + * + * As an exception, however, even when the slot is in use, regular backends + * may set the 'terminate' flag for a slot, telling the postmaster not + * to restart it. Once the background worker is no longer running, the slot + * will be released for reuse. + * + * In addition to coordinating with the postmaster, backends modifying this + * data structure must coordinate with each other. Since they can take locks, + * this is straightforward: any backend wishing to manipulate a slot must + * take BackgroundWorkerLock in exclusive mode. Backends wishing to read + * data that might get concurrently modified by other backends should take + * this lock in shared mode. No matter what, backends reading this data + * structure must be able to tolerate concurrent modifications by the + * postmaster. + */ +typedef struct BackgroundWorkerSlot +{ + bool in_use; + bool terminate; + pid_t pid; /* InvalidPid = not started yet; 0 = dead */ + uint64 generation; /* incremented when slot is recycled */ + BackgroundWorker worker; +} BackgroundWorkerSlot; + +/* + * In order to limit the total number of parallel workers (according to + * max_parallel_workers GUC), we maintain the number of active parallel + * workers. Since the postmaster cannot take locks, two variables are used for + * this purpose: the number of registered parallel workers (modified by the + * backends, protected by BackgroundWorkerLock) and the number of terminated + * parallel workers (modified only by the postmaster, lockless). The active + * number of parallel workers is the number of registered workers minus the + * terminated ones. These counters can of course overflow, but it's not + * important here since the subtraction will still give the right number. + */ +typedef struct BackgroundWorkerArray +{ + int total_slots; + uint32 parallel_register_count; + uint32 parallel_terminate_count; + BackgroundWorkerSlot slot[FLEXIBLE_ARRAY_MEMBER]; +} BackgroundWorkerArray; + +struct BackgroundWorkerHandle +{ + int slot; + uint64 generation; +}; + +static BackgroundWorkerArray *BackgroundWorkerData; + +/* + * List of internal background worker entry points. We need this for + * reasons explained in LookupBackgroundWorkerFunction(), below. + */ +static const struct +{ + const char *fn_name; + bgworker_main_type fn_addr; +} InternalBGWorkers[] = + +{ + { + "ParallelWorkerMain", ParallelWorkerMain + }, + { + "ApplyLauncherMain", ApplyLauncherMain + }, + { + "ApplyWorkerMain", ApplyWorkerMain + } +}; + +/* Private functions. 
*/ +static bgworker_main_type LookupBackgroundWorkerFunction(const char *libraryname, const char *funcname); + + +/* + * Calculate shared memory needed. + */ +Size +BackgroundWorkerShmemSize(void) +{ + Size size; + + /* Array of workers is variably sized. */ + size = offsetof(BackgroundWorkerArray, slot); + size = add_size(size, mul_size(max_worker_processes, + sizeof(BackgroundWorkerSlot))); + + return size; +} + +/* + * Initialize shared memory. + */ +void +BackgroundWorkerShmemInit(void) +{ + bool found; + + BackgroundWorkerData = ShmemInitStruct("Background Worker Data", + BackgroundWorkerShmemSize(), + &found); + if (!IsUnderPostmaster) + { + slist_iter siter; + int slotno = 0; + + BackgroundWorkerData->total_slots = max_worker_processes; + BackgroundWorkerData->parallel_register_count = 0; + BackgroundWorkerData->parallel_terminate_count = 0; + + /* + * Copy contents of worker list into shared memory. Record the shared + * memory slot assigned to each worker. This ensures a 1-to-1 + * correspondence between the postmaster's private list and the array + * in shared memory. + */ + slist_foreach(siter, &BackgroundWorkerList) + { + BackgroundWorkerSlot *slot = &BackgroundWorkerData->slot[slotno]; + RegisteredBgWorker *rw; + + rw = slist_container(RegisteredBgWorker, rw_lnode, siter.cur); + Assert(slotno < max_worker_processes); + slot->in_use = true; + slot->terminate = false; + slot->pid = InvalidPid; + slot->generation = 0; + rw->rw_shmem_slot = slotno; + rw->rw_worker.bgw_notify_pid = 0; /* might be reinit after crash */ + memcpy(&slot->worker, &rw->rw_worker, sizeof(BackgroundWorker)); + ++slotno; + } + + /* + * Mark any remaining slots as not in use. + */ + while (slotno < max_worker_processes) + { + BackgroundWorkerSlot *slot = &BackgroundWorkerData->slot[slotno]; + + slot->in_use = false; + ++slotno; + } + } + else + Assert(found); +} + +/* + * Search the postmaster's backend-private list of RegisteredBgWorker objects + * for the one that maps to the given slot number. + */ +static RegisteredBgWorker * +FindRegisteredWorkerBySlotNumber(int slotno) +{ + slist_iter siter; + + slist_foreach(siter, &BackgroundWorkerList) + { + RegisteredBgWorker *rw; + + rw = slist_container(RegisteredBgWorker, rw_lnode, siter.cur); + if (rw->rw_shmem_slot == slotno) + return rw; + } + + return NULL; +} + +/* + * Notice changes to shared memory made by other backends. + * Accept new worker requests only if allow_new_workers is true. + * + * This code runs in the postmaster, so we must be very careful not to assume + * that shared memory contents are sane. Otherwise, a rogue backend could + * take out the postmaster. + */ +void +BackgroundWorkerStateChange(bool allow_new_workers) +{ + int slotno; + + /* + * The total number of slots stored in shared memory should match our + * notion of max_worker_processes. If it does not, something is very + * wrong. Further down, we always refer to this value as + * max_worker_processes, in case shared memory gets corrupted while we're + * looping. + */ + if (max_worker_processes != BackgroundWorkerData->total_slots) + { + ereport(LOG, + (errmsg("inconsistent background worker state (max_worker_processes=%d, total_slots=%d)", + max_worker_processes, + BackgroundWorkerData->total_slots))); + return; + } + + /* + * Iterate through slots, looking for newly-registered workers or workers + * who must die. 
+ */ + for (slotno = 0; slotno < max_worker_processes; ++slotno) + { + BackgroundWorkerSlot *slot = &BackgroundWorkerData->slot[slotno]; + RegisteredBgWorker *rw; + + if (!slot->in_use) + continue; + + /* + * Make sure we don't see the in_use flag before the updated slot + * contents. + */ + pg_read_barrier(); + + /* See whether we already know about this worker. */ + rw = FindRegisteredWorkerBySlotNumber(slotno); + if (rw != NULL) + { + /* + * In general, the worker data can't change after it's initially + * registered. However, someone can set the terminate flag. + */ + if (slot->terminate && !rw->rw_terminate) + { + rw->rw_terminate = true; + if (rw->rw_pid != 0) + kill(rw->rw_pid, SIGTERM); + else + { + /* Report never-started, now-terminated worker as dead. */ + ReportBackgroundWorkerPID(rw); + } + } + continue; + } + + /* + * If we aren't allowing new workers, then immediately mark it for + * termination; the next stanza will take care of cleaning it up. + * Doing this ensures that any process waiting for the worker will get + * awoken, even though the worker will never be allowed to run. + */ + if (!allow_new_workers) + slot->terminate = true; + + /* + * If the worker is marked for termination, we don't need to add it to + * the registered workers list; we can just free the slot. However, if + * bgw_notify_pid is set, the process that registered the worker may + * need to know that we've processed the terminate request, so be sure + * to signal it. + */ + if (slot->terminate) + { + int notify_pid; + + /* + * We need a memory barrier here to make sure that the load of + * bgw_notify_pid and the update of parallel_terminate_count + * complete before the store to in_use. + */ + notify_pid = slot->worker.bgw_notify_pid; + if ((slot->worker.bgw_flags & BGWORKER_CLASS_PARALLEL) != 0) + BackgroundWorkerData->parallel_terminate_count++; + slot->pid = 0; + + pg_memory_barrier(); + slot->in_use = false; + + if (notify_pid != 0) + kill(notify_pid, SIGUSR1); + + continue; + } + + /* + * Copy the registration data into the registered workers list. + */ + rw = malloc(sizeof(RegisteredBgWorker)); + if (rw == NULL) + { + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + return; + } + + /* + * Copy strings in a paranoid way. If shared memory is corrupted, the + * source data might not even be NUL-terminated. + */ + ascii_safe_strlcpy(rw->rw_worker.bgw_name, + slot->worker.bgw_name, BGW_MAXLEN); + ascii_safe_strlcpy(rw->rw_worker.bgw_type, + slot->worker.bgw_type, BGW_MAXLEN); + ascii_safe_strlcpy(rw->rw_worker.bgw_library_name, + slot->worker.bgw_library_name, BGW_MAXLEN); + ascii_safe_strlcpy(rw->rw_worker.bgw_function_name, + slot->worker.bgw_function_name, BGW_MAXLEN); + + /* + * Copy various fixed-size fields. + * + * flags, start_time, and restart_time are examined by the postmaster, + * but nothing too bad will happen if they are corrupted. The + * remaining fields will only be examined by the child process. It + * might crash, but we won't. + */ + rw->rw_worker.bgw_flags = slot->worker.bgw_flags; + rw->rw_worker.bgw_start_time = slot->worker.bgw_start_time; + rw->rw_worker.bgw_restart_time = slot->worker.bgw_restart_time; + rw->rw_worker.bgw_main_arg = slot->worker.bgw_main_arg; + memcpy(rw->rw_worker.bgw_extra, slot->worker.bgw_extra, BGW_EXTRALEN); + + /* + * Copy the PID to be notified about state changes, but only if the + * postmaster knows about a backend with that PID. 
It isn't an error + * if the postmaster doesn't know about the PID, because the backend + * that requested the worker could have died (or been killed) just + * after doing so. Nonetheless, at least until we get some experience + * with how this plays out in the wild, log a message at a relatively + * high debug level. + */ + rw->rw_worker.bgw_notify_pid = slot->worker.bgw_notify_pid; + if (!PostmasterMarkPIDForWorkerNotify(rw->rw_worker.bgw_notify_pid)) + { + elog(DEBUG1, "worker notification PID %ld is not valid", + (long) rw->rw_worker.bgw_notify_pid); + rw->rw_worker.bgw_notify_pid = 0; + } + + /* Initialize postmaster bookkeeping. */ + rw->rw_backend = NULL; + rw->rw_pid = 0; + rw->rw_child_slot = 0; + rw->rw_crashed_at = 0; + rw->rw_shmem_slot = slotno; + rw->rw_terminate = false; + + /* Log it! */ + ereport(DEBUG1, + (errmsg_internal("registering background worker \"%s\"", + rw->rw_worker.bgw_name))); + + slist_push_head(&BackgroundWorkerList, &rw->rw_lnode); + } +} + +/* + * Forget about a background worker that's no longer needed. + * + * The worker must be identified by passing an slist_mutable_iter that + * points to it. This convention allows deletion of workers during + * searches of the worker list, and saves having to search the list again. + * + * Caller is responsible for notifying bgw_notify_pid, if appropriate. + * + * This function must be invoked only in the postmaster. + */ +void +ForgetBackgroundWorker(slist_mutable_iter *cur) +{ + RegisteredBgWorker *rw; + BackgroundWorkerSlot *slot; + + rw = slist_container(RegisteredBgWorker, rw_lnode, cur->cur); + + Assert(rw->rw_shmem_slot < max_worker_processes); + slot = &BackgroundWorkerData->slot[rw->rw_shmem_slot]; + Assert(slot->in_use); + + /* + * We need a memory barrier here to make sure that the update of + * parallel_terminate_count completes before the store to in_use. + */ + if ((rw->rw_worker.bgw_flags & BGWORKER_CLASS_PARALLEL) != 0) + BackgroundWorkerData->parallel_terminate_count++; + + pg_memory_barrier(); + slot->in_use = false; + + ereport(DEBUG1, + (errmsg_internal("unregistering background worker \"%s\"", + rw->rw_worker.bgw_name))); + + slist_delete_current(cur); + free(rw); +} + +/* + * Report the PID of a newly-launched background worker in shared memory. + * + * This function should only be called from the postmaster. + */ +void +ReportBackgroundWorkerPID(RegisteredBgWorker *rw) +{ + BackgroundWorkerSlot *slot; + + Assert(rw->rw_shmem_slot < max_worker_processes); + slot = &BackgroundWorkerData->slot[rw->rw_shmem_slot]; + slot->pid = rw->rw_pid; + + if (rw->rw_worker.bgw_notify_pid != 0) + kill(rw->rw_worker.bgw_notify_pid, SIGUSR1); +} + +/* + * Report that the PID of a background worker is now zero because a + * previously-running background worker has exited. + * + * This function should only be called from the postmaster. + */ +void +ReportBackgroundWorkerExit(slist_mutable_iter *cur) +{ + RegisteredBgWorker *rw; + BackgroundWorkerSlot *slot; + int notify_pid; + + rw = slist_container(RegisteredBgWorker, rw_lnode, cur->cur); + + Assert(rw->rw_shmem_slot < max_worker_processes); + slot = &BackgroundWorkerData->slot[rw->rw_shmem_slot]; + slot->pid = rw->rw_pid; + notify_pid = rw->rw_worker.bgw_notify_pid; + + /* + * If this worker is slated for deregistration, do that before notifying + * the process which started it. Otherwise, if that process tries to + * reuse the slot immediately, it might not be available yet.
In theory + * that could happen anyway if the process checks slot->pid at just the + * wrong moment, but this makes the window narrower. + */ + if (rw->rw_terminate || + rw->rw_worker.bgw_restart_time == BGW_NEVER_RESTART) + ForgetBackgroundWorker(cur); + + if (notify_pid != 0) + kill(notify_pid, SIGUSR1); +} + +/* + * Cancel SIGUSR1 notifications for a PID belonging to an exiting backend. + * + * This function should only be called from the postmaster. + */ +void +BackgroundWorkerStopNotifications(pid_t pid) +{ + slist_iter siter; + + slist_foreach(siter, &BackgroundWorkerList) + { + RegisteredBgWorker *rw; + + rw = slist_container(RegisteredBgWorker, rw_lnode, siter.cur); + if (rw->rw_worker.bgw_notify_pid == pid) + rw->rw_worker.bgw_notify_pid = 0; + } +} + +/* + * Cancel any not-yet-started worker requests that have waiting processes. + * + * This is called during a normal ("smart" or "fast") database shutdown. + * After this point, no new background workers will be started, so anything + * that might be waiting for them needs to be kicked off its wait. We do + * that by cancelling the bgworker registration entirely, which is perhaps + * overkill, but since we're shutting down it does not matter whether the + * registration record sticks around. + * + * This function should only be called from the postmaster. + */ +void +ForgetUnstartedBackgroundWorkers(void) +{ + slist_mutable_iter iter; + + slist_foreach_modify(iter, &BackgroundWorkerList) + { + RegisteredBgWorker *rw; + BackgroundWorkerSlot *slot; + + rw = slist_container(RegisteredBgWorker, rw_lnode, iter.cur); + Assert(rw->rw_shmem_slot < max_worker_processes); + slot = &BackgroundWorkerData->slot[rw->rw_shmem_slot]; + + /* If it's not yet started, and there's someone waiting ... */ + if (slot->pid == InvalidPid && + rw->rw_worker.bgw_notify_pid != 0) + { + /* ... then zap it, and notify the waiter */ + int notify_pid = rw->rw_worker.bgw_notify_pid; + + ForgetBackgroundWorker(&iter); + if (notify_pid != 0) + kill(notify_pid, SIGUSR1); + } + } +} + +/* + * Reset background worker crash state. + * + * We assume that, after a crash-and-restart cycle, background workers without + * the never-restart flag should be restarted immediately, instead of waiting + * for bgw_restart_time to elapse. On the other hand, workers with that flag + * should be forgotten immediately, since we won't ever restart them. + * + * This function should only be called from the postmaster. + */ +void +ResetBackgroundWorkerCrashTimes(void) +{ + slist_mutable_iter iter; + + slist_foreach_modify(iter, &BackgroundWorkerList) + { + RegisteredBgWorker *rw; + + rw = slist_container(RegisteredBgWorker, rw_lnode, iter.cur); + + if (rw->rw_worker.bgw_restart_time == BGW_NEVER_RESTART) + { + /* + * Workers marked BGW_NEVER_RESTART shouldn't get relaunched after + * the crash, so forget about them. (If we wait until after the + * crash to forget about them, and they are parallel workers, + * parallel_terminate_count will get incremented after we've + * already zeroed parallel_register_count, which would be bad.) + */ + ForgetBackgroundWorker(&iter); + } + else + { + /* + * The accounting which we do via parallel_register_count and + * parallel_terminate_count would get messed up if a worker marked + * parallel could survive a crash and restart cycle. All such + * workers should be marked BGW_NEVER_RESTART, and thus control + * should never reach this branch. 
+ */ + Assert((rw->rw_worker.bgw_flags & BGWORKER_CLASS_PARALLEL) == 0); + + /* + * Allow this worker to be restarted immediately after we finish + * resetting. + */ + rw->rw_crashed_at = 0; + + /* + * If there was anyone waiting for it, they're history. + */ + rw->rw_worker.bgw_notify_pid = 0; + } + } +} + +#ifdef EXEC_BACKEND +/* + * In EXEC_BACKEND mode, workers use this to retrieve their details from + * shared memory. + */ +BackgroundWorker * +BackgroundWorkerEntry(int slotno) +{ + static BackgroundWorker myEntry; + BackgroundWorkerSlot *slot; + + Assert(slotno < BackgroundWorkerData->total_slots); + slot = &BackgroundWorkerData->slot[slotno]; + Assert(slot->in_use); + + /* must copy this in case we don't intend to retain shmem access */ + memcpy(&myEntry, &slot->worker, sizeof myEntry); + return &myEntry; +} +#endif + +/* + * Complain about the BackgroundWorker definition using error level elevel. + * Return true if it looks ok, false if not (unless elevel >= ERROR, in + * which case we won't return at all in the not-OK case). + */ +static bool +SanityCheckBackgroundWorker(BackgroundWorker *worker, int elevel) +{ + /* sanity check for flags */ + if (worker->bgw_flags & BGWORKER_BACKEND_DATABASE_CONNECTION) + { + if (!(worker->bgw_flags & BGWORKER_SHMEM_ACCESS)) + { + ereport(elevel, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("background worker \"%s\": must attach to shared memory in order to request a database connection", + worker->bgw_name))); + return false; + } + + if (worker->bgw_start_time == BgWorkerStart_PostmasterStart) + { + ereport(elevel, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("background worker \"%s\": cannot request database access if starting at postmaster start", + worker->bgw_name))); + return false; + } + + /* XXX other checks? */ + } + + if ((worker->bgw_restart_time < 0 && + worker->bgw_restart_time != BGW_NEVER_RESTART) || + (worker->bgw_restart_time > USECS_PER_DAY / 1000)) + { + ereport(elevel, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("background worker \"%s\": invalid restart interval", + worker->bgw_name))); + return false; + } + + /* + * Parallel workers may not be configured for restart, because the + * parallel_register_count/parallel_terminate_count accounting can't + * handle parallel workers lasting through a crash-and-restart cycle. + */ + if (worker->bgw_restart_time != BGW_NEVER_RESTART && + (worker->bgw_flags & BGWORKER_CLASS_PARALLEL) != 0) + { + ereport(elevel, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("background worker \"%s\": parallel workers may not be configured for restart", + worker->bgw_name))); + return false; + } + + /* + * If bgw_type is not filled in, use bgw_name. + */ + if (strcmp(worker->bgw_type, "") == 0) + strcpy(worker->bgw_type, worker->bgw_name); + + return true; +} + +/* + * Standard SIGTERM handler for background workers + */ +static void +bgworker_die(SIGNAL_ARGS) +{ + PG_SETMASK(&BlockSig); + + ereport(FATAL, + (errcode(ERRCODE_ADMIN_SHUTDOWN), + errmsg("terminating background worker \"%s\" due to administrator command", + MyBgworkerEntry->bgw_type))); +} + +/* + * Start a new background worker + * + * This is the main entry point for background worker, to be called from + * postmaster. 
+ */ +void +StartBackgroundWorker(void) +{ + sigjmp_buf local_sigjmp_buf; + BackgroundWorker *worker = MyBgworkerEntry; + bgworker_main_type entrypt; + + if (worker == NULL) + elog(FATAL, "unable to find bgworker entry"); + + IsBackgroundWorker = true; + + MyBackendType = B_BG_WORKER; + init_ps_display(worker->bgw_name); + + /* + * If we're not supposed to have shared memory access, then detach from + * shared memory. If we didn't request shared memory access, the + * postmaster won't force a cluster-wide restart if we exit unexpectedly, + * so we'd better make sure that we don't mess anything up that would + * require that sort of cleanup. + */ + if ((worker->bgw_flags & BGWORKER_SHMEM_ACCESS) == 0) + { + ShutdownLatchSupport(); + dsm_detach_all(); + PGSharedMemoryDetach(); + } + + SetProcessingMode(InitProcessing); + + /* Apply PostAuthDelay */ + if (PostAuthDelay > 0) + pg_usleep(PostAuthDelay * 1000000L); + + /* + * Set up signal handlers. + */ + if (worker->bgw_flags & BGWORKER_BACKEND_DATABASE_CONNECTION) + { + /* + * SIGINT is used to signal canceling the current action + */ + pqsignal(SIGINT, StatementCancelHandler); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGFPE, FloatExceptionHandler); + + /* XXX Any other handlers needed here? */ + } + else + { + pqsignal(SIGINT, SIG_IGN); + pqsignal(SIGUSR1, SIG_IGN); + pqsignal(SIGFPE, SIG_IGN); + } + pqsignal(SIGTERM, bgworker_die); + /* SIGQUIT handler was already set up by InitPostmasterChild */ + pqsignal(SIGHUP, SIG_IGN); + + InitializeTimeouts(); /* establishes SIGALRM handler */ + + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR2, SIG_IGN); + pqsignal(SIGCHLD, SIG_DFL); + + /* + * If an exception is encountered, processing resumes here. + * + * We just need to clean up, report the error, and go away. + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* Since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevent interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* + * sigsetjmp will have blocked all signals, but we may need to accept + * signals while communicating with our parallel leader. Once we've + * done HOLD_INTERRUPTS() it should be safe to unblock signals. + */ + BackgroundWorkerUnblockSignals(); + + /* Report the error to the parallel leader and the server log */ + EmitErrorReport(); + + /* + * Do we need more cleanup here? For shmem-connected bgworkers, we + * will call InitProcess below, which will install ProcKill as exit + * callback. That will take care of releasing locks, etc. + */ + + /* and go away */ + proc_exit(1); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + /* + * If the background worker request shared memory access, set that up now; + * else, detach all shared memory segments. + */ + if (worker->bgw_flags & BGWORKER_SHMEM_ACCESS) + { + /* + * Early initialization. Some of this could be useful even for + * background workers that aren't using shared memory, but they can + * call the individual startup routines for those subsystems if + * needed. + */ + BaseInit(); + + /* + * Create a per-backend PGPROC struct in shared memory, except in the + * EXEC_BACKEND case where this was done in SubPostmasterMain. We must + * do this before we can use LWLocks (and in the EXEC_BACKEND case we + * already had to do some stuff with LWLocks). + */ +#ifndef EXEC_BACKEND + InitProcess(); +#endif + } + + /* + * Look up the entry point function, loading its library if necessary. 
+ */ + entrypt = LookupBackgroundWorkerFunction(worker->bgw_library_name, + worker->bgw_function_name); + + /* + * Note that in normal processes, we would call InitPostgres here. For a + * worker, however, we don't know what database to connect to, yet; so we + * need to wait until the user code does it via + * BackgroundWorkerInitializeConnection(). + */ + + /* + * Now invoke the user-defined worker code + */ + entrypt(worker->bgw_main_arg); + + /* ... and if it returns, we're done */ + proc_exit(0); +} + +/* + * Register a new static background worker. + * + * This can only be called directly from postmaster or in the _PG_init + * function of a module library that's loaded by shared_preload_libraries; + * otherwise it will have no effect. + */ +void +RegisterBackgroundWorker(BackgroundWorker *worker) +{ + RegisteredBgWorker *rw; + static int numworkers = 0; + + if (!IsUnderPostmaster) + ereport(DEBUG1, + (errmsg_internal("registering background worker \"%s\"", worker->bgw_name))); + + if (!process_shared_preload_libraries_in_progress && + strcmp(worker->bgw_library_name, "postgres") != 0) + { + if (!IsUnderPostmaster) + ereport(LOG, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("background worker \"%s\": must be registered in shared_preload_libraries", + worker->bgw_name))); + return; + } + + if (!SanityCheckBackgroundWorker(worker, LOG)) + return; + + if (worker->bgw_notify_pid != 0) + { + ereport(LOG, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("background worker \"%s\": only dynamic background workers can request notification", + worker->bgw_name))); + return; + } + + /* + * Enforce maximum number of workers. Note this is overly restrictive: we + * could allow more non-shmem-connected workers, because these don't count + * towards the MAX_BACKENDS limit elsewhere. For now, it doesn't seem + * important to relax this restriction. + */ + if (++numworkers > max_worker_processes) + { + ereport(LOG, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("too many background workers"), + errdetail_plural("Up to %d background worker can be registered with the current settings.", + "Up to %d background workers can be registered with the current settings.", + max_worker_processes, + max_worker_processes), + errhint("Consider increasing the configuration parameter \"max_worker_processes\"."))); + return; + } + + /* + * Copy the registration data into the registered workers list. + */ + rw = malloc(sizeof(RegisteredBgWorker)); + if (rw == NULL) + { + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + return; + } + + rw->rw_worker = *worker; + rw->rw_backend = NULL; + rw->rw_pid = 0; + rw->rw_child_slot = 0; + rw->rw_crashed_at = 0; + rw->rw_terminate = false; + + slist_push_head(&BackgroundWorkerList, &rw->rw_lnode); +} + +/* + * Register a new background worker from a regular backend. + * + * Returns true on success and false on failure. Failure typically indicates + * that no background worker slots are currently available. + * + * If handle != NULL, we'll set *handle to a pointer that can subsequently + * be used as an argument to GetBackgroundWorkerPid(). The caller can + * free this pointer using pfree(), if desired. + */ +bool +RegisterDynamicBackgroundWorker(BackgroundWorker *worker, + BackgroundWorkerHandle **handle) +{ + int slotno; + bool success = false; + bool parallel; + uint64 generation = 0; + + /* + * We can't register dynamic background workers from the postmaster. 
If + * this is a standalone backend, we're the only process and can't start + * any more. In a multi-process environment, it might be theoretically + * possible, but we don't currently support it due to locking + * considerations; see comments on the BackgroundWorkerSlot data + * structure. + */ + if (!IsUnderPostmaster) + return false; + + if (!SanityCheckBackgroundWorker(worker, ERROR)) + return false; + + parallel = (worker->bgw_flags & BGWORKER_CLASS_PARALLEL) != 0; + + LWLockAcquire(BackgroundWorkerLock, LW_EXCLUSIVE); + + /* + * If this is a parallel worker, check whether there are already too many + * parallel workers; if so, don't register another one. Our view of + * parallel_terminate_count may be slightly stale, but that doesn't really + * matter: we would have gotten the same result if we'd arrived here + * slightly earlier anyway. There's no help for it, either, since the + * postmaster must not take locks; a memory barrier wouldn't guarantee + * anything useful. + */ + if (parallel && (BackgroundWorkerData->parallel_register_count - + BackgroundWorkerData->parallel_terminate_count) >= + max_parallel_workers) + { + Assert(BackgroundWorkerData->parallel_register_count - + BackgroundWorkerData->parallel_terminate_count <= + MAX_PARALLEL_WORKER_LIMIT); + LWLockRelease(BackgroundWorkerLock); + return false; + } + + /* + * Look for an unused slot. If we find one, grab it. + */ + for (slotno = 0; slotno < BackgroundWorkerData->total_slots; ++slotno) + { + BackgroundWorkerSlot *slot = &BackgroundWorkerData->slot[slotno]; + + if (!slot->in_use) + { + memcpy(&slot->worker, worker, sizeof(BackgroundWorker)); + slot->pid = InvalidPid; /* indicates not started yet */ + slot->generation++; + slot->terminate = false; + generation = slot->generation; + if (parallel) + BackgroundWorkerData->parallel_register_count++; + + /* + * Make sure postmaster doesn't see the slot as in use before it + * sees the new contents. + */ + pg_write_barrier(); + + slot->in_use = true; + success = true; + break; + } + } + + LWLockRelease(BackgroundWorkerLock); + + /* If we found a slot, tell the postmaster to notice the change. */ + if (success) + SendPostmasterSignal(PMSIGNAL_BACKGROUND_WORKER_CHANGE); + + /* + * If we found a slot and the user has provided a handle, initialize it. + */ + if (success && handle) + { + *handle = palloc(sizeof(BackgroundWorkerHandle)); + (*handle)->slot = slotno; + (*handle)->generation = generation; + } + + return success; +} + +/* + * Get the PID of a dynamically-registered background worker. + * + * If the worker is determined to be running, the return value will be + * BGWH_STARTED and *pidp will get the PID of the worker process. If the + * postmaster has not yet attempted to start the worker, the return value will + * be BGWH_NOT_YET_STARTED. Otherwise, the return value is BGWH_STOPPED. + * + * BGWH_STOPPED can indicate either that the worker is temporarily stopped + * (because it is configured for automatic restart and exited non-zero), + * or that the worker is permanently stopped (because it exited with exit + * code 0, or was not configured for automatic restart), or even that the + * worker was unregistered without ever starting (either because startup + * failed and the worker is not configured for automatic restart, or because + * TerminateBackgroundWorker was used before the worker was successfully + * started). 
+ */ +BgwHandleStatus +GetBackgroundWorkerPid(BackgroundWorkerHandle *handle, pid_t *pidp) +{ + BackgroundWorkerSlot *slot; + pid_t pid; + + Assert(handle->slot < max_worker_processes); + slot = &BackgroundWorkerData->slot[handle->slot]; + + /* + * We could probably arrange to synchronize access to data using memory + * barriers only, but for now, let's just keep it simple and grab the + * lock. It seems unlikely that there will be enough traffic here to + * result in meaningful contention. + */ + LWLockAcquire(BackgroundWorkerLock, LW_SHARED); + + /* + * The generation number can't be concurrently changed while we hold the + * lock. The pid, which is updated by the postmaster, can change at any + * time, but we assume such changes are atomic. So the value we read + * won't be garbage, but it might be out of date by the time the caller + * examines it (but that's unavoidable anyway). + * + * The in_use flag could be in the process of changing from true to false, + * but if it is already false then it can't change further. + */ + if (handle->generation != slot->generation || !slot->in_use) + pid = 0; + else + pid = slot->pid; + + /* All done. */ + LWLockRelease(BackgroundWorkerLock); + + if (pid == 0) + return BGWH_STOPPED; + else if (pid == InvalidPid) + return BGWH_NOT_YET_STARTED; + *pidp = pid; + return BGWH_STARTED; +} + +/* + * Wait for a background worker to start up. + * + * This is like GetBackgroundWorkerPid(), except that if the worker has not + * yet started, we wait for it to do so; thus, BGWH_NOT_YET_STARTED is never + * returned. However, if the postmaster has died, we give up and return + * BGWH_POSTMASTER_DIED, since in that case we know that startup will not + * take place. + * + * The caller *must* have set our PID as the worker's bgw_notify_pid, + * else we will not be awoken promptly when the worker's state changes. + */ +BgwHandleStatus +WaitForBackgroundWorkerStartup(BackgroundWorkerHandle *handle, pid_t *pidp) +{ + BgwHandleStatus status; + int rc; + + for (;;) + { + pid_t pid; + + CHECK_FOR_INTERRUPTS(); + + status = GetBackgroundWorkerPid(handle, &pid); + if (status == BGWH_STARTED) + *pidp = pid; + if (status != BGWH_NOT_YET_STARTED) + break; + + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_POSTMASTER_DEATH, 0, + WAIT_EVENT_BGWORKER_STARTUP); + + if (rc & WL_POSTMASTER_DEATH) + { + status = BGWH_POSTMASTER_DIED; + break; + } + + ResetLatch(MyLatch); + } + + return status; +} + +/* + * Wait for a background worker to stop. + * + * If the worker hasn't yet started, or is running, we wait for it to stop + * and then return BGWH_STOPPED. However, if the postmaster has died, we give + * up and return BGWH_POSTMASTER_DIED, because it's the postmaster that + * notifies us when a worker's state changes. + * + * The caller *must* have set our PID as the worker's bgw_notify_pid, + * else we will not be awoken promptly when the worker's state changes. + */ +BgwHandleStatus +WaitForBackgroundWorkerShutdown(BackgroundWorkerHandle *handle) +{ + BgwHandleStatus status; + int rc; + + for (;;) + { + pid_t pid; + + CHECK_FOR_INTERRUPTS(); + + status = GetBackgroundWorkerPid(handle, &pid); + if (status == BGWH_STOPPED) + break; + + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_POSTMASTER_DEATH, 0, + WAIT_EVENT_BGWORKER_SHUTDOWN); + + if (rc & WL_POSTMASTER_DEATH) + { + status = BGWH_POSTMASTER_DIED; + break; + } + + ResetLatch(MyLatch); + } + + return status; +} + +/* + * Instruct the postmaster to terminate a background worker.
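+ *
+ * As an illustration only (this sketch is not part of the original source
+ * file, and the names launch_example_worker, my_extension and my_worker_main
+ * are hypothetical), a regular backend might combine this with
+ * RegisterDynamicBackgroundWorker() and WaitForBackgroundWorkerStartup()
+ * roughly as follows:
+ *
+ *		static void
+ *		launch_example_worker(void)
+ *		{
+ *			BackgroundWorker worker;
+ *			BackgroundWorkerHandle *handle;
+ *			pid_t		pid;
+ *
+ *			memset(&worker, 0, sizeof(worker));
+ *			worker.bgw_flags = BGWORKER_SHMEM_ACCESS;
+ *			worker.bgw_start_time = BgWorkerStart_RecoveryFinished;
+ *			worker.bgw_restart_time = BGW_NEVER_RESTART;
+ *			snprintf(worker.bgw_library_name, BGW_MAXLEN, "my_extension");
+ *			snprintf(worker.bgw_function_name, BGW_MAXLEN, "my_worker_main");
+ *			snprintf(worker.bgw_name, BGW_MAXLEN, "example dynamic worker");
+ *			snprintf(worker.bgw_type, BGW_MAXLEN, "example dynamic worker");
+ *			worker.bgw_notify_pid = MyProcPid;	/* needed for the waits */
+ *
+ *			if (!RegisterDynamicBackgroundWorker(&worker, &handle))
+ *				ereport(ERROR,
+ *						(errmsg("no free background worker slots")));
+ *			if (WaitForBackgroundWorkerStartup(handle, &pid) == BGWH_STARTED)
+ *				TerminateBackgroundWorker(handle);
+ *		}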
+ * + * Note that it's safe to do this without regard to whether the worker is + * still running, or even if the worker may already have exited and been + * unregistered. + */ +void +TerminateBackgroundWorker(BackgroundWorkerHandle *handle) +{ + BackgroundWorkerSlot *slot; + bool signal_postmaster = false; + + Assert(handle->slot < max_worker_processes); + slot = &BackgroundWorkerData->slot[handle->slot]; + + /* Set terminate flag in shared memory, unless slot has been reused. */ + LWLockAcquire(BackgroundWorkerLock, LW_EXCLUSIVE); + if (handle->generation == slot->generation) + { + slot->terminate = true; + signal_postmaster = true; + } + LWLockRelease(BackgroundWorkerLock); + + /* Make sure the postmaster notices the change to shared memory. */ + if (signal_postmaster) + SendPostmasterSignal(PMSIGNAL_BACKGROUND_WORKER_CHANGE); +} + +/* + * Look up (and possibly load) a bgworker entry point function. + * + * For functions contained in the core code, we use library name "postgres" + * and consult the InternalBGWorkers array. External functions are + * looked up, and loaded if necessary, using load_external_function(). + * + * The point of this is to pass function names as strings across process + * boundaries. We can't pass actual function addresses because of the + * possibility that the function has been loaded at a different address + * in a different process. This is obviously a hazard for functions in + * loadable libraries, but it can happen even for functions in the core code + * on platforms using EXEC_BACKEND (e.g., Windows). + * + * At some point it might be worthwhile to get rid of InternalBGWorkers[] + * in favor of applying load_external_function() for core functions too; + * but that raises portability issues that are not worth addressing now. + */ +static bgworker_main_type +LookupBackgroundWorkerFunction(const char *libraryname, const char *funcname) +{ + /* + * If the function is to be loaded from postgres itself, search the + * InternalBGWorkers array. + */ + if (strcmp(libraryname, "postgres") == 0) + { + int i; + + for (i = 0; i < lengthof(InternalBGWorkers); i++) + { + if (strcmp(InternalBGWorkers[i].fn_name, funcname) == 0) + return InternalBGWorkers[i].fn_addr; + } + + /* We can only reach this by programming error. */ + elog(ERROR, "internal function \"%s\" not found", funcname); + } + + /* Otherwise load from external library. */ + return (bgworker_main_type) + load_external_function(libraryname, funcname, true, NULL); +} + +/* + * Given a PID, get the bgw_type of the background worker. Returns NULL if + * not a valid background worker. + * + * The return value is in static memory belonging to this function, so it has + * to be used before calling this function again. This is so that the caller + * doesn't have to worry about the background worker locking protocol. 
+ */ +const char * +GetBackgroundWorkerTypeByPid(pid_t pid) +{ + int slotno; + bool found = false; + static char result[BGW_MAXLEN]; + + LWLockAcquire(BackgroundWorkerLock, LW_SHARED); + + for (slotno = 0; slotno < BackgroundWorkerData->total_slots; slotno++) + { + BackgroundWorkerSlot *slot = &BackgroundWorkerData->slot[slotno]; + + if (slot->pid > 0 && slot->pid == pid) + { + strcpy(result, slot->worker.bgw_type); + found = true; + break; + } + } + + LWLockRelease(BackgroundWorkerLock); + + if (!found) + return NULL; + + return result; +} diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c new file mode 100644 index 0000000..715d519 --- /dev/null +++ b/src/backend/postmaster/bgwriter.c @@ -0,0 +1,351 @@ +/*------------------------------------------------------------------------- + * + * bgwriter.c + * + * The background writer (bgwriter) is new as of Postgres 8.0. It attempts + * to keep regular backends from having to write out dirty shared buffers + * (which they would only do when needing to free a shared buffer to read in + * another page). In the best scenario all writes from shared buffers will + * be issued by the background writer process. However, regular backends are + * still empowered to issue writes if the bgwriter fails to maintain enough + * clean shared buffers. + * + * As of Postgres 9.2 the bgwriter no longer handles checkpoints. + * + * The bgwriter is started by the postmaster as soon as the startup subprocess + * finishes, or as soon as recovery begins if we are doing archive recovery. + * It remains alive until the postmaster commands it to terminate. + * Normal termination is by SIGTERM, which instructs the bgwriter to exit(0). + * Emergency termination is by SIGQUIT; like any backend, the bgwriter will + * simply abort and exit on SIGQUIT. + * + * If the bgwriter exits unexpectedly, the postmaster treats that the same + * as a backend crash: shared memory may be corrupted, so remaining backends + * should be killed by SIGQUIT and then a recovery cycle started. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/postmaster/bgwriter.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgwriter.h" +#include "postmaster/interrupt.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "storage/condition_variable.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" +#include "storage/smgr.h" +#include "storage/spin.h" +#include "storage/standby.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/resowner.h" +#include "utils/timestamp.h" + +/* + * GUC parameters + */ +int BgWriterDelay = 200; + +/* + * Multiplier to apply to BgWriterDelay when we decide to hibernate. + * (Perhaps this needs to be configurable?) + */ +#define HIBERNATE_FACTOR 50 + +/* + * Interval in which standby snapshots are logged into the WAL stream, in + * milliseconds. + */ +#define LOG_SNAPSHOT_INTERVAL_MS 15000 + +/* + * LSN and timestamp at which we last issued a LogStandbySnapshot(), to avoid + * doing so too often or repeatedly if there has been no other write activity + * in the system. 
+ */ +static TimestampTz last_snapshot_ts; +static XLogRecPtr last_snapshot_lsn = InvalidXLogRecPtr; + + +/* + * Main entry point for bgwriter process + * + * This is invoked from AuxiliaryProcessMain, which has already created the + * basic execution environment, but not enabled signals yet. + */ +void +BackgroundWriterMain(void) +{ + sigjmp_buf local_sigjmp_buf; + MemoryContext bgwriter_context; + bool prev_hibernate; + WritebackContext wb_context; + + /* + * Properly accept or ignore signals that might be sent to us. + */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGINT, SIG_IGN); + pqsignal(SIGTERM, SignalHandlerForShutdownRequest); + /* SIGQUIT handler was already set up by InitPostmasterChild */ + pqsignal(SIGALRM, SIG_IGN); + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, SIG_IGN); + + /* + * Reset some signals that are accepted by postmaster but not here + */ + pqsignal(SIGCHLD, SIG_DFL); + + /* + * We just started, assume there has been either a shutdown or + * end-of-recovery snapshot. + */ + last_snapshot_ts = GetCurrentTimestamp(); + + /* + * Create a memory context that we will do all our work in. We do this so + * that we can reset the context during error recovery and thereby avoid + * possible memory leaks. Formerly this code just ran in + * TopMemoryContext, but resetting that would be a really bad idea. + */ + bgwriter_context = AllocSetContextCreate(TopMemoryContext, + "Background Writer", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(bgwriter_context); + + WritebackContextInit(&wb_context, &bgwriter_flush_after); + + /* + * If an exception is encountered, processing resumes here. + * + * You might wonder why this isn't coded as an infinite loop around a + * PG_TRY construct. The reason is that this is the bottom of the + * exception stack, and so with PG_TRY there would be no exception handler + * in force at all during the CATCH part. By leaving the outermost setjmp + * always active, we have at least some chance of recovering from an error + * during error recovery. (If we get into an infinite loop thereby, it + * will soon be stopped by overflow of elog.c's internal state stack.) + * + * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask + * (to wit, BlockSig) will be restored when longjmp'ing to here. Thus, + * signals other than SIGQUIT will be blocked until we complete error + * recovery. It might seem that this policy makes the HOLD_INTERRUPTS() + * call redundant, but it is not since InterruptPending might be set + * already. + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* Since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevent interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* Report the error to the server log */ + EmitErrorReport(); + + /* + * These operations are really just a minimal subset of + * AbortTransaction(). We don't have very many resources to worry + * about in bgwriter, but we do have LWLocks, buffers, and temp files. + */ + LWLockReleaseAll(); + ConditionVariableCancelSleep(); + AbortBufferIO(); + UnlockBuffers(); + ReleaseAuxProcessResources(false); + AtEOXact_Buffers(false); + AtEOXact_SMgr(); + AtEOXact_Files(false); + AtEOXact_HashTables(false); + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. 
+ */ + MemoryContextSwitchTo(bgwriter_context); + FlushErrorState(); + + /* Flush any leaked data in the top-level context */ + MemoryContextResetAndDeleteChildren(bgwriter_context); + + /* re-initialize to avoid repeated errors causing problems */ + WritebackContextInit(&wb_context, &bgwriter_flush_after); + + /* Now we can allow interrupts again */ + RESUME_INTERRUPTS(); + + /* + * Sleep at least 1 second after any error. A write error is likely + * to be repeated, and we don't want to be filling the error logs as + * fast as we can. + */ + pg_usleep(1000000L); + + /* + * Close all open files after any error. This is helpful on Windows, + * where holding deleted files open causes various strange errors. + * It's not clear we need it elsewhere, but shouldn't hurt. + */ + smgrcloseall(); + + /* Report wait end here, when there is no further possibility of wait */ + pgstat_report_wait_end(); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + /* + * Unblock signals (they were blocked when the postmaster forked us) + */ + PG_SETMASK(&UnBlockSig); + + /* + * Reset hibernation state after any error. + */ + prev_hibernate = false; + + /* + * Loop forever + */ + for (;;) + { + bool can_hibernate; + int rc; + + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + HandleMainLoopInterrupts(); + + /* + * Do one cycle of dirty-buffer writing. + */ + can_hibernate = BgBufferSync(&wb_context); + + /* + * Send off activity statistics to the stats collector + */ + pgstat_send_bgwriter(); + + if (FirstCallSinceLastCheckpoint()) + { + /* + * After any checkpoint, close all smgr files. This is so we + * won't hang onto smgr references to deleted files indefinitely. + */ + smgrcloseall(); + } + + /* + * Log a new xl_running_xacts every now and then so replication can + * get into a consistent state faster (think of suboverflowed + * snapshots) and clean up resources (locks, KnownAssignedXids*) more + * frequently. The costs of this are relatively low, so doing it 4 + * times (LOG_SNAPSHOT_INTERVAL_MS) a minute seems fine. + * + * We assume the interval for writing xl_running_xacts is + * significantly bigger than BgWriterDelay, so we don't complicate the + * overall timeout handling but just assume we're going to get called + * often enough even if hibernation mode is active. It's not that + * important that LOG_SNAPSHOT_INTERVAL_MS is met strictly. To make + * sure we're not waking the disk up unnecessarily on an idle system + * we check whether there has been any WAL inserted since the last + * time we've logged a running xacts. + * + * We do this logging in the bgwriter as it is the only process that + * is run regularly and returns to its mainloop all the time. E.g. + * Checkpointer, when active, is barely ever in its mainloop and thus + * makes it hard to log regularly. + */ + if (XLogStandbyInfoActive() && !RecoveryInProgress()) + { + TimestampTz timeout = 0; + TimestampTz now = GetCurrentTimestamp(); + + timeout = TimestampTzPlusMilliseconds(last_snapshot_ts, + LOG_SNAPSHOT_INTERVAL_MS); + + /* + * Only log if enough time has passed and interesting records have + * been inserted since the last snapshot. Have to compare with <= + * instead of < because GetLastImportantRecPtr() points at the + * start of a record, whereas last_snapshot_lsn points just past + * the end of the record.
+ */ + if (now >= timeout && + last_snapshot_lsn <= GetLastImportantRecPtr()) + { + last_snapshot_lsn = LogStandbySnapshot(); + last_snapshot_ts = now; + } + } + + /* + * Sleep until we are signaled or BgWriterDelay has elapsed. + * + * Note: the feedback control loop in BgBufferSync() expects that we + * will call it every BgWriterDelay msec. While it's not critical for + * correctness that that be exact, the feedback loop might misbehave + * if we stray too far from that. Hence, avoid loading this process + * down with latch events that are likely to happen frequently during + * normal operation. + */ + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + BgWriterDelay /* ms */ , WAIT_EVENT_BGWRITER_MAIN); + + /* + * If no latch event and BgBufferSync says nothing's happening, extend + * the sleep in "hibernation" mode, where we sleep for much longer + * than bgwriter_delay says. Fewer wakeups save electricity. When a + * backend starts using buffers again, it will wake us up by setting + * our latch. Because the extra sleep will persist only as long as no + * buffer allocations happen, this should not distort the behavior of + * BgBufferSync's control loop too badly; essentially, it will think + * that the system-wide idle interval didn't exist. + * + * There is a race condition here, in that a backend might allocate a + * buffer between the time BgBufferSync saw the alloc count as zero + * and the time we call StrategyNotifyBgWriter. While it's not + * critical that we not hibernate anyway, we try to reduce the odds of + * that by only hibernating when BgBufferSync says nothing's happening + * for two consecutive cycles. Also, we mitigate any possible + * consequences of a missed wakeup by not hibernating forever. + */ + if (rc == WL_TIMEOUT && can_hibernate && prev_hibernate) + { + /* Ask for notification at next buffer allocation */ + StrategyNotifyBgWriter(MyProc->pgprocno); + /* Sleep ... */ + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + BgWriterDelay * HIBERNATE_FACTOR, + WAIT_EVENT_BGWRITER_HIBERNATE); + /* Reset the notification request in case we timed out */ + StrategyNotifyBgWriter(-1); + } + + prev_hibernate = can_hibernate; + } +} diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c new file mode 100644 index 0000000..8699675 --- /dev/null +++ b/src/backend/postmaster/checkpointer.c @@ -0,0 +1,1354 @@ +/*------------------------------------------------------------------------- + * + * checkpointer.c + * + * The checkpointer is new as of Postgres 9.2. It handles all checkpoints. + * Checkpoints are automatically dispatched after a certain amount of time has + * elapsed since the last one, and it can be signaled to perform requested + * checkpoints as well. (The GUC parameter that mandates a checkpoint every + * so many WAL segments is implemented by having backends signal when they + * fill WAL segments; the checkpointer itself doesn't watch for the + * condition.) + * + * The checkpointer is started by the postmaster as soon as the startup + * subprocess finishes, or as soon as recovery begins if we are doing archive + * recovery. It remains alive until the postmaster commands it to terminate. + * Normal termination is by SIGUSR2, which instructs the checkpointer to + * execute a shutdown checkpoint and then exit(0). (All backends must be + * stopped before SIGUSR2 is issued!) 
Emergency termination is by SIGQUIT; + * like any backend, the checkpointer will simply abort and exit on SIGQUIT. + * + * If the checkpointer exits unexpectedly, the postmaster treats that the same + * as a backend crash: shared memory may be corrupted, so remaining backends + * should be killed by SIGQUIT and then a recovery cycle started. (Even if + * shared memory isn't corrupted, we have lost information about which + * files need to be fsync'd for the next checkpoint, and so a system + * restart needs to be forced.) + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/postmaster/checkpointer.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <sys/time.h> +#include <time.h> + +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgwriter.h" +#include "postmaster/interrupt.h" +#include "replication/syncrep.h" +#include "storage/bufmgr.h" +#include "storage/condition_variable.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" +#include "storage/smgr.h" +#include "storage/spin.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/resowner.h" + + +/*---------- + * Shared memory area for communication between checkpointer and backends + * + * The ckpt counters allow backends to watch for completion of a checkpoint + * request they send. Here's how it works: + * * At start of a checkpoint, checkpointer reads (and clears) the request + * flags and increments ckpt_started, while holding ckpt_lck. + * * On completion of a checkpoint, checkpointer sets ckpt_done to + * equal ckpt_started. + * * On failure of a checkpoint, checkpointer increments ckpt_failed + * and sets ckpt_done to equal ckpt_started. + * + * The algorithm for backends is: + * 1. Record current values of ckpt_failed and ckpt_started, and + * set request flags, while holding ckpt_lck. + * 2. Send signal to request checkpoint. + * 3. Sleep until ckpt_started changes. Now you know a checkpoint has + * begun since you started this algorithm (although *not* that it was + * specifically initiated by your signal), and that it is using your flags. + * 4. Record new value of ckpt_started. + * 5. Sleep until ckpt_done >= saved value of ckpt_started. (Use modulo + * arithmetic here in case counters wrap around.) Now you know a + * checkpoint has started and completed, but not whether it was + * successful. + * 6. If ckpt_failed is different from the originally saved value, + * assume request failed; otherwise it was definitely successful. + * + * ckpt_flags holds the OR of the checkpoint request flags sent by all + * requesting backends since the last checkpoint start. The flags are + * chosen so that OR'ing is the correct way to combine multiple requests. + * + * num_backend_writes is used to count the number of buffer writes performed + * by user backend processes. This counter should be wide enough that it + * can't overflow during a single processing cycle. num_backend_fsync + * counts the subset of those writes that also had to do their own fsync, + * because the checkpointer failed to absorb their request. + * + * The requests array holds fsync requests sent by backends and not yet + * absorbed by the checkpointer. 
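+ *
+ * As a condensed sketch of the backend-side algorithm above (this block is
+ * illustrative only and not part of the original comment; the actual
+ * implementation is RequestCheckpoint(), further down in this file), a
+ * backend requesting a checkpoint does approximately this:
+ *
+ *		SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
+ *		old_failed = CheckpointerShmem->ckpt_failed;
+ *		old_started = CheckpointerShmem->ckpt_started;
+ *		CheckpointerShmem->ckpt_flags |= (flags | CHECKPOINT_REQUESTED);
+ *		SpinLockRelease(&CheckpointerShmem->ckpt_lck);
+ *
+ *		kill(CheckpointerShmem->checkpointer_pid, SIGINT);	/* wake it up */
+ *
+ * after which it sleeps on start_cv until ckpt_started advances past
+ * old_started, then on done_cv until ckpt_done catches up with the observed
+ * ckpt_started, and finally compares ckpt_failed against old_failed to
+ * decide whether the checkpoint succeeded.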
+ * + * Unlike the checkpoint fields, num_backend_writes, num_backend_fsync, and + * the requests fields are protected by CheckpointerCommLock. + *---------- + */ +typedef struct +{ + SyncRequestType type; /* request type */ + FileTag ftag; /* file identifier */ +} CheckpointerRequest; + +typedef struct +{ + pid_t checkpointer_pid; /* PID (0 if not started) */ + + slock_t ckpt_lck; /* protects all the ckpt_* fields */ + + int ckpt_started; /* advances when checkpoint starts */ + int ckpt_done; /* advances when checkpoint done */ + int ckpt_failed; /* advances when checkpoint fails */ + + int ckpt_flags; /* checkpoint flags, as defined in xlog.h */ + + ConditionVariable start_cv; /* signaled when ckpt_started advances */ + ConditionVariable done_cv; /* signaled when ckpt_done advances */ + + uint32 num_backend_writes; /* counts user backend buffer writes */ + uint32 num_backend_fsync; /* counts user backend fsync calls */ + + int num_requests; /* current # of requests */ + int max_requests; /* allocated array size */ + CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER]; +} CheckpointerShmemStruct; + +static CheckpointerShmemStruct *CheckpointerShmem; + +/* interval for calling AbsorbSyncRequests in CheckpointWriteDelay */ +#define WRITES_PER_ABSORB 1000 + +/* + * GUC parameters + */ +int CheckPointTimeout = 300; +int CheckPointWarning = 30; +double CheckPointCompletionTarget = 0.9; + +/* + * Private state + */ +static bool ckpt_active = false; + +/* these values are valid when ckpt_active is true: */ +static pg_time_t ckpt_start_time; +static XLogRecPtr ckpt_start_recptr; +static double ckpt_cached_elapsed; + +static pg_time_t last_checkpoint_time; +static pg_time_t last_xlog_switch_time; + +/* Prototypes for private functions */ + +static void HandleCheckpointerInterrupts(void); +static void CheckArchiveTimeout(void); +static bool IsCheckpointOnSchedule(double progress); +static bool ImmediateCheckpointRequested(void); +static bool CompactCheckpointerRequestQueue(void); +static void UpdateSharedMemoryConfig(void); + +/* Signal handlers */ +static void ReqCheckpointHandler(SIGNAL_ARGS); + + +/* + * Main entry point for checkpointer process + * + * This is invoked from AuxiliaryProcessMain, which has already created the + * basic execution environment, but not enabled signals yet. + */ +void +CheckpointerMain(void) +{ + sigjmp_buf local_sigjmp_buf; + MemoryContext checkpointer_context; + + CheckpointerShmem->checkpointer_pid = MyProcPid; + + /* + * Properly accept or ignore signals the postmaster might send us + * + * Note: we deliberately ignore SIGTERM, because during a standard Unix + * system shutdown cycle, init will SIGTERM all processes at once. We + * want to wait for the backends to exit, whereupon the postmaster will + * tell us it's okay to shut down (via SIGUSR2). + */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGINT, ReqCheckpointHandler); /* request checkpoint */ + pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */ + /* SIGQUIT handler was already set up by InitPostmasterChild */ + pqsignal(SIGALRM, SIG_IGN); + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, SignalHandlerForShutdownRequest); + + /* + * Reset some signals that are accepted by postmaster but not here + */ + pqsignal(SIGCHLD, SIG_DFL); + + /* + * Initialize so that first time-driven event happens at the correct time. 
+ */ + last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); + + /* + * Create a memory context that we will do all our work in. We do this so + * that we can reset the context during error recovery and thereby avoid + * possible memory leaks. Formerly this code just ran in + * TopMemoryContext, but resetting that would be a really bad idea. + */ + checkpointer_context = AllocSetContextCreate(TopMemoryContext, + "Checkpointer", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(checkpointer_context); + + /* + * If an exception is encountered, processing resumes here. + * + * You might wonder why this isn't coded as an infinite loop around a + * PG_TRY construct. The reason is that this is the bottom of the + * exception stack, and so with PG_TRY there would be no exception handler + * in force at all during the CATCH part. By leaving the outermost setjmp + * always active, we have at least some chance of recovering from an error + * during error recovery. (If we get into an infinite loop thereby, it + * will soon be stopped by overflow of elog.c's internal state stack.) + * + * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask + * (to wit, BlockSig) will be restored when longjmp'ing to here. Thus, + * signals other than SIGQUIT will be blocked until we complete error + * recovery. It might seem that this policy makes the HOLD_INTERRUPTS() + * call redundant, but it is not since InterruptPending might be set + * already. + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* Since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevent interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* Report the error to the server log */ + EmitErrorReport(); + + /* + * These operations are really just a minimal subset of + * AbortTransaction(). We don't have very many resources to worry + * about in checkpointer, but we do have LWLocks, buffers, and temp + * files. + */ + LWLockReleaseAll(); + ConditionVariableCancelSleep(); + pgstat_report_wait_end(); + AbortBufferIO(); + UnlockBuffers(); + ReleaseAuxProcessResources(false); + AtEOXact_Buffers(false); + AtEOXact_SMgr(); + AtEOXact_Files(false); + AtEOXact_HashTables(false); + + /* Warn any waiting backends that the checkpoint failed. */ + if (ckpt_active) + { + SpinLockAcquire(&CheckpointerShmem->ckpt_lck); + CheckpointerShmem->ckpt_failed++; + CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started; + SpinLockRelease(&CheckpointerShmem->ckpt_lck); + + ConditionVariableBroadcast(&CheckpointerShmem->done_cv); + + ckpt_active = false; + } + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. + */ + MemoryContextSwitchTo(checkpointer_context); + FlushErrorState(); + + /* Flush any leaked data in the top-level context */ + MemoryContextResetAndDeleteChildren(checkpointer_context); + + /* Now we can allow interrupts again */ + RESUME_INTERRUPTS(); + + /* + * Sleep at least 1 second after any error. A write error is likely + * to be repeated, and we don't want to be filling the error logs as + * fast as we can. + */ + pg_usleep(1000000L); + + /* + * Close all open files after any error. This is helpful on Windows, + * where holding deleted files open causes various strange errors. + * It's not clear we need it elsewhere, but shouldn't hurt. 
+ */ + smgrcloseall(); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + /* + * Unblock signals (they were blocked when the postmaster forked us) + */ + PG_SETMASK(&UnBlockSig); + + /* + * Ensure all shared memory values are set correctly for the config. Doing + * this here ensures no race conditions from other concurrent updaters. + */ + UpdateSharedMemoryConfig(); + + /* + * Advertise our latch that backends can use to wake us up while we're + * sleeping. + */ + ProcGlobal->checkpointerLatch = &MyProc->procLatch; + + /* + * Loop forever + */ + for (;;) + { + bool do_checkpoint = false; + int flags = 0; + pg_time_t now; + int elapsed_secs; + int cur_timeout; + + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + /* + * Process any requests or signals received recently. + */ + AbsorbSyncRequests(); + HandleCheckpointerInterrupts(); + + /* + * Detect a pending checkpoint request by checking whether the flags + * word in shared memory is nonzero. We shouldn't need to acquire the + * ckpt_lck for this. + */ + if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags) + { + do_checkpoint = true; + BgWriterStats.m_requested_checkpoints++; + } + + /* + * Force a checkpoint if too much time has elapsed since the last one. + * Note that we count a timed checkpoint in stats only when this + * occurs without an external request, but we set the CAUSE_TIME flag + * bit even if there is also an external request. + */ + now = (pg_time_t) time(NULL); + elapsed_secs = now - last_checkpoint_time; + if (elapsed_secs >= CheckPointTimeout) + { + if (!do_checkpoint) + BgWriterStats.m_timed_checkpoints++; + do_checkpoint = true; + flags |= CHECKPOINT_CAUSE_TIME; + } + + /* + * Do a checkpoint if requested. + */ + if (do_checkpoint) + { + bool ckpt_performed = false; + bool do_restartpoint; + + /* + * Check if we should perform a checkpoint or a restartpoint. As a + * side-effect, RecoveryInProgress() initializes TimeLineID if + * it's not set yet. + */ + do_restartpoint = RecoveryInProgress(); + + /* + * Atomically fetch the request flags to figure out what kind of a + * checkpoint we should perform, and increase the started-counter + * to acknowledge that we've started a new checkpoint. + */ + SpinLockAcquire(&CheckpointerShmem->ckpt_lck); + flags |= CheckpointerShmem->ckpt_flags; + CheckpointerShmem->ckpt_flags = 0; + CheckpointerShmem->ckpt_started++; + SpinLockRelease(&CheckpointerShmem->ckpt_lck); + + ConditionVariableBroadcast(&CheckpointerShmem->start_cv); + + /* + * The end-of-recovery checkpoint is a real checkpoint that's + * performed while we're still in recovery. + */ + if (flags & CHECKPOINT_END_OF_RECOVERY) + do_restartpoint = false; + + /* + * We will warn if (a) too soon since last checkpoint (whatever + * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag + * since the last checkpoint start. Note in particular that this + * implementation will not generate warnings caused by + * CheckPointTimeout < CheckPointWarning. + */ + if (!do_restartpoint && + (flags & CHECKPOINT_CAUSE_XLOG) && + elapsed_secs < CheckPointWarning) + ereport(LOG, + (errmsg_plural("checkpoints are occurring too frequently (%d second apart)", + "checkpoints are occurring too frequently (%d seconds apart)", + elapsed_secs, + elapsed_secs), + errhint("Consider increasing the configuration parameter \"max_wal_size\"."))); + + /* + * Initialize checkpointer-private variables used during + * checkpoint. 
+ */ + ckpt_active = true; + if (do_restartpoint) + ckpt_start_recptr = GetXLogReplayRecPtr(NULL); + else + ckpt_start_recptr = GetInsertRecPtr(); + ckpt_start_time = now; + ckpt_cached_elapsed = 0; + + /* + * Do the checkpoint. + */ + if (!do_restartpoint) + { + CreateCheckPoint(flags); + ckpt_performed = true; + } + else + ckpt_performed = CreateRestartPoint(flags); + + /* + * After any checkpoint, close all smgr files. This is so we + * won't hang onto smgr references to deleted files indefinitely. + */ + smgrcloseall(); + + /* + * Indicate checkpoint completion to any waiting backends. + */ + SpinLockAcquire(&CheckpointerShmem->ckpt_lck); + CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started; + SpinLockRelease(&CheckpointerShmem->ckpt_lck); + + ConditionVariableBroadcast(&CheckpointerShmem->done_cv); + + if (ckpt_performed) + { + /* + * Note we record the checkpoint start time not end time as + * last_checkpoint_time. This is so that time-driven + * checkpoints happen at a predictable spacing. + */ + last_checkpoint_time = now; + } + else + { + /* + * We were not able to perform the restartpoint (checkpoints + * throw an ERROR in case of error). Most likely because we + * have not received any new checkpoint WAL records since the + * last restartpoint. Try again in 15 s. + */ + last_checkpoint_time = now - CheckPointTimeout + 15; + } + + ckpt_active = false; + + /* We may have received an interrupt during the checkpoint. */ + HandleCheckpointerInterrupts(); + } + + /* Check for archive_timeout and switch xlog files if necessary. */ + CheckArchiveTimeout(); + + /* + * Send off activity statistics to the stats collector. (The reason + * why we re-use bgwriter-related code for this is that the bgwriter + * and checkpointer used to be just one process. It's probably not + * worth the trouble to split the stats support into two independent + * stats message types.) + */ + pgstat_send_bgwriter(); + + /* Send WAL statistics to the stats collector. */ + pgstat_send_wal(true); + + /* + * If any checkpoint flags have been set, redo the loop to handle the + * checkpoint without sleeping. + */ + if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags) + continue; + + /* + * Sleep until we are signaled or it's time for another checkpoint or + * xlog file switch. + */ + now = (pg_time_t) time(NULL); + elapsed_secs = now - last_checkpoint_time; + if (elapsed_secs >= CheckPointTimeout) + continue; /* no sleep for us ... */ + cur_timeout = CheckPointTimeout - elapsed_secs; + if (XLogArchiveTimeout > 0 && !RecoveryInProgress()) + { + elapsed_secs = now - last_xlog_switch_time; + if (elapsed_secs >= XLogArchiveTimeout) + continue; /* no sleep for us ... */ + cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs); + } + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + cur_timeout * 1000L /* convert to ms */ , + WAIT_EVENT_CHECKPOINTER_MAIN); + } +} + +/* + * Process any new interrupts. + */ +static void +HandleCheckpointerInterrupts(void) +{ + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + + /* + * Checkpointer is the last process to shut down, so we ask it to hold + * the keys for a range of other tasks required most of which have + * nothing to do with checkpointing at all. 
+ * + * For various reasons, some config values can change dynamically so + * the primary copy of them is held in shared memory to make sure all + * backends see the same value. We make Checkpointer responsible for + * updating the shared memory copy if the parameter setting changes + * because of SIGHUP. + */ + UpdateSharedMemoryConfig(); + } + if (ShutdownRequestPending) + { + /* + * From here on, elog(ERROR) should end with exit(1), not send control + * back to the sigsetjmp block above + */ + ExitOnAnyError = true; + + /* + * Close down the database. + * + * Since ShutdownXLOG() creates restartpoint or checkpoint, and + * updates the statistics, increment the checkpoint request and send + * the statistics to the stats collector. + */ + BgWriterStats.m_requested_checkpoints++; + ShutdownXLOG(0, 0); + pgstat_send_bgwriter(); + pgstat_send_wal(true); + + /* Normal exit from the checkpointer is here */ + proc_exit(0); /* done */ + } +} + +/* + * CheckArchiveTimeout -- check for archive_timeout and switch xlog files + * + * This will switch to a new WAL file and force an archive file write if + * meaningful activity is recorded in the current WAL file. This includes most + * writes, including just a single checkpoint record, but excludes WAL records + * that were inserted with the XLOG_MARK_UNIMPORTANT flag being set (like + * snapshots of running transactions). Such records, depending on + * configuration, occur on regular intervals and don't contain important + * information. This avoids generating archives with a few unimportant + * records. + */ +static void +CheckArchiveTimeout(void) +{ + pg_time_t now; + pg_time_t last_time; + XLogRecPtr last_switch_lsn; + + if (XLogArchiveTimeout <= 0 || RecoveryInProgress()) + return; + + now = (pg_time_t) time(NULL); + + /* First we do a quick check using possibly-stale local state. */ + if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout) + return; + + /* + * Update local state ... note that last_xlog_switch_time is the last time + * a switch was performed *or requested*. + */ + last_time = GetLastSegSwitchData(&last_switch_lsn); + + last_xlog_switch_time = Max(last_xlog_switch_time, last_time); + + /* Now we can do the real checks */ + if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout) + { + /* + * Switch segment only when "important" WAL has been logged since the + * last segment switch (last_switch_lsn points to end of segment + * switch occurred in). + */ + if (GetLastImportantRecPtr() > last_switch_lsn) + { + XLogRecPtr switchpoint; + + /* mark switch as unimportant, avoids triggering checkpoints */ + switchpoint = RequestXLogSwitch(true); + + /* + * If the returned pointer points exactly to a segment boundary, + * assume nothing happened. + */ + if (XLogSegmentOffset(switchpoint, wal_segment_size) != 0) + elog(DEBUG1, "write-ahead log switch forced (archive_timeout=%d)", + XLogArchiveTimeout); + } + + /* + * Update state in any case, so we don't retry constantly when the + * system is idle. + */ + last_xlog_switch_time = now; + } +} + +/* + * Returns true if an immediate checkpoint request is pending. (Note that + * this does not check the *current* checkpoint's IMMEDIATE flag, but whether + * there is one pending behind it.) + */ +static bool +ImmediateCheckpointRequested(void) +{ + volatile CheckpointerShmemStruct *cps = CheckpointerShmem; + + /* + * We don't need to acquire the ckpt_lck in this case because we're only + * looking at a single flag bit. 
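+ *
+ * At worst, a stale read here only affects pacing: CheckpointWriteDelay
+ * may throttle one extra round of writes, or skip one nap, as if an
+ * immediate request were (or were not) pending.  The flags that decide
+ * which kind of checkpoint actually runs are fetched under ckpt_lck in
+ * the checkpointer's main loop.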
+ */ + if (cps->ckpt_flags & CHECKPOINT_IMMEDIATE) + return true; + return false; +} + +/* + * CheckpointWriteDelay -- control rate of checkpoint + * + * This function is called after each page write performed by BufferSync(). + * It is responsible for throttling BufferSync()'s write rate to hit + * checkpoint_completion_target. + * + * The checkpoint request flags should be passed in; currently the only one + * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes. + * + * 'progress' is an estimate of how much of the work has been done, as a + * fraction between 0.0 meaning none, and 1.0 meaning all done. + */ +void +CheckpointWriteDelay(int flags, double progress) +{ + static int absorb_counter = WRITES_PER_ABSORB; + + /* Do nothing if checkpoint is being executed by non-checkpointer process */ + if (!AmCheckpointerProcess()) + return; + + /* + * Perform the usual duties and take a nap, unless we're behind schedule, + * in which case we just try to catch up as quickly as possible. + */ + if (!(flags & CHECKPOINT_IMMEDIATE) && + !ShutdownRequestPending && + !ImmediateCheckpointRequested() && + IsCheckpointOnSchedule(progress)) + { + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + /* update shmem copies of config variables */ + UpdateSharedMemoryConfig(); + } + + AbsorbSyncRequests(); + absorb_counter = WRITES_PER_ABSORB; + + CheckArchiveTimeout(); + + /* + * Report interim activity statistics to the stats collector. + */ + pgstat_send_bgwriter(); + + /* + * This sleep used to be connected to bgwriter_delay, typically 200ms. + * That resulted in more frequent wakeups if not much work to do. + * Checkpointer and bgwriter are no longer related so take the Big + * Sleep. + */ + WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, + 100, + WAIT_EVENT_CHECKPOINT_WRITE_DELAY); + ResetLatch(MyLatch); + } + else if (--absorb_counter <= 0) + { + /* + * Absorb pending fsync requests after each WRITES_PER_ABSORB write + * operations even when we don't sleep, to prevent overflow of the + * fsync request queue. + */ + AbsorbSyncRequests(); + absorb_counter = WRITES_PER_ABSORB; + } + + /* Check for barrier events. */ + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); +} + +/* + * IsCheckpointOnSchedule -- are we on schedule to finish this checkpoint + * (or restartpoint) in time? + * + * Compares the current progress against the time/segments elapsed since last + * checkpoint, and returns true if the progress we've made this far is greater + * than the elapsed time/segments. + */ +static bool +IsCheckpointOnSchedule(double progress) +{ + XLogRecPtr recptr; + struct timeval now; + double elapsed_xlogs, + elapsed_time; + + Assert(ckpt_active); + + /* Scale progress according to checkpoint_completion_target. */ + progress *= CheckPointCompletionTarget; + + /* + * Check against the cached value first. Only do the more expensive + * calculations once we reach the target previously calculated. Since + * neither time or WAL insert pointer moves backwards, a freshly + * calculated value can only be greater than or equal to the cached value. + */ + if (progress < ckpt_cached_elapsed) + return false; + + /* + * Check progress against WAL segments written and CheckPointSegments. + * + * We compare the current WAL insert location against the location + * computed before calling CreateCheckPoint. 
The code in XLogInsert that + * actually triggers a checkpoint when CheckPointSegments is exceeded + * compares against RedoRecPtr, so this is not completely accurate. + * However, it's good enough for our purposes, we're only calculating an + * estimate anyway. + * + * During recovery, we compare last replayed WAL record's location with + * the location computed before calling CreateRestartPoint. That maintains + * the same pacing as we have during checkpoints in normal operation, but + * we might exceed max_wal_size by a fair amount. That's because there can + * be a large gap between a checkpoint's redo-pointer and the checkpoint + * record itself, and we only start the restartpoint after we've seen the + * checkpoint record. (The gap is typically up to CheckPointSegments * + * checkpoint_completion_target where checkpoint_completion_target is the + * value that was in effect when the WAL was generated). + */ + if (RecoveryInProgress()) + recptr = GetXLogReplayRecPtr(NULL); + else + recptr = GetInsertRecPtr(); + elapsed_xlogs = (((double) (recptr - ckpt_start_recptr)) / + wal_segment_size) / CheckPointSegments; + + if (progress < elapsed_xlogs) + { + ckpt_cached_elapsed = elapsed_xlogs; + return false; + } + + /* + * Check progress against time elapsed and checkpoint_timeout. + */ + gettimeofday(&now, NULL); + elapsed_time = ((double) ((pg_time_t) now.tv_sec - ckpt_start_time) + + now.tv_usec / 1000000.0) / CheckPointTimeout; + + if (progress < elapsed_time) + { + ckpt_cached_elapsed = elapsed_time; + return false; + } + + /* It looks like we're on schedule. */ + return true; +} + + +/* -------------------------------- + * signal handler routines + * -------------------------------- + */ + +/* SIGINT: set flag to run a normal checkpoint right away */ +static void +ReqCheckpointHandler(SIGNAL_ARGS) +{ + int save_errno = errno; + + /* + * The signaling process should have set ckpt_flags nonzero, so all we + * need do is ensure that our main loop gets kicked out of any wait. + */ + SetLatch(MyLatch); + + errno = save_errno; +} + + +/* -------------------------------- + * communication with backends + * -------------------------------- + */ + +/* + * CheckpointerShmemSize + * Compute space needed for checkpointer-related shared memory + */ +Size +CheckpointerShmemSize(void) +{ + Size size; + + /* + * Currently, the size of the requests[] array is arbitrarily set equal to + * NBuffers. This may prove too large or small ... + */ + size = offsetof(CheckpointerShmemStruct, requests); + size = add_size(size, mul_size(NBuffers, sizeof(CheckpointerRequest))); + + return size; +} + +/* + * CheckpointerShmemInit + * Allocate and initialize checkpointer-related shared memory + */ +void +CheckpointerShmemInit(void) +{ + Size size = CheckpointerShmemSize(); + bool found; + + CheckpointerShmem = (CheckpointerShmemStruct *) + ShmemInitStruct("Checkpointer Data", + size, + &found); + + if (!found) + { + /* + * First time through, so initialize. Note that we zero the whole + * requests array; this is so that CompactCheckpointerRequestQueue can + * assume that any pad bytes in the request structs are zeroes. 
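+ * (CompactCheckpointerRequestQueue hashes entire CheckpointerRequest
+ * structs as keys, so two logically identical requests would hash
+ * differently if their pad bytes ever differed.)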
+ */ + MemSet(CheckpointerShmem, 0, size); + SpinLockInit(&CheckpointerShmem->ckpt_lck); + CheckpointerShmem->max_requests = NBuffers; + ConditionVariableInit(&CheckpointerShmem->start_cv); + ConditionVariableInit(&CheckpointerShmem->done_cv); + } +} + +/* + * RequestCheckpoint + * Called in backend processes to request a checkpoint + * + * flags is a bitwise OR of the following: + * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. + * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. + * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, + * ignoring checkpoint_completion_target parameter. + * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred + * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or + * CHECKPOINT_END_OF_RECOVERY). + * CHECKPOINT_WAIT: wait for completion before returning (otherwise, + * just signal checkpointer to do it, and return). + * CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling. + * (This affects logging, and in particular enables CheckPointWarning.) + */ +void +RequestCheckpoint(int flags) +{ + int ntries; + int old_failed, + old_started; + + /* + * If in a standalone backend, just do it ourselves. + */ + if (!IsPostmasterEnvironment) + { + /* + * There's no point in doing slow checkpoints in a standalone backend, + * because there's no other backends the checkpoint could disrupt. + */ + CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE); + + /* + * After any checkpoint, close all smgr files. This is so we won't + * hang onto smgr references to deleted files indefinitely. + */ + smgrcloseall(); + + return; + } + + /* + * Atomically set the request flags, and take a snapshot of the counters. + * When we see ckpt_started > old_started, we know the flags we set here + * have been seen by checkpointer. + * + * Note that we OR the flags with any existing flags, to avoid overriding + * a "stronger" request by another backend. The flag senses must be + * chosen to make this work! + */ + SpinLockAcquire(&CheckpointerShmem->ckpt_lck); + + old_failed = CheckpointerShmem->ckpt_failed; + old_started = CheckpointerShmem->ckpt_started; + CheckpointerShmem->ckpt_flags |= (flags | CHECKPOINT_REQUESTED); + + SpinLockRelease(&CheckpointerShmem->ckpt_lck); + + /* + * Send signal to request checkpoint. It's possible that the checkpointer + * hasn't started yet, or is in process of restarting, so we will retry a + * few times if needed. (Actually, more than a few times, since on slow + * or overloaded buildfarm machines, it's been observed that the + * checkpointer can take several seconds to start.) However, if not told + * to wait for the checkpoint to occur, we consider failure to send the + * signal to be nonfatal and merely LOG it. The checkpointer should see + * the request when it does start, with or without getting a signal. + */ +#define MAX_SIGNAL_TRIES 600 /* max wait 60.0 sec */ + for (ntries = 0;; ntries++) + { + if (CheckpointerShmem->checkpointer_pid == 0) + { + if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT)) + { + elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG, + "could not signal for checkpoint: checkpointer is not running"); + break; + } + } + else if (kill(CheckpointerShmem->checkpointer_pid, SIGINT) != 0) + { + if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT)) + { + elog((flags & CHECKPOINT_WAIT) ? 
ERROR : LOG, + "could not signal for checkpoint: %m"); + break; + } + } + else + break; /* signal sent successfully */ + + CHECK_FOR_INTERRUPTS(); + pg_usleep(100000L); /* wait 0.1 sec, then retry */ + } + + /* + * If requested, wait for completion. We detect completion according to + * the algorithm given above. + */ + if (flags & CHECKPOINT_WAIT) + { + int new_started, + new_failed; + + /* Wait for a new checkpoint to start. */ + ConditionVariablePrepareToSleep(&CheckpointerShmem->start_cv); + for (;;) + { + SpinLockAcquire(&CheckpointerShmem->ckpt_lck); + new_started = CheckpointerShmem->ckpt_started; + SpinLockRelease(&CheckpointerShmem->ckpt_lck); + + if (new_started != old_started) + break; + + ConditionVariableSleep(&CheckpointerShmem->start_cv, + WAIT_EVENT_CHECKPOINT_START); + } + ConditionVariableCancelSleep(); + + /* + * We are waiting for ckpt_done >= new_started, in a modulo sense. + */ + ConditionVariablePrepareToSleep(&CheckpointerShmem->done_cv); + for (;;) + { + int new_done; + + SpinLockAcquire(&CheckpointerShmem->ckpt_lck); + new_done = CheckpointerShmem->ckpt_done; + new_failed = CheckpointerShmem->ckpt_failed; + SpinLockRelease(&CheckpointerShmem->ckpt_lck); + + if (new_done - new_started >= 0) + break; + + ConditionVariableSleep(&CheckpointerShmem->done_cv, + WAIT_EVENT_CHECKPOINT_DONE); + } + ConditionVariableCancelSleep(); + + if (new_failed != old_failed) + ereport(ERROR, + (errmsg("checkpoint request failed"), + errhint("Consult recent messages in the server log for details."))); + } +} + +/* + * ForwardSyncRequest + * Forward a file-fsync request from a backend to the checkpointer + * + * Whenever a backend is compelled to write directly to a relation + * (which should be seldom, if the background writer is getting its job done), + * the backend calls this routine to pass over knowledge that the relation + * is dirty and must be fsync'd before next checkpoint. We also use this + * opportunity to count such writes for statistical purposes. + * + * To avoid holding the lock for longer than necessary, we normally write + * to the requests[] queue without checking for duplicates. The checkpointer + * will have to eliminate dups internally anyway. However, if we discover + * that the queue is full, we make a pass over the entire queue to compact + * it. This is somewhat expensive, but the alternative is for the backend + * to perform its own fsync, which is far more expensive in practice. It + * is theoretically possible a backend fsync might still be necessary, if + * the queue is full and contains no duplicate entries. In that case, we + * let the backend know by returning false. + */ +bool +ForwardSyncRequest(const FileTag *ftag, SyncRequestType type) +{ + CheckpointerRequest *request; + bool too_full; + + if (!IsUnderPostmaster) + return false; /* probably shouldn't even get here */ + + if (AmCheckpointerProcess()) + elog(ERROR, "ForwardSyncRequest must not be called in checkpointer"); + + LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); + + /* Count all backend writes regardless of if they fit in the queue */ + if (!AmBackgroundWriterProcess()) + CheckpointerShmem->num_backend_writes++; + + /* + * If the checkpointer isn't running or the request queue is full, the + * backend will have to perform its own fsync request. But before forcing + * that to happen, we can try to compact the request queue. 
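+ * (The queue is sized with one slot per shared buffer, per
+ * CheckpointerShmemSize(), so a full queue nearly always holds duplicate
+ * entries that CompactCheckpointerRequestQueue() can reclaim.)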
+ */ + if (CheckpointerShmem->checkpointer_pid == 0 || + (CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests && + !CompactCheckpointerRequestQueue())) + { + /* + * Count the subset of writes where backends have to do their own + * fsync + */ + if (!AmBackgroundWriterProcess()) + CheckpointerShmem->num_backend_fsync++; + LWLockRelease(CheckpointerCommLock); + return false; + } + + /* OK, insert request */ + request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++]; + request->ftag = *ftag; + request->type = type; + + /* If queue is more than half full, nudge the checkpointer to empty it */ + too_full = (CheckpointerShmem->num_requests >= + CheckpointerShmem->max_requests / 2); + + LWLockRelease(CheckpointerCommLock); + + /* ... but not till after we release the lock */ + if (too_full && ProcGlobal->checkpointerLatch) + SetLatch(ProcGlobal->checkpointerLatch); + + return true; +} + +/* + * CompactCheckpointerRequestQueue + * Remove duplicates from the request queue to avoid backend fsyncs. + * Returns "true" if any entries were removed. + * + * Although a full fsync request queue is not common, it can lead to severe + * performance problems when it does happen. So far, this situation has + * only been observed to occur when the system is under heavy write load, + * and especially during the "sync" phase of a checkpoint. Without this + * logic, each backend begins doing an fsync for every block written, which + * gets very expensive and can slow down the whole system. + * + * Trying to do this every time the queue is full could lose if there + * aren't any removable entries. But that should be vanishingly rare in + * practice: there's one queue entry per shared buffer. + */ +static bool +CompactCheckpointerRequestQueue(void) +{ + struct CheckpointerSlotMapping + { + CheckpointerRequest request; + int slot; + }; + + int n, + preserve_count; + int num_skipped = 0; + HASHCTL ctl; + HTAB *htab; + bool *skip_slot; + + /* must hold CheckpointerCommLock in exclusive mode */ + Assert(LWLockHeldByMe(CheckpointerCommLock)); + + /* Initialize skip_slot array */ + skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests); + + /* Initialize temporary hash table */ + ctl.keysize = sizeof(CheckpointerRequest); + ctl.entrysize = sizeof(struct CheckpointerSlotMapping); + ctl.hcxt = CurrentMemoryContext; + + htab = hash_create("CompactCheckpointerRequestQueue", + CheckpointerShmem->num_requests, + &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* + * The basic idea here is that a request can be skipped if it's followed + * by a later, identical request. It might seem more sensible to work + * backwards from the end of the queue and check whether a request is + * *preceded* by an earlier, identical request, in the hopes of doing less + * copying. But that might change the semantics, if there's an + * intervening SYNC_FORGET_REQUEST or SYNC_FILTER_REQUEST, so we do it + * this way. It would be possible to be even smarter if we made the code + * below understand the specific semantics of such requests (it could blow + * away preceding entries that would end up being canceled anyhow), but + * it's not clear that the extra complexity would buy us anything. + */ + for (n = 0; n < CheckpointerShmem->num_requests; n++) + { + CheckpointerRequest *request; + struct CheckpointerSlotMapping *slotmap; + bool found; + + /* + * We use the request struct directly as a hashtable key. 
This + * assumes that any padding bytes in the structs are consistently the + * same, which should be okay because we zeroed them in + * CheckpointerShmemInit. Note also that RelFileNode had better + * contain no pad bytes. + */ + request = &CheckpointerShmem->requests[n]; + slotmap = hash_search(htab, request, HASH_ENTER, &found); + if (found) + { + /* Duplicate, so mark the previous occurrence as skippable */ + skip_slot[slotmap->slot] = true; + num_skipped++; + } + /* Remember slot containing latest occurrence of this request value */ + slotmap->slot = n; + } + + /* Done with the hash table. */ + hash_destroy(htab); + + /* If no duplicates, we're out of luck. */ + if (!num_skipped) + { + pfree(skip_slot); + return false; + } + + /* We found some duplicates; remove them. */ + preserve_count = 0; + for (n = 0; n < CheckpointerShmem->num_requests; n++) + { + if (skip_slot[n]) + continue; + CheckpointerShmem->requests[preserve_count++] = CheckpointerShmem->requests[n]; + } + ereport(DEBUG1, + (errmsg_internal("compacted fsync request queue from %d entries to %d entries", + CheckpointerShmem->num_requests, preserve_count))); + CheckpointerShmem->num_requests = preserve_count; + + /* Cleanup. */ + pfree(skip_slot); + return true; +} + +/* + * AbsorbSyncRequests + * Retrieve queued sync requests and pass them to sync mechanism. + * + * This is exported because it must be called during CreateCheckPoint; + * we have to be sure we have accepted all pending requests just before + * we start fsync'ing. Since CreateCheckPoint sometimes runs in + * non-checkpointer processes, do nothing if not checkpointer. + */ +void +AbsorbSyncRequests(void) +{ + CheckpointerRequest *requests = NULL; + CheckpointerRequest *request; + int n; + + if (!AmCheckpointerProcess()) + return; + + LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); + + /* Transfer stats counts into pending pgstats message */ + BgWriterStats.m_buf_written_backend += CheckpointerShmem->num_backend_writes; + BgWriterStats.m_buf_fsync_backend += CheckpointerShmem->num_backend_fsync; + + CheckpointerShmem->num_backend_writes = 0; + CheckpointerShmem->num_backend_fsync = 0; + + /* + * We try to avoid holding the lock for a long time by copying the request + * array, and processing the requests after releasing the lock. + * + * Once we have cleared the requests from shared memory, we have to PANIC + * if we then fail to absorb them (eg, because our hashtable runs out of + * memory). This is because the system cannot run safely if we are unable + * to fsync what we have been told to fsync. Fortunately, the hashtable + * is so small that the problem is quite unlikely to arise in practice. + */ + n = CheckpointerShmem->num_requests; + if (n > 0) + { + requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest)); + memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest)); + } + + START_CRIT_SECTION(); + + CheckpointerShmem->num_requests = 0; + + LWLockRelease(CheckpointerCommLock); + + for (request = requests; n > 0; request++, n--) + RememberSyncRequest(&request->ftag, request->type); + + END_CRIT_SECTION(); + + if (requests) + pfree(requests); +} + +/* + * Update any shared memory configurations based on config parameters + */ +static void +UpdateSharedMemoryConfig(void) +{ + /* update global shmem state for sync rep */ + SyncRepUpdateSyncStandbysDefined(); + + /* + * If full_page_writes has been changed by SIGHUP, we update it in shared + * memory and write an XLOG_FPW_CHANGE record. 
+ */ + UpdateFullPageWrites(); + + elog(DEBUG2, "checkpointer updated shared memory configuration values"); +} + +/* + * FirstCallSinceLastCheckpoint allows a process to take an action once + * per checkpoint cycle by asynchronously checking for checkpoint completion. + */ +bool +FirstCallSinceLastCheckpoint(void) +{ + static int ckpt_done = 0; + int new_done; + bool FirstCall = false; + + SpinLockAcquire(&CheckpointerShmem->ckpt_lck); + new_done = CheckpointerShmem->ckpt_done; + SpinLockRelease(&CheckpointerShmem->ckpt_lck); + + if (new_done != ckpt_done) + FirstCall = true; + + ckpt_done = new_done; + + return FirstCall; +} diff --git a/src/backend/postmaster/fork_process.c b/src/backend/postmaster/fork_process.c new file mode 100644 index 0000000..62d068b --- /dev/null +++ b/src/backend/postmaster/fork_process.c @@ -0,0 +1,115 @@ +/* + * fork_process.c + * A simple wrapper on top of fork(). This does not handle the + * EXEC_BACKEND case; it might be extended to do so, but it would be + * considerably more complex. + * + * Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/postmaster/fork_process.c + */ +#include "postgres.h" + +#include <fcntl.h> +#include <time.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <unistd.h> + +#include "postmaster/fork_process.h" + +#ifndef WIN32 +/* + * Wrapper for fork(). Return values are the same as those for fork(): + * -1 if the fork failed, 0 in the child process, and the PID of the + * child in the parent process. + */ +pid_t +fork_process(void) +{ + pid_t result; + const char *oomfilename; + +#ifdef LINUX_PROFILE + struct itimerval prof_itimer; +#endif + + /* + * Flush stdio channels just before fork, to avoid double-output problems. + * Ideally we'd use fflush(NULL) here, but there are still a few non-ANSI + * stdio libraries out there (like SunOS 4.1.x) that coredump if we do. + * Presently stdout and stderr are the only stdio output channels used by + * the postmaster, so fflush'ing them should be sufficient. + */ + fflush(stdout); + fflush(stderr); + +#ifdef LINUX_PROFILE + + /* + * Linux's fork() resets the profiling timer in the child process. If we + * want to profile child processes then we need to save and restore the + * timer setting. This is a waste of time if not profiling, however, so + * only do it if commanded by specific -DLINUX_PROFILE switch. + */ + getitimer(ITIMER_PROF, &prof_itimer); +#endif + + result = fork(); + if (result == 0) + { + /* fork succeeded, in child */ +#ifdef LINUX_PROFILE + setitimer(ITIMER_PROF, &prof_itimer, NULL); +#endif + + /* + * By default, Linux tends to kill the postmaster in out-of-memory + * situations, because it blames the postmaster for the sum of child + * process sizes *including shared memory*. (This is unbelievably + * stupid, but the kernel hackers seem uninterested in improving it.) + * Therefore it's often a good idea to protect the postmaster by + * setting its OOM score adjustment negative (which has to be done in + * a root-owned startup script). Since the adjustment is inherited by + * child processes, this would ordinarily mean that all the + * postmaster's children are equally protected against OOM kill, which + * is not such a good idea. So we provide this code to allow the + * children to change their OOM score adjustments again. Both the + * file name to write to and the value to write are controlled by + * environment variables, which can be set by the same startup script + * that did the original adjustment. 
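+ *
+ * For example, a startup script might export
+ * PG_OOM_ADJUST_FILE=/proc/self/oom_score_adj and PG_OOM_ADJUST_VALUE=0,
+ * so each child resets itself to the kernel's default score while the
+ * postmaster keeps its negative adjustment.  (Illustrative values; any
+ * writable OOM control file and score can be used.)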
+ */ + oomfilename = getenv("PG_OOM_ADJUST_FILE"); + + if (oomfilename != NULL) + { + /* + * Use open() not stdio, to ensure we control the open flags. Some + * Linux security environments reject anything but O_WRONLY. + */ + int fd = open(oomfilename, O_WRONLY, 0); + + /* We ignore all errors */ + if (fd >= 0) + { + const char *oomvalue = getenv("PG_OOM_ADJUST_VALUE"); + int rc; + + if (oomvalue == NULL) /* supply a useful default */ + oomvalue = "0"; + + rc = write(fd, oomvalue, strlen(oomvalue)); + (void) rc; + close(fd); + } + } + + /* do post-fork initialization for random number generation */ + pg_strong_random_init(); + } + + return result; +} + +#endif /* ! WIN32 */ diff --git a/src/backend/postmaster/interrupt.c b/src/backend/postmaster/interrupt.c new file mode 100644 index 0000000..dd9136a --- /dev/null +++ b/src/backend/postmaster/interrupt.c @@ -0,0 +1,112 @@ +/*------------------------------------------------------------------------- + * + * interrupt.c + * Interrupt handling routines. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/postmaster/interrupt.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <unistd.h> + +#include "miscadmin.h" +#include "postmaster/interrupt.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/procsignal.h" +#include "utils/guc.h" + +volatile sig_atomic_t ConfigReloadPending = false; +volatile sig_atomic_t ShutdownRequestPending = false; + +/* + * Simple interrupt handler for main loops of background processes. + */ +void +HandleMainLoopInterrupts(void) +{ + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + if (ShutdownRequestPending) + proc_exit(0); +} + +/* + * Simple signal handler for triggering a configuration reload. + * + * Normally, this handler would be used for SIGHUP. The idea is that code + * which uses it would arrange to check the ConfigReloadPending flag at + * convenient places inside main loops, or else call HandleMainLoopInterrupts. + */ +void +SignalHandlerForConfigReload(SIGNAL_ARGS) +{ + int save_errno = errno; + + ConfigReloadPending = true; + SetLatch(MyLatch); + + errno = save_errno; +} + +/* + * Simple signal handler for exiting quickly as if due to a crash. + * + * Normally, this would be used for handling SIGQUIT. + */ +void +SignalHandlerForCrashExit(SIGNAL_ARGS) +{ + /* + * We DO NOT want to run proc_exit() or atexit() callbacks -- we're here + * because shared memory may be corrupted, so we don't want to try to + * clean up our transaction. Just nail the windows shut and get out of + * town. The callbacks wouldn't be safe to run from a signal handler, + * anyway. + * + * Note we do _exit(2) not _exit(0). This is to force the postmaster into + * a system reset cycle if someone sends a manual SIGQUIT to a random + * backend. This is necessary precisely because we don't clean up our + * shared memory state. (The "dead man switch" mechanism in pmsignal.c + * should ensure the postmaster sees this as a crash, too, but no harm in + * being doubly sure.) + */ + _exit(2); +} + +/* + * Simple signal handler for triggering a long-running background process to + * shut down and exit. 
+ * + * Typically, this handler would be used for SIGTERM, but some processes use + * other signals. In particular, the checkpointer exits on SIGUSR2, the + * stats collector on SIGQUIT, and the WAL writer exits on either SIGINT + * or SIGTERM. + * + * ShutdownRequestPending should be checked at a convenient place within the + * main loop, or else the main loop should call HandleMainLoopInterrupts. + */ +void +SignalHandlerForShutdownRequest(SIGNAL_ARGS) +{ + int save_errno = errno; + + ShutdownRequestPending = true; + SetLatch(MyLatch); + + errno = save_errno; +} diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c new file mode 100644 index 0000000..74a7d7c --- /dev/null +++ b/src/backend/postmaster/pgarch.c @@ -0,0 +1,718 @@ +/*------------------------------------------------------------------------- + * + * pgarch.c + * + * PostgreSQL WAL archiver + * + * All functions relating to archiver are included here + * + * - All functions executed by archiver process + * + * - archiver is forked from postmaster, and the two + * processes then communicate using signals. All functions + * executed by postmaster are included in this file. + * + * Initial author: Simon Riggs simon@2ndquadrant.com + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/postmaster/pgarch.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <fcntl.h> +#include <signal.h> +#include <time.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/interrupt.h" +#include "postmaster/pgarch.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" +#include "utils/guc.h" +#include "utils/ps_status.h" + + +/* ---------- + * Timer definitions. + * ---------- + */ +#define PGARCH_AUTOWAKE_INTERVAL 60 /* How often to force a poll of the + * archive status directory; in seconds. */ +#define PGARCH_RESTART_INTERVAL 10 /* How often to attempt to restart a + * failed archiver; in seconds. */ + +/* + * Maximum number of retries allowed when attempting to archive a WAL + * file. + */ +#define NUM_ARCHIVE_RETRIES 3 + +/* + * Maximum number of retries allowed when attempting to remove an + * orphan archive status file. + */ +#define NUM_ORPHAN_CLEANUP_RETRIES 3 + +/* Shared memory area for archiver process */ +typedef struct PgArchData +{ + int pgprocno; /* pgprocno of archiver process */ +} PgArchData; + + +/* ---------- + * Local data + * ---------- + */ +static time_t last_sigterm_time = 0; +static PgArchData *PgArch = NULL; + +/* + * Flags set by interrupt handlers for later service in the main loop. 
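+ *
+ * ready_to_stop is set by pgarch_waken_stop() when SIGUSR2 arrives; the
+ * main loop then performs one final archiving cycle and exits.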
+ */ +static volatile sig_atomic_t ready_to_stop = false; + +/* ---------- + * Local function forward declarations + * ---------- + */ +static void pgarch_waken_stop(SIGNAL_ARGS); +static void pgarch_MainLoop(void); +static void pgarch_ArchiverCopyLoop(void); +static bool pgarch_archiveXlog(char *xlog); +static bool pgarch_readyXlog(char *xlog); +static void pgarch_archiveDone(char *xlog); +static void pgarch_die(int code, Datum arg); +static void HandlePgArchInterrupts(void); + +/* Report shared memory space needed by PgArchShmemInit */ +Size +PgArchShmemSize(void) +{ + Size size = 0; + + size = add_size(size, sizeof(PgArchData)); + + return size; +} + +/* Allocate and initialize archiver-related shared memory */ +void +PgArchShmemInit(void) +{ + bool found; + + PgArch = (PgArchData *) + ShmemInitStruct("Archiver Data", PgArchShmemSize(), &found); + + if (!found) + { + /* First time through, so initialize */ + MemSet(PgArch, 0, PgArchShmemSize()); + PgArch->pgprocno = INVALID_PGPROCNO; + } +} + +/* + * PgArchCanRestart + * + * Return true and archiver is allowed to restart if enough time has + * passed since it was launched last to reach PGARCH_RESTART_INTERVAL. + * Otherwise return false. + * + * This is a safety valve to protect against continuous respawn attempts if the + * archiver is dying immediately at launch. Note that since we will retry to + * launch the archiver from the postmaster main loop, we will get another + * chance later. + */ +bool +PgArchCanRestart(void) +{ + static time_t last_pgarch_start_time = 0; + time_t curtime = time(NULL); + + /* + * Return false and don't restart archiver if too soon since last archiver + * start. + */ + if ((unsigned int) (curtime - last_pgarch_start_time) < + (unsigned int) PGARCH_RESTART_INTERVAL) + return false; + + last_pgarch_start_time = curtime; + return true; +} + + +/* Main entry point for archiver process */ +void +PgArchiverMain(void) +{ + /* + * Ignore all signals usually bound to some action in the postmaster, + * except for SIGHUP, SIGTERM, SIGUSR1, SIGUSR2, and SIGQUIT. + */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGINT, SIG_IGN); + pqsignal(SIGTERM, SignalHandlerForShutdownRequest); + /* SIGQUIT handler was already set up by InitPostmasterChild */ + pqsignal(SIGALRM, SIG_IGN); + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, pgarch_waken_stop); + + /* Reset some signals that are accepted by postmaster but not here */ + pqsignal(SIGCHLD, SIG_DFL); + + /* Unblock signals (they were blocked when the postmaster forked us) */ + PG_SETMASK(&UnBlockSig); + + /* We shouldn't be launched unnecessarily. */ + Assert(XLogArchivingActive()); + + /* Arrange to clean up at archiver exit */ + on_shmem_exit(pgarch_die, 0); + + /* + * Advertise our pgprocno so that backends can use our latch to wake us up + * while we're sleeping. + */ + PgArch->pgprocno = MyProc->pgprocno; + + pgarch_MainLoop(); + + proc_exit(0); +} + +/* + * Wake up the archiver + */ +void +PgArchWakeup(void) +{ + int arch_pgprocno = PgArch->pgprocno; + + /* + * We don't acquire ProcArrayLock here. It's actually fine because + * procLatch isn't ever freed, so we just can potentially set the wrong + * process' (or no process') latch. Even in that case the archiver will + * be relaunched shortly and will start archiving. 
+ */ + if (arch_pgprocno != INVALID_PGPROCNO) + SetLatch(&ProcGlobal->allProcs[arch_pgprocno].procLatch); +} + + +/* SIGUSR2 signal handler for archiver process */ +static void +pgarch_waken_stop(SIGNAL_ARGS) +{ + int save_errno = errno; + + /* set flag to do a final cycle and shut down afterwards */ + ready_to_stop = true; + SetLatch(MyLatch); + + errno = save_errno; +} + +/* + * pgarch_MainLoop + * + * Main loop for archiver + */ +static void +pgarch_MainLoop(void) +{ + pg_time_t last_copy_time = 0; + bool time_to_stop; + + /* + * There shouldn't be anything for the archiver to do except to wait for a + * signal ... however, the archiver exists to protect our data, so she + * wakes up occasionally to allow herself to be proactive. + */ + do + { + ResetLatch(MyLatch); + + /* When we get SIGUSR2, we do one more archive cycle, then exit */ + time_to_stop = ready_to_stop; + + /* Check for barrier events and config update */ + HandlePgArchInterrupts(); + + /* + * If we've gotten SIGTERM, we normally just sit and do nothing until + * SIGUSR2 arrives. However, that means a random SIGTERM would + * disable archiving indefinitely, which doesn't seem like a good + * idea. If more than 60 seconds pass since SIGTERM, exit anyway, so + * that the postmaster can start a new archiver if needed. + */ + if (ShutdownRequestPending) + { + time_t curtime = time(NULL); + + if (last_sigterm_time == 0) + last_sigterm_time = curtime; + else if ((unsigned int) (curtime - last_sigterm_time) >= + (unsigned int) 60) + break; + } + + /* Do what we're here for */ + pgarch_ArchiverCopyLoop(); + last_copy_time = time(NULL); + + /* + * Sleep until a signal is received, or until a poll is forced by + * PGARCH_AUTOWAKE_INTERVAL having passed since last_copy_time, or + * until postmaster dies. + */ + if (!time_to_stop) /* Don't wait during last iteration */ + { + pg_time_t curtime = (pg_time_t) time(NULL); + int timeout; + + timeout = PGARCH_AUTOWAKE_INTERVAL - (curtime - last_copy_time); + if (timeout > 0) + { + int rc; + + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + timeout * 1000L, + WAIT_EVENT_ARCHIVER_MAIN); + if (rc & WL_POSTMASTER_DEATH) + time_to_stop = true; + } + } + + /* + * The archiver quits either when the postmaster dies (not expected) + * or after completing one more archiving cycle after receiving + * SIGUSR2. + */ + } while (!time_to_stop); +} + +/* + * pgarch_ArchiverCopyLoop + * + * Archives all outstanding xlogs then returns + */ +static void +pgarch_ArchiverCopyLoop(void) +{ + char xlog[MAX_XFN_CHARS + 1]; + + /* + * loop through all xlogs with archive_status of .ready and archive + * them...mostly we expect this to be a single file, though it is possible + * some backend will add files onto the list of those that need archiving + * while we are still copying earlier archives + */ + while (pgarch_readyXlog(xlog)) + { + int failures = 0; + int failures_orphan = 0; + + for (;;) + { + struct stat stat_buf; + char pathname[MAXPGPATH]; + + /* + * Do not initiate any more archive commands after receiving + * SIGTERM, nor after the postmaster has died unexpectedly. The + * first condition is to try to keep from having init SIGKILL the + * command, and the second is to avoid conflicts with another + * archiver spawned by a newer postmaster. + */ + if (ShutdownRequestPending || !PostmasterIsAlive()) + return; + + /* + * Check for barrier events and config update. 
This is so that + * we'll adopt a new setting for archive_command as soon as + * possible, even if there is a backlog of files to be archived. + */ + HandlePgArchInterrupts(); + + /* can't do anything if no command ... */ + if (!XLogArchiveCommandSet()) + { + ereport(WARNING, + (errmsg("archive_mode enabled, yet archive_command is not set"))); + return; + } + + /* + * Since archive status files are not removed in a durable manner, + * a system crash could leave behind .ready files for WAL segments + * that have already been recycled or removed. In this case, + * simply remove the orphan status file and move on. unlink() is + * used here as even on subsequent crashes the same orphan files + * would get removed, so there is no need to worry about + * durability. + */ + snprintf(pathname, MAXPGPATH, XLOGDIR "/%s", xlog); + if (stat(pathname, &stat_buf) != 0 && errno == ENOENT) + { + char xlogready[MAXPGPATH]; + + StatusFilePath(xlogready, xlog, ".ready"); + if (unlink(xlogready) == 0) + { + ereport(WARNING, + (errmsg("removed orphan archive status file \"%s\"", + xlogready))); + + /* leave loop and move to the next status file */ + break; + } + + if (++failures_orphan >= NUM_ORPHAN_CLEANUP_RETRIES) + { + ereport(WARNING, + (errmsg("removal of orphan archive status file \"%s\" failed too many times, will try again later", + xlogready))); + + /* give up cleanup of orphan status files */ + return; + } + + /* wait a bit before retrying */ + pg_usleep(1000000L); + continue; + } + + if (pgarch_archiveXlog(xlog)) + { + /* successful */ + pgarch_archiveDone(xlog); + + /* + * Tell the collector about the WAL file that we successfully + * archived + */ + pgstat_send_archiver(xlog, false); + + break; /* out of inner retry loop */ + } + else + { + /* + * Tell the collector about the WAL file that we failed to + * archive + */ + pgstat_send_archiver(xlog, true); + + if (++failures >= NUM_ARCHIVE_RETRIES) + { + ereport(WARNING, + (errmsg("archiving write-ahead log file \"%s\" failed too many times, will try again later", + xlog))); + return; /* give up archiving for now */ + } + pg_usleep(1000000L); /* wait a bit before retrying */ + } + } + } +} + +/* + * pgarch_archiveXlog + * + * Invokes system(3) to copy one archive file to wherever it should go + * + * Returns true if successful + */ +static bool +pgarch_archiveXlog(char *xlog) +{ + char xlogarchcmd[MAXPGPATH]; + char pathname[MAXPGPATH]; + char activitymsg[MAXFNAMELEN + 16]; + char *dp; + char *endp; + const char *sp; + int rc; + + snprintf(pathname, MAXPGPATH, XLOGDIR "/%s", xlog); + + /* + * construct the command to be executed + */ + dp = xlogarchcmd; + endp = xlogarchcmd + MAXPGPATH - 1; + *endp = '\0'; + + for (sp = XLogArchiveCommand; *sp; sp++) + { + if (*sp == '%') + { + switch (sp[1]) + { + case 'p': + /* %p: relative path of source file */ + sp++; + strlcpy(dp, pathname, endp - dp); + make_native_path(dp); + dp += strlen(dp); + break; + case 'f': + /* %f: filename of source file */ + sp++; + strlcpy(dp, xlog, endp - dp); + dp += strlen(dp); + break; + case '%': + /* convert %% to a single % */ + sp++; + if (dp < endp) + *dp++ = *sp; + break; + default: + /* otherwise treat the % as not special */ + if (dp < endp) + *dp++ = *sp; + break; + } + } + else + { + if (dp < endp) + *dp++ = *sp; + } + } + *dp = '\0'; + + ereport(DEBUG3, + (errmsg_internal("executing archive command \"%s\"", + xlogarchcmd))); + + /* Report archive activity in PS display */ + snprintf(activitymsg, sizeof(activitymsg), "archiving %s", xlog); + 
set_ps_display(activitymsg); + + rc = system(xlogarchcmd); + if (rc != 0) + { + /* + * If either the shell itself, or a called command, died on a signal, + * abort the archiver. We do this because system() ignores SIGINT and + * SIGQUIT while waiting; so a signal is very likely something that + * should have interrupted us too. Also die if the shell got a hard + * "command not found" type of error. If we overreact it's no big + * deal, the postmaster will just start the archiver again. + */ + int lev = wait_result_is_any_signal(rc, true) ? FATAL : LOG; + + if (WIFEXITED(rc)) + { + ereport(lev, + (errmsg("archive command failed with exit code %d", + WEXITSTATUS(rc)), + errdetail("The failed archive command was: %s", + xlogarchcmd))); + } + else if (WIFSIGNALED(rc)) + { +#if defined(WIN32) + ereport(lev, + (errmsg("archive command was terminated by exception 0x%X", + WTERMSIG(rc)), + errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value."), + errdetail("The failed archive command was: %s", + xlogarchcmd))); +#else + ereport(lev, + (errmsg("archive command was terminated by signal %d: %s", + WTERMSIG(rc), pg_strsignal(WTERMSIG(rc))), + errdetail("The failed archive command was: %s", + xlogarchcmd))); +#endif + } + else + { + ereport(lev, + (errmsg("archive command exited with unrecognized status %d", + rc), + errdetail("The failed archive command was: %s", + xlogarchcmd))); + } + + snprintf(activitymsg, sizeof(activitymsg), "failed on %s", xlog); + set_ps_display(activitymsg); + + return false; + } + elog(DEBUG1, "archived write-ahead log file \"%s\"", xlog); + + snprintf(activitymsg, sizeof(activitymsg), "last was %s", xlog); + set_ps_display(activitymsg); + + return true; +} + +/* + * pgarch_readyXlog + * + * Return name of the oldest xlog file that has not yet been archived. + * No notification is set that file archiving is now in progress, so + * this would need to be extended if multiple concurrent archival + * tasks were created. If a failure occurs, we will completely + * re-copy the file at the next available opportunity. + * + * It is important that we return the oldest, so that we archive xlogs + * in order that they were written, for two reasons: + * 1) to maintain the sequential chain of xlogs required for recovery + * 2) because the oldest ones will sooner become candidates for + * recycling at time of checkpoint + * + * NOTE: the "oldest" comparison will consider any .history file to be older + * than any other file except another .history file. Segments on a timeline + * with a smaller ID will be older than all segments on a timeline with a + * larger ID; the net result being that past timelines are given higher + * priority for archiving. This seems okay, or at least not obviously worth + * changing. + */ +static bool +pgarch_readyXlog(char *xlog) +{ + /* + * open xlog status directory and read through list of xlogs that have the + * .ready suffix, looking for earliest file. It is possible to optimise + * this code, though only a single file is expected on the vast majority + * of calls, so.... 
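+ *
+ * For example, a completed segment leaves a status file such as
+ * pg_wal/archive_status/000000010000000000000001.ready for us to find
+ * here, while a timeline switch leaves one like 00000002.history.ready;
+ * per the priority rules above, the history file is archived first.
+ * (Example file names only.)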
+ */ + char XLogArchiveStatusDir[MAXPGPATH]; + DIR *rldir; + struct dirent *rlde; + bool found = false; + bool historyFound = false; + + snprintf(XLogArchiveStatusDir, MAXPGPATH, XLOGDIR "/archive_status"); + rldir = AllocateDir(XLogArchiveStatusDir); + + while ((rlde = ReadDir(rldir, XLogArchiveStatusDir)) != NULL) + { + int basenamelen = (int) strlen(rlde->d_name) - 6; + char basename[MAX_XFN_CHARS + 1]; + bool ishistory; + + /* Ignore entries with unexpected number of characters */ + if (basenamelen < MIN_XFN_CHARS || + basenamelen > MAX_XFN_CHARS) + continue; + + /* Ignore entries with unexpected characters */ + if (strspn(rlde->d_name, VALID_XFN_CHARS) < basenamelen) + continue; + + /* Ignore anything not suffixed with .ready */ + if (strcmp(rlde->d_name + basenamelen, ".ready") != 0) + continue; + + /* Truncate off the .ready */ + memcpy(basename, rlde->d_name, basenamelen); + basename[basenamelen] = '\0'; + + /* Is this a history file? */ + ishistory = IsTLHistoryFileName(basename); + + /* + * Consume the file to archive. History files have the highest + * priority. If this is the first file or the first history file + * ever, copy it. In the presence of a history file already chosen as + * target, ignore all other files except history files which have been + * generated for an older timeline than what is already chosen as + * target to archive. + */ + if (!found || (ishistory && !historyFound)) + { + strcpy(xlog, basename); + found = true; + historyFound = ishistory; + } + else if (ishistory || !historyFound) + { + if (strcmp(basename, xlog) < 0) + strcpy(xlog, basename); + } + } + FreeDir(rldir); + + return found; +} + +/* + * pgarch_archiveDone + * + * Emit notification that an xlog file has been successfully archived. + * We do this by renaming the status file from NNN.ready to NNN.done. + * Eventually, a checkpoint process will notice this and delete both the + * NNN.done file and the xlog file itself. + */ +static void +pgarch_archiveDone(char *xlog) +{ + char rlogready[MAXPGPATH]; + char rlogdone[MAXPGPATH]; + + StatusFilePath(rlogready, xlog, ".ready"); + StatusFilePath(rlogdone, xlog, ".done"); + (void) durable_rename(rlogready, rlogdone, WARNING); +} + + +/* + * pgarch_die + * + * Exit-time cleanup handler + */ +static void +pgarch_die(int code, Datum arg) +{ + PgArch->pgprocno = INVALID_PGPROCNO; +} + +/* + * Interrupt handler for WAL archiver process. + * + * This is called in the loops pgarch_MainLoop and pgarch_ArchiverCopyLoop. + * It checks for barrier events and config update, but not shutdown request + * because how to handle shutdown request is different between those loops. + */ +static void +HandlePgArchInterrupts(void) +{ + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); + + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } +} diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c new file mode 100644 index 0000000..a2f75b2 --- /dev/null +++ b/src/backend/postmaster/pgstat.c @@ -0,0 +1,5851 @@ +/* ---------- + * pgstat.c + * + * All the statistics collector stuff hacked up in one big, ugly file. + * + * TODO: - Separate collector, postmaster and backend stuff + * into different files. + * + * - Add some automatic call for pgstat vacuuming. + * + * - Add a pgstat config column to pg_database, so this + * entire thing can be enabled/disabled on a per db basis. 
+ * + * Copyright (c) 2001-2021, PostgreSQL Global Development Group + * + * src/backend/postmaster/pgstat.c + * ---------- + */ +#include "postgres.h" + +#include <unistd.h> +#include <fcntl.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/socket.h> +#include <netdb.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <signal.h> +#include <time.h> +#ifdef HAVE_SYS_SELECT_H +#include <sys/select.h> +#endif + +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/tableam.h" +#include "access/transam.h" +#include "access/twophase_rmgr.h" +#include "access/xact.h" +#include "catalog/pg_database.h" +#include "catalog/pg_proc.h" +#include "common/ip.h" +#include "executor/instrument.h" +#include "libpq/libpq.h" +#include "libpq/pqsignal.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/autovacuum.h" +#include "postmaster/fork_process.h" +#include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" +#include "replication/slot.h" +#include "replication/walsender.h" +#include "storage/backendid.h" +#include "storage/dsm.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/lmgr.h" +#include "storage/pg_shmem.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" +#include "utils/timestamp.h" + +/* ---------- + * Timer definitions. + * ---------- + */ +#define PGSTAT_STAT_INTERVAL 500 /* Minimum time between stats file + * updates; in milliseconds. */ + +#define PGSTAT_RETRY_DELAY 10 /* How long to wait between checks for a + * new file; in milliseconds. */ + +#define PGSTAT_MAX_WAIT_TIME 10000 /* Maximum time to wait for a stats + * file update; in milliseconds. */ + +#define PGSTAT_INQ_INTERVAL 640 /* How often to ping the collector for a + * new file; in milliseconds. */ + +#define PGSTAT_RESTART_INTERVAL 60 /* How often to attempt to restart a + * failed statistics collector; in + * seconds. */ + +#define PGSTAT_POLL_LOOP_COUNT (PGSTAT_MAX_WAIT_TIME / PGSTAT_RETRY_DELAY) +#define PGSTAT_INQ_LOOP_COUNT (PGSTAT_INQ_INTERVAL / PGSTAT_RETRY_DELAY) + +/* Minimum receive buffer size for the collector's socket. */ +#define PGSTAT_MIN_RCVBUF (100 * 1024) + + +/* ---------- + * The initial size hints for the hash tables used in the collector. + * ---------- + */ +#define PGSTAT_DB_HASH_SIZE 16 +#define PGSTAT_TAB_HASH_SIZE 512 +#define PGSTAT_FUNCTION_HASH_SIZE 512 +#define PGSTAT_REPLSLOT_HASH_SIZE 32 + + +/* ---------- + * GUC parameters + * ---------- + */ +bool pgstat_track_counts = false; +int pgstat_track_functions = TRACK_FUNC_OFF; + +/* ---------- + * Built from GUC parameter + * ---------- + */ +char *pgstat_stat_directory = NULL; +char *pgstat_stat_filename = NULL; +char *pgstat_stat_tmpname = NULL; + +/* + * BgWriter and WAL global statistics counters. + * Stored directly in a stats message structure so they can be sent + * without needing to copy things around. We assume these init to zeroes. + */ +PgStat_MsgBgWriter BgWriterStats; +PgStat_MsgWal WalStats; + +/* + * WAL usage counters saved from pgWALUsage at the previous call to + * pgstat_send_wal(). This is used to calculate how much WAL usage + * happens between pgstat_send_wal() calls, by substracting + * the previous counters from the current ones. 
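+ *
+ * For instance, if the current counters report 1500 WAL records and
+ * prevWalUsage recorded 1200 at the previous call, only the 300-record
+ * difference is reported this time.  (Illustrative numbers.)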
+ */ +static WalUsage prevWalUsage; + +/* + * List of SLRU names that we keep stats for. There is no central registry of + * SLRUs, so we use this fixed list instead. The "other" entry is used for + * all SLRUs without an explicit entry (e.g. SLRUs in extensions). + */ +static const char *const slru_names[] = { + "CommitTs", + "MultiXactMember", + "MultiXactOffset", + "Notify", + "Serial", + "Subtrans", + "Xact", + "other" /* has to be last */ +}; + +#define SLRU_NUM_ELEMENTS lengthof(slru_names) + +/* + * SLRU statistics counts waiting to be sent to the collector. These are + * stored directly in stats message format so they can be sent without needing + * to copy things around. We assume this variable inits to zeroes. Entries + * are one-to-one with slru_names[]. + */ +static PgStat_MsgSLRU SLRUStats[SLRU_NUM_ELEMENTS]; + +/* ---------- + * Local data + * ---------- + */ +NON_EXEC_STATIC pgsocket pgStatSock = PGINVALID_SOCKET; + +static struct sockaddr_storage pgStatAddr; + +static time_t last_pgstat_start_time; + +static bool pgStatRunningInCollector = false; + +/* + * Structures in which backends store per-table info that's waiting to be + * sent to the collector. + * + * NOTE: once allocated, TabStatusArray structures are never moved or deleted + * for the life of the backend. Also, we zero out the t_id fields of the + * contained PgStat_TableStatus structs whenever they are not actively in use. + * This allows relcache pgstat_info pointers to be treated as long-lived data, + * avoiding repeated searches in pgstat_initstats() when a relation is + * repeatedly opened during a transaction. + */ +#define TABSTAT_QUANTUM 100 /* we alloc this many at a time */ + +typedef struct TabStatusArray +{ + struct TabStatusArray *tsa_next; /* link to next array, if any */ + int tsa_used; /* # entries currently used */ + PgStat_TableStatus tsa_entries[TABSTAT_QUANTUM]; /* per-table data */ +} TabStatusArray; + +static TabStatusArray *pgStatTabList = NULL; + +/* + * pgStatTabHash entry: map from relation OID to PgStat_TableStatus pointer + */ +typedef struct TabStatHashEntry +{ + Oid t_id; + PgStat_TableStatus *tsa_entry; +} TabStatHashEntry; + +/* + * Hash table for O(1) t_id -> tsa_entry lookup + */ +static HTAB *pgStatTabHash = NULL; + +/* + * Backends store per-function info that's waiting to be sent to the collector + * in this hash table (indexed by function OID). + */ +static HTAB *pgStatFunctions = NULL; + +/* + * Indicates if backend has some function stats that it hasn't yet + * sent to the collector. + */ +static bool have_function_stats = false; + +/* + * Tuple insertion/deletion counts for an open transaction can't be propagated + * into PgStat_TableStatus counters until we know if it is going to commit + * or abort. Hence, we keep these counts in per-subxact structs that live + * in TopTransactionContext. This data structure is designed on the assumption + * that subxacts won't usually modify very many tables. 
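+ * A small worked example of the intended flow (a sketch, not code from this
+ * file): if a subtransaction at nest level 2 inserts 5 tuples, the count is
+ * recorded in that level's PgStat_TableXactStatus; when the subxact ends the
+ * count is folded upward into the level-1 record, and only at top-level
+ * commit or abort does it land in PgStat_TableStatus.t_counts (see
+ * AtEOXact_PgStat below), from where pgstat_report_stat() eventually ships
+ * it to the collector.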
+ */ +typedef struct PgStat_SubXactStatus +{ + int nest_level; /* subtransaction nest level */ + struct PgStat_SubXactStatus *prev; /* higher-level subxact if any */ + PgStat_TableXactStatus *first; /* head of list for this subxact */ +} PgStat_SubXactStatus; + +static PgStat_SubXactStatus *pgStatXactStack = NULL; + +static int pgStatXactCommit = 0; +static int pgStatXactRollback = 0; +PgStat_Counter pgStatBlockReadTime = 0; +PgStat_Counter pgStatBlockWriteTime = 0; +static PgStat_Counter pgLastSessionReportTime = 0; +PgStat_Counter pgStatActiveTime = 0; +PgStat_Counter pgStatTransactionIdleTime = 0; +SessionEndType pgStatSessionEndCause = DISCONNECT_NORMAL; + +/* Record that's written to 2PC state file when pgstat state is persisted */ +typedef struct TwoPhasePgStatRecord +{ + PgStat_Counter tuples_inserted; /* tuples inserted in xact */ + PgStat_Counter tuples_updated; /* tuples updated in xact */ + PgStat_Counter tuples_deleted; /* tuples deleted in xact */ + PgStat_Counter inserted_pre_trunc; /* tuples inserted prior to truncate */ + PgStat_Counter updated_pre_trunc; /* tuples updated prior to truncate */ + PgStat_Counter deleted_pre_trunc; /* tuples deleted prior to truncate */ + Oid t_id; /* table's OID */ + bool t_shared; /* is it a shared catalog? */ + bool t_truncated; /* was the relation truncated? */ +} TwoPhasePgStatRecord; + +/* + * Info about current "snapshot" of stats file + */ +static MemoryContext pgStatLocalContext = NULL; +static HTAB *pgStatDBHash = NULL; + +/* + * Cluster wide statistics, kept in the stats collector. + * Contains statistics that are not collected per database + * or per table. + */ +static PgStat_ArchiverStats archiverStats; +static PgStat_GlobalStats globalStats; +static PgStat_WalStats walStats; +static PgStat_SLRUStats slruStats[SLRU_NUM_ELEMENTS]; +static HTAB *replSlotStatHash = NULL; + +/* + * List of OIDs of databases we need to write out. If an entry is InvalidOid, + * it means to write only the shared-catalog stats ("DB 0"); otherwise, we + * will write both that DB's data and the shared stats. + */ +static List *pending_write_requests = NIL; + +/* + * Total time charged to functions so far in the current backend. + * We use this to help separate "self" and "other" time charges. + * (We assume this initializes to zero.) 
+ */ +static instr_time total_func_time; + + +/* ---------- + * Local function forward declarations + * ---------- + */ +#ifdef EXEC_BACKEND +static pid_t pgstat_forkexec(void); +#endif + +NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]) pg_attribute_noreturn(); + +static PgStat_StatDBEntry *pgstat_get_db_entry(Oid databaseid, bool create); +static PgStat_StatTabEntry *pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, + Oid tableoid, bool create); +static void pgstat_write_statsfiles(bool permanent, bool allDbs); +static void pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent); +static HTAB *pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep); +static void pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, bool permanent); +static void backend_read_statsfile(void); + +static bool pgstat_write_statsfile_needed(void); +static bool pgstat_db_requested(Oid databaseid); + +static PgStat_StatReplSlotEntry *pgstat_get_replslot_entry(NameData name, bool create_it); +static void pgstat_reset_replslot(PgStat_StatReplSlotEntry *slotstats, TimestampTz ts); + +static void pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg, TimestampTz now); +static void pgstat_send_funcstats(void); +static void pgstat_send_slru(void); +static HTAB *pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid); +static bool pgstat_should_report_connstat(void); +static void pgstat_report_disconnect(Oid dboid); + +static PgStat_TableStatus *get_tabstat_entry(Oid rel_id, bool isshared); + +static void pgstat_setup_memcxt(void); + +static void pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype); +static void pgstat_send(void *msg, int len); + +static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len); +static void pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len); +static void pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len); +static void pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len); +static void pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len); +static void pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len); +static void pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len); +static void pgstat_recv_resetslrucounter(PgStat_MsgResetslrucounter *msg, int len); +static void pgstat_recv_resetreplslotcounter(PgStat_MsgResetreplslotcounter *msg, int len); +static void pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len); +static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len); +static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len); +static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len); +static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len); +static void pgstat_recv_wal(PgStat_MsgWal *msg, int len); +static void pgstat_recv_slru(PgStat_MsgSLRU *msg, int len); +static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len); +static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len); +static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len); +static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len); +static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len); +static void pgstat_recv_connect(PgStat_MsgConnect *msg, int len); +static void pgstat_recv_disconnect(PgStat_MsgDisconnect *msg, int len); +static void pgstat_recv_replslot(PgStat_MsgReplSlot *msg, int len); +static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len); + +/* 
------------------------------------------------------------ + * Public functions called from postmaster follow + * ------------------------------------------------------------ + */ + +/* ---------- + * pgstat_init() - + * + * Called from postmaster at startup. Create the resources required + * by the statistics collector process. If unable to do so, do not + * fail --- better to let the postmaster start with stats collection + * disabled. + * ---------- + */ +void +pgstat_init(void) +{ + ACCEPT_TYPE_ARG3 alen; + struct addrinfo *addrs = NULL, + *addr, + hints; + int ret; + fd_set rset; + struct timeval tv; + char test_byte; + int sel_res; + int tries = 0; + +#define TESTBYTEVAL ((char) 199) + + /* + * This static assertion verifies that we didn't mess up the calculations + * involved in selecting maximum payload sizes for our UDP messages. + * Because the only consequence of overrunning PGSTAT_MAX_MSG_SIZE would + * be silent performance loss from fragmentation, it seems worth having a + * compile-time cross-check that we didn't. + */ + StaticAssertStmt(sizeof(PgStat_Msg) <= PGSTAT_MAX_MSG_SIZE, + "maximum stats message size exceeds PGSTAT_MAX_MSG_SIZE"); + + /* + * Create the UDP socket for sending and receiving statistic messages + */ + hints.ai_flags = AI_PASSIVE; + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_DGRAM; + hints.ai_protocol = 0; + hints.ai_addrlen = 0; + hints.ai_addr = NULL; + hints.ai_canonname = NULL; + hints.ai_next = NULL; + ret = pg_getaddrinfo_all("localhost", NULL, &hints, &addrs); + if (ret || !addrs) + { + ereport(LOG, + (errmsg("could not resolve \"localhost\": %s", + gai_strerror(ret)))); + goto startup_failed; + } + + /* + * On some platforms, pg_getaddrinfo_all() may return multiple addresses + * only one of which will actually work (eg, both IPv6 and IPv4 addresses + * when kernel will reject IPv6). Worse, the failure may occur at the + * bind() or perhaps even connect() stage. So we must loop through the + * results till we find a working combination. We will generate LOG + * messages, but no error, for bogus combinations. + */ + for (addr = addrs; addr; addr = addr->ai_next) + { +#ifdef HAVE_UNIX_SOCKETS + /* Ignore AF_UNIX sockets, if any are returned. */ + if (addr->ai_family == AF_UNIX) + continue; +#endif + + if (++tries > 1) + ereport(LOG, + (errmsg("trying another address for the statistics collector"))); + + /* + * Create the socket. + */ + if ((pgStatSock = socket(addr->ai_family, SOCK_DGRAM, 0)) == PGINVALID_SOCKET) + { + ereport(LOG, + (errcode_for_socket_access(), + errmsg("could not create socket for statistics collector: %m"))); + continue; + } + + /* + * Bind it to a kernel assigned port on localhost and get the assigned + * port via getsockname(). + */ + if (bind(pgStatSock, addr->ai_addr, addr->ai_addrlen) < 0) + { + ereport(LOG, + (errcode_for_socket_access(), + errmsg("could not bind socket for statistics collector: %m"))); + closesocket(pgStatSock); + pgStatSock = PGINVALID_SOCKET; + continue; + } + + alen = sizeof(pgStatAddr); + if (getsockname(pgStatSock, (struct sockaddr *) &pgStatAddr, &alen) < 0) + { + ereport(LOG, + (errcode_for_socket_access(), + errmsg("could not get address of socket for statistics collector: %m"))); + closesocket(pgStatSock); + pgStatSock = PGINVALID_SOCKET; + continue; + } + + /* + * Connect the socket to its own address. This saves a few cycles by + * not having to respecify the target address on every send. 
This also + * provides a kernel-level check that only packets from this same + * address will be received. + */ + if (connect(pgStatSock, (struct sockaddr *) &pgStatAddr, alen) < 0) + { + ereport(LOG, + (errcode_for_socket_access(), + errmsg("could not connect socket for statistics collector: %m"))); + closesocket(pgStatSock); + pgStatSock = PGINVALID_SOCKET; + continue; + } + + /* + * Try to send and receive a one-byte test message on the socket. This + * is to catch situations where the socket can be created but will not + * actually pass data (for instance, because kernel packet filtering + * rules prevent it). + */ + test_byte = TESTBYTEVAL; + +retry1: + if (send(pgStatSock, &test_byte, 1, 0) != 1) + { + if (errno == EINTR) + goto retry1; /* if interrupted, just retry */ + ereport(LOG, + (errcode_for_socket_access(), + errmsg("could not send test message on socket for statistics collector: %m"))); + closesocket(pgStatSock); + pgStatSock = PGINVALID_SOCKET; + continue; + } + + /* + * There could possibly be a little delay before the message can be + * received. We arbitrarily allow up to half a second before deciding + * it's broken. + */ + for (;;) /* need a loop to handle EINTR */ + { + FD_ZERO(&rset); + FD_SET(pgStatSock, &rset); + + tv.tv_sec = 0; + tv.tv_usec = 500000; + sel_res = select(pgStatSock + 1, &rset, NULL, NULL, &tv); + if (sel_res >= 0 || errno != EINTR) + break; + } + if (sel_res < 0) + { + ereport(LOG, + (errcode_for_socket_access(), + errmsg("select() failed in statistics collector: %m"))); + closesocket(pgStatSock); + pgStatSock = PGINVALID_SOCKET; + continue; + } + if (sel_res == 0 || !FD_ISSET(pgStatSock, &rset)) + { + /* + * This is the case we actually think is likely, so take pains to + * give a specific message for it. + * + * errno will not be set meaningfully here, so don't use it. + */ + ereport(LOG, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("test message did not get through on socket for statistics collector"))); + closesocket(pgStatSock); + pgStatSock = PGINVALID_SOCKET; + continue; + } + + test_byte++; /* just make sure variable is changed */ + +retry2: + if (recv(pgStatSock, &test_byte, 1, 0) != 1) + { + if (errno == EINTR) + goto retry2; /* if interrupted, just retry */ + ereport(LOG, + (errcode_for_socket_access(), + errmsg("could not receive test message on socket for statistics collector: %m"))); + closesocket(pgStatSock); + pgStatSock = PGINVALID_SOCKET; + continue; + } + + if (test_byte != TESTBYTEVAL) /* strictly paranoia ... */ + { + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("incorrect test message transmission on socket for statistics collector"))); + closesocket(pgStatSock); + pgStatSock = PGINVALID_SOCKET; + continue; + } + + /* If we get here, we have a working socket */ + break; + } + + /* Did we find a working address? */ + if (!addr || pgStatSock == PGINVALID_SOCKET) + goto startup_failed; + + /* + * Set the socket to non-blocking IO. This ensures that if the collector + * falls behind, statistics messages will be discarded; backends won't + * block waiting to send messages to the collector. + */ + if (!pg_set_noblock(pgStatSock)) + { + ereport(LOG, + (errcode_for_socket_access(), + errmsg("could not set statistics collector socket to nonblocking mode: %m"))); + goto startup_failed; + } + + /* + * Try to ensure that the socket's receive buffer is at least + * PGSTAT_MIN_RCVBUF bytes, so that it won't easily overflow and lose + * data. 
Use of UDP protocol means that we are willing to lose data under + * heavy load, but we don't want it to happen just because of ridiculously + * small default buffer sizes (such as 8KB on older Windows versions). + */ + { + int old_rcvbuf; + int new_rcvbuf; + ACCEPT_TYPE_ARG3 rcvbufsize = sizeof(old_rcvbuf); + + if (getsockopt(pgStatSock, SOL_SOCKET, SO_RCVBUF, + (char *) &old_rcvbuf, &rcvbufsize) < 0) + { + ereport(LOG, + (errmsg("%s(%s) failed: %m", "getsockopt", "SO_RCVBUF"))); + /* if we can't get existing size, always try to set it */ + old_rcvbuf = 0; + } + + new_rcvbuf = PGSTAT_MIN_RCVBUF; + if (old_rcvbuf < new_rcvbuf) + { + if (setsockopt(pgStatSock, SOL_SOCKET, SO_RCVBUF, + (char *) &new_rcvbuf, sizeof(new_rcvbuf)) < 0) + ereport(LOG, + (errmsg("%s(%s) failed: %m", "setsockopt", "SO_RCVBUF"))); + } + } + + pg_freeaddrinfo_all(hints.ai_family, addrs); + + /* Now that we have a long-lived socket, tell fd.c about it. */ + ReserveExternalFD(); + + return; + +startup_failed: + ereport(LOG, + (errmsg("disabling statistics collector for lack of working socket"))); + + if (addrs) + pg_freeaddrinfo_all(hints.ai_family, addrs); + + if (pgStatSock != PGINVALID_SOCKET) + closesocket(pgStatSock); + pgStatSock = PGINVALID_SOCKET; + + /* + * Adjust GUC variables to suppress useless activity, and for debugging + * purposes (seeing track_counts off is a clue that we failed here). We + * use PGC_S_OVERRIDE because there is no point in trying to turn it back + * on from postgresql.conf without a restart. + */ + SetConfigOption("track_counts", "off", PGC_INTERNAL, PGC_S_OVERRIDE); +} + +/* + * subroutine for pgstat_reset_all + */ +static void +pgstat_reset_remove_files(const char *directory) +{ + DIR *dir; + struct dirent *entry; + char fname[MAXPGPATH * 2]; + + dir = AllocateDir(directory); + while ((entry = ReadDir(dir, directory)) != NULL) + { + int nchars; + Oid tmp_oid; + + /* + * Skip directory entries that don't match the file names we write. + * See get_dbstat_filename for the database-specific pattern. + */ + if (strncmp(entry->d_name, "global.", 7) == 0) + nchars = 7; + else + { + nchars = 0; + (void) sscanf(entry->d_name, "db_%u.%n", + &tmp_oid, &nchars); + if (nchars <= 0) + continue; + /* %u allows leading whitespace, so reject that */ + if (strchr("0123456789", entry->d_name[3]) == NULL) + continue; + } + + if (strcmp(entry->d_name + nchars, "tmp") != 0 && + strcmp(entry->d_name + nchars, "stat") != 0) + continue; + + snprintf(fname, sizeof(fname), "%s/%s", directory, + entry->d_name); + unlink(fname); + } + FreeDir(dir); +} + +/* + * pgstat_reset_all() - + * + * Remove the stats files. This is currently used only if WAL + * recovery is needed after a crash. + */ +void +pgstat_reset_all(void) +{ + pgstat_reset_remove_files(pgstat_stat_directory); + pgstat_reset_remove_files(PGSTAT_STAT_PERMANENT_DIRECTORY); +} + +#ifdef EXEC_BACKEND + +/* + * pgstat_forkexec() - + * + * Format up the arglist for, then fork and exec, statistics collector process + */ +static pid_t +pgstat_forkexec(void) +{ + char *av[10]; + int ac = 0; + + av[ac++] = "postgres"; + av[ac++] = "--forkcol"; + av[ac++] = NULL; /* filled in by postmaster_forkexec */ + + av[ac] = NULL; + Assert(ac < lengthof(av)); + + return postmaster_forkexec(ac, av); +} +#endif /* EXEC_BACKEND */ + + +/* + * pgstat_start() - + * + * Called from postmaster at startup or after an existing collector + * died. Attempt to fire up a fresh statistics collector. + * + * Returns PID of child process, or 0 if fail. 
+ * + * Note: if fail, we will be called again from the postmaster main loop. + */ +int +pgstat_start(void) +{ + time_t curtime; + pid_t pgStatPid; + + /* + * Check that the socket is there, else pgstat_init failed and we can do + * nothing useful. + */ + if (pgStatSock == PGINVALID_SOCKET) + return 0; + + /* + * Do nothing if too soon since last collector start. This is a safety + * valve to protect against continuous respawn attempts if the collector + * is dying immediately at launch. Note that since we will be re-called + * from the postmaster main loop, we will get another chance later. + */ + curtime = time(NULL); + if ((unsigned int) (curtime - last_pgstat_start_time) < + (unsigned int) PGSTAT_RESTART_INTERVAL) + return 0; + last_pgstat_start_time = curtime; + + /* + * Okay, fork off the collector. + */ +#ifdef EXEC_BACKEND + switch ((pgStatPid = pgstat_forkexec())) +#else + switch ((pgStatPid = fork_process())) +#endif + { + case -1: + ereport(LOG, + (errmsg("could not fork statistics collector: %m"))); + return 0; + +#ifndef EXEC_BACKEND + case 0: + /* in postmaster child ... */ + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(false); + + /* Drop our connection to postmaster's shared memory, as well */ + dsm_detach_all(); + PGSharedMemoryDetach(); + + PgstatCollectorMain(0, NULL); + break; +#endif + + default: + return (int) pgStatPid; + } + + /* shouldn't get here */ + return 0; +} + +void +allow_immediate_pgstat_restart(void) +{ + last_pgstat_start_time = 0; +} + +/* ------------------------------------------------------------ + * Public functions used by backends follow + *------------------------------------------------------------ + */ + + +/* ---------- + * pgstat_report_stat() - + * + * Must be called by processes that performs DML: tcop/postgres.c, logical + * receiver processes, SPI worker, etc. to send the so far collected + * per-table and function usage statistics to the collector. Note that this + * is called only when not within a transaction, so it is fair to use + * transaction stop time as an approximation of current time. + * + * "disconnect" is "true" only for the last call before the backend + * exits. This makes sure that no data is lost and that interrupted + * sessions are reported correctly. + * ---------- + */ +void +pgstat_report_stat(bool disconnect) +{ + /* we assume this inits to all zeroes: */ + static const PgStat_TableCounts all_zeroes; + static TimestampTz last_report = 0; + + TimestampTz now; + PgStat_MsgTabstat regular_msg; + PgStat_MsgTabstat shared_msg; + TabStatusArray *tsa; + int i; + + /* + * Don't expend a clock check if nothing to do. + * + * To determine whether any WAL activity has occurred since last time, not + * only the number of generated WAL records but also the numbers of WAL + * writes and syncs need to be checked. Because even transaction that + * generates no WAL records can write or sync WAL data when flushing the + * data pages. + */ + if ((pgStatTabList == NULL || pgStatTabList->tsa_used == 0) && + pgStatXactCommit == 0 && pgStatXactRollback == 0 && + pgWalUsage.wal_records == prevWalUsage.wal_records && + WalStats.m_wal_write == 0 && WalStats.m_wal_sync == 0 && + !have_function_stats && !disconnect) + return; + + /* + * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL + * msec since we last sent one, or the backend is about to exit. 
+ */ + now = GetCurrentTransactionStopTimestamp(); + if (!disconnect && + !TimestampDifferenceExceeds(last_report, now, PGSTAT_STAT_INTERVAL)) + return; + + last_report = now; + + if (disconnect) + pgstat_report_disconnect(MyDatabaseId); + + /* + * Destroy pgStatTabHash before we start invalidating PgStat_TableEntry + * entries it points to. (Should we fail partway through the loop below, + * it's okay to have removed the hashtable already --- the only + * consequence is we'd get multiple entries for the same table in the + * pgStatTabList, and that's safe.) + */ + if (pgStatTabHash) + hash_destroy(pgStatTabHash); + pgStatTabHash = NULL; + + /* + * Scan through the TabStatusArray struct(s) to find tables that actually + * have counts, and build messages to send. We have to separate shared + * relations from regular ones because the databaseid field in the message + * header has to depend on that. + */ + regular_msg.m_databaseid = MyDatabaseId; + shared_msg.m_databaseid = InvalidOid; + regular_msg.m_nentries = 0; + shared_msg.m_nentries = 0; + + for (tsa = pgStatTabList; tsa != NULL; tsa = tsa->tsa_next) + { + for (i = 0; i < tsa->tsa_used; i++) + { + PgStat_TableStatus *entry = &tsa->tsa_entries[i]; + PgStat_MsgTabstat *this_msg; + PgStat_TableEntry *this_ent; + + /* Shouldn't have any pending transaction-dependent counts */ + Assert(entry->trans == NULL); + + /* + * Ignore entries that didn't accumulate any actual counts, such + * as indexes that were opened by the planner but not used. + */ + if (memcmp(&entry->t_counts, &all_zeroes, + sizeof(PgStat_TableCounts)) == 0) + continue; + + /* + * OK, insert data into the appropriate message, and send if full. + */ + this_msg = entry->t_shared ? &shared_msg : ®ular_msg; + this_ent = &this_msg->m_entry[this_msg->m_nentries]; + this_ent->t_id = entry->t_id; + memcpy(&this_ent->t_counts, &entry->t_counts, + sizeof(PgStat_TableCounts)); + if (++this_msg->m_nentries >= PGSTAT_NUM_TABENTRIES) + { + pgstat_send_tabstat(this_msg, now); + this_msg->m_nentries = 0; + } + } + /* zero out PgStat_TableStatus structs after use */ + MemSet(tsa->tsa_entries, 0, + tsa->tsa_used * sizeof(PgStat_TableStatus)); + tsa->tsa_used = 0; + } + + /* + * Send partial messages. Make sure that any pending xact commit/abort + * and connection stats get counted, even if there are no table stats to + * send. 
+ */ + if (regular_msg.m_nentries > 0 || + pgStatXactCommit > 0 || pgStatXactRollback > 0 || disconnect) + pgstat_send_tabstat(®ular_msg, now); + if (shared_msg.m_nentries > 0) + pgstat_send_tabstat(&shared_msg, now); + + /* Now, send function statistics */ + pgstat_send_funcstats(); + + /* Send WAL statistics */ + pgstat_send_wal(true); + + /* Finally send SLRU statistics */ + pgstat_send_slru(); +} + +/* + * Subroutine for pgstat_report_stat: finish and send a tabstat message + */ +static void +pgstat_send_tabstat(PgStat_MsgTabstat *tsmsg, TimestampTz now) +{ + int n; + int len; + + /* It's unlikely we'd get here with no socket, but maybe not impossible */ + if (pgStatSock == PGINVALID_SOCKET) + return; + + /* + * Report and reset accumulated xact commit/rollback and I/O timings + * whenever we send a normal tabstat message + */ + if (OidIsValid(tsmsg->m_databaseid)) + { + tsmsg->m_xact_commit = pgStatXactCommit; + tsmsg->m_xact_rollback = pgStatXactRollback; + tsmsg->m_block_read_time = pgStatBlockReadTime; + tsmsg->m_block_write_time = pgStatBlockWriteTime; + + if (pgstat_should_report_connstat()) + { + long secs; + int usecs; + + /* + * pgLastSessionReportTime is initialized to MyStartTimestamp by + * pgstat_report_connect(). + */ + TimestampDifference(pgLastSessionReportTime, now, &secs, &usecs); + pgLastSessionReportTime = now; + tsmsg->m_session_time = (PgStat_Counter) secs * 1000000 + usecs; + tsmsg->m_active_time = pgStatActiveTime; + tsmsg->m_idle_in_xact_time = pgStatTransactionIdleTime; + } + else + { + tsmsg->m_session_time = 0; + tsmsg->m_active_time = 0; + tsmsg->m_idle_in_xact_time = 0; + } + pgStatXactCommit = 0; + pgStatXactRollback = 0; + pgStatBlockReadTime = 0; + pgStatBlockWriteTime = 0; + pgStatActiveTime = 0; + pgStatTransactionIdleTime = 0; + } + else + { + tsmsg->m_xact_commit = 0; + tsmsg->m_xact_rollback = 0; + tsmsg->m_block_read_time = 0; + tsmsg->m_block_write_time = 0; + tsmsg->m_session_time = 0; + tsmsg->m_active_time = 0; + tsmsg->m_idle_in_xact_time = 0; + } + + n = tsmsg->m_nentries; + len = offsetof(PgStat_MsgTabstat, m_entry[0]) + + n * sizeof(PgStat_TableEntry); + + pgstat_setheader(&tsmsg->m_hdr, PGSTAT_MTYPE_TABSTAT); + pgstat_send(tsmsg, len); +} + +/* + * Subroutine for pgstat_report_stat: populate and send a function stat message + */ +static void +pgstat_send_funcstats(void) +{ + /* we assume this inits to all zeroes: */ + static const PgStat_FunctionCounts all_zeroes; + + PgStat_MsgFuncstat msg; + PgStat_BackendFunctionEntry *entry; + HASH_SEQ_STATUS fstat; + + if (pgStatFunctions == NULL) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_FUNCSTAT); + msg.m_databaseid = MyDatabaseId; + msg.m_nentries = 0; + + hash_seq_init(&fstat, pgStatFunctions); + while ((entry = (PgStat_BackendFunctionEntry *) hash_seq_search(&fstat)) != NULL) + { + PgStat_FunctionEntry *m_ent; + + /* Skip it if no counts accumulated since last time */ + if (memcmp(&entry->f_counts, &all_zeroes, + sizeof(PgStat_FunctionCounts)) == 0) + continue; + + /* need to convert format of time accumulators */ + m_ent = &msg.m_entry[msg.m_nentries]; + m_ent->f_id = entry->f_id; + m_ent->f_numcalls = entry->f_counts.f_numcalls; + m_ent->f_total_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_total_time); + m_ent->f_self_time = INSTR_TIME_GET_MICROSEC(entry->f_counts.f_self_time); + + if (++msg.m_nentries >= PGSTAT_NUM_FUNCENTRIES) + { + pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) + + msg.m_nentries * sizeof(PgStat_FunctionEntry)); + msg.m_nentries = 0; + } + + /* 
reset the entry's counts */ + MemSet(&entry->f_counts, 0, sizeof(PgStat_FunctionCounts)); + } + + if (msg.m_nentries > 0) + pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) + + msg.m_nentries * sizeof(PgStat_FunctionEntry)); + + have_function_stats = false; +} + + +/* ---------- + * pgstat_vacuum_stat() - + * + * Will tell the collector about objects he can get rid of. + * ---------- + */ +void +pgstat_vacuum_stat(void) +{ + HTAB *htab; + PgStat_MsgTabpurge msg; + PgStat_MsgFuncpurge f_msg; + HASH_SEQ_STATUS hstat; + PgStat_StatDBEntry *dbentry; + PgStat_StatTabEntry *tabentry; + PgStat_StatFuncEntry *funcentry; + int len; + + if (pgStatSock == PGINVALID_SOCKET) + return; + + /* + * If not done for this transaction, read the statistics collector stats + * file into some hash tables. + */ + backend_read_statsfile(); + + /* + * Read pg_database and make a list of OIDs of all existing databases + */ + htab = pgstat_collect_oids(DatabaseRelationId, Anum_pg_database_oid); + + /* + * Search the database hash table for dead databases and tell the + * collector to drop them. + */ + hash_seq_init(&hstat, pgStatDBHash); + while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL) + { + Oid dbid = dbentry->databaseid; + + CHECK_FOR_INTERRUPTS(); + + /* the DB entry for shared tables (with InvalidOid) is never dropped */ + if (OidIsValid(dbid) && + hash_search(htab, (void *) &dbid, HASH_FIND, NULL) == NULL) + pgstat_drop_database(dbid); + } + + /* Clean up */ + hash_destroy(htab); + + /* + * Search for all the dead replication slots in stats hashtable and tell + * the stats collector to drop them. + */ + if (replSlotStatHash) + { + PgStat_StatReplSlotEntry *slotentry; + + hash_seq_init(&hstat, replSlotStatHash); + while ((slotentry = (PgStat_StatReplSlotEntry *) hash_seq_search(&hstat)) != NULL) + { + CHECK_FOR_INTERRUPTS(); + + if (SearchNamedReplicationSlot(NameStr(slotentry->slotname), true) == NULL) + pgstat_report_replslot_drop(NameStr(slotentry->slotname)); + } + } + + /* + * Lookup our own database entry; if not found, nothing more to do. + */ + dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash, + (void *) &MyDatabaseId, + HASH_FIND, NULL); + if (dbentry == NULL || dbentry->tables == NULL) + return; + + /* + * Similarly to above, make a list of all known relations in this DB. + */ + htab = pgstat_collect_oids(RelationRelationId, Anum_pg_class_oid); + + /* + * Initialize our messages table counter to zero + */ + msg.m_nentries = 0; + + /* + * Check for all tables listed in stats hashtable if they still exist. 
+ */ + hash_seq_init(&hstat, dbentry->tables); + while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&hstat)) != NULL) + { + Oid tabid = tabentry->tableid; + + CHECK_FOR_INTERRUPTS(); + + if (hash_search(htab, (void *) &tabid, HASH_FIND, NULL) != NULL) + continue; + + /* + * Not there, so add this table's Oid to the message + */ + msg.m_tableid[msg.m_nentries++] = tabid; + + /* + * If the message is full, send it out and reinitialize to empty + */ + if (msg.m_nentries >= PGSTAT_NUM_TABPURGE) + { + len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) + + msg.m_nentries * sizeof(Oid); + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE); + msg.m_databaseid = MyDatabaseId; + pgstat_send(&msg, len); + + msg.m_nentries = 0; + } + } + + /* + * Send the rest + */ + if (msg.m_nentries > 0) + { + len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) + + msg.m_nentries * sizeof(Oid); + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE); + msg.m_databaseid = MyDatabaseId; + pgstat_send(&msg, len); + } + + /* Clean up */ + hash_destroy(htab); + + /* + * Now repeat the above steps for functions. However, we needn't bother + * in the common case where no function stats are being collected. + */ + if (dbentry->functions != NULL && + hash_get_num_entries(dbentry->functions) > 0) + { + htab = pgstat_collect_oids(ProcedureRelationId, Anum_pg_proc_oid); + + pgstat_setheader(&f_msg.m_hdr, PGSTAT_MTYPE_FUNCPURGE); + f_msg.m_databaseid = MyDatabaseId; + f_msg.m_nentries = 0; + + hash_seq_init(&hstat, dbentry->functions); + while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&hstat)) != NULL) + { + Oid funcid = funcentry->functionid; + + CHECK_FOR_INTERRUPTS(); + + if (hash_search(htab, (void *) &funcid, HASH_FIND, NULL) != NULL) + continue; + + /* + * Not there, so add this function's Oid to the message + */ + f_msg.m_functionid[f_msg.m_nentries++] = funcid; + + /* + * If the message is full, send it out and reinitialize to empty + */ + if (f_msg.m_nentries >= PGSTAT_NUM_FUNCPURGE) + { + len = offsetof(PgStat_MsgFuncpurge, m_functionid[0]) + + f_msg.m_nentries * sizeof(Oid); + + pgstat_send(&f_msg, len); + + f_msg.m_nentries = 0; + } + } + + /* + * Send the rest + */ + if (f_msg.m_nentries > 0) + { + len = offsetof(PgStat_MsgFuncpurge, m_functionid[0]) + + f_msg.m_nentries * sizeof(Oid); + + pgstat_send(&f_msg, len); + } + + hash_destroy(htab); + } +} + + +/* ---------- + * pgstat_collect_oids() - + * + * Collect the OIDs of all objects listed in the specified system catalog + * into a temporary hash table. Caller should hash_destroy the result + * when done with it. (However, we make the table in CurrentMemoryContext + * so that it will be freed properly in event of an error.) 
+ * ---------- + */ +static HTAB * +pgstat_collect_oids(Oid catalogid, AttrNumber anum_oid) +{ + HTAB *htab; + HASHCTL hash_ctl; + Relation rel; + TableScanDesc scan; + HeapTuple tup; + Snapshot snapshot; + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(Oid); + hash_ctl.hcxt = CurrentMemoryContext; + htab = hash_create("Temporary table of OIDs", + PGSTAT_TAB_HASH_SIZE, + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + rel = table_open(catalogid, AccessShareLock); + snapshot = RegisterSnapshot(GetLatestSnapshot()); + scan = table_beginscan(rel, snapshot, 0, NULL); + while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + Oid thisoid; + bool isnull; + + thisoid = heap_getattr(tup, anum_oid, RelationGetDescr(rel), &isnull); + Assert(!isnull); + + CHECK_FOR_INTERRUPTS(); + + (void) hash_search(htab, (void *) &thisoid, HASH_ENTER, NULL); + } + table_endscan(scan); + UnregisterSnapshot(snapshot); + table_close(rel, AccessShareLock); + + return htab; +} + + +/* ---------- + * pgstat_drop_database() - + * + * Tell the collector that we just dropped a database. + * (If the message gets lost, we will still clean the dead DB eventually + * via future invocations of pgstat_vacuum_stat().) + * ---------- + */ +void +pgstat_drop_database(Oid databaseid) +{ + PgStat_MsgDropdb msg; + + if (pgStatSock == PGINVALID_SOCKET) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DROPDB); + msg.m_databaseid = databaseid; + pgstat_send(&msg, sizeof(msg)); +} + + +/* ---------- + * pgstat_drop_relation() - + * + * Tell the collector that we just dropped a relation. + * (If the message gets lost, we will still clean the dead entry eventually + * via future invocations of pgstat_vacuum_stat().) + * + * Currently not used for lack of any good place to call it; we rely + * entirely on pgstat_vacuum_stat() to clean out stats for dead rels. + * ---------- + */ +#ifdef NOT_USED +void +pgstat_drop_relation(Oid relid) +{ + PgStat_MsgTabpurge msg; + int len; + + if (pgStatSock == PGINVALID_SOCKET) + return; + + msg.m_tableid[0] = relid; + msg.m_nentries = 1; + + len = offsetof(PgStat_MsgTabpurge, m_tableid[0]) + sizeof(Oid); + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TABPURGE); + msg.m_databaseid = MyDatabaseId; + pgstat_send(&msg, len); +} +#endif /* NOT_USED */ + +/* ---------- + * pgstat_reset_counters() - + * + * Tell the statistics collector to reset counters for our database. + * + * Permission checking for this function is managed through the normal + * GRANT system. + * ---------- + */ +void +pgstat_reset_counters(void) +{ + PgStat_MsgResetcounter msg; + + if (pgStatSock == PGINVALID_SOCKET) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETCOUNTER); + msg.m_databaseid = MyDatabaseId; + pgstat_send(&msg, sizeof(msg)); +} + +/* ---------- + * pgstat_reset_shared_counters() - + * + * Tell the statistics collector to reset cluster-wide shared counters. + * + * Permission checking for this function is managed through the normal + * GRANT system. 
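+ * For illustration (assuming the standard SQL-callable wrapper
+ * pg_stat_reset_shared() in src/backend/utils/adt/pgstatfuncs.c), this is
+ * typically reached as:
+ *
+ *     SELECT pg_stat_reset_shared('bgwriter');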
+ * ---------- + */ +void +pgstat_reset_shared_counters(const char *target) +{ + PgStat_MsgResetsharedcounter msg; + + if (pgStatSock == PGINVALID_SOCKET) + return; + + if (strcmp(target, "archiver") == 0) + msg.m_resettarget = RESET_ARCHIVER; + else if (strcmp(target, "bgwriter") == 0) + msg.m_resettarget = RESET_BGWRITER; + else if (strcmp(target, "wal") == 0) + msg.m_resettarget = RESET_WAL; + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized reset target: \"%s\"", target), + errhint("Target must be \"archiver\", \"bgwriter\", or \"wal\"."))); + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSHAREDCOUNTER); + pgstat_send(&msg, sizeof(msg)); +} + +/* ---------- + * pgstat_reset_single_counter() - + * + * Tell the statistics collector to reset a single counter. + * + * Permission checking for this function is managed through the normal + * GRANT system. + * ---------- + */ +void +pgstat_reset_single_counter(Oid objoid, PgStat_Single_Reset_Type type) +{ + PgStat_MsgResetsinglecounter msg; + + if (pgStatSock == PGINVALID_SOCKET) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSINGLECOUNTER); + msg.m_databaseid = MyDatabaseId; + msg.m_resettype = type; + msg.m_objectid = objoid; + + pgstat_send(&msg, sizeof(msg)); +} + +/* ---------- + * pgstat_reset_slru_counter() - + * + * Tell the statistics collector to reset a single SLRU counter, or all + * SLRU counters (when name is null). + * + * Permission checking for this function is managed through the normal + * GRANT system. + * ---------- + */ +void +pgstat_reset_slru_counter(const char *name) +{ + PgStat_MsgResetslrucounter msg; + + if (pgStatSock == PGINVALID_SOCKET) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETSLRUCOUNTER); + msg.m_index = (name) ? pgstat_slru_index(name) : -1; + + pgstat_send(&msg, sizeof(msg)); +} + +/* ---------- + * pgstat_reset_replslot_counter() - + * + * Tell the statistics collector to reset a single replication slot + * counter, or all replication slots counters (when name is null). + * + * Permission checking for this function is managed through the normal + * GRANT system. + * ---------- + */ +void +pgstat_reset_replslot_counter(const char *name) +{ + PgStat_MsgResetreplslotcounter msg; + + if (pgStatSock == PGINVALID_SOCKET) + return; + + if (name) + { + namestrcpy(&msg.m_slotname, name); + msg.clearall = false; + } + else + msg.clearall = true; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RESETREPLSLOTCOUNTER); + + pgstat_send(&msg, sizeof(msg)); +} + +/* ---------- + * pgstat_report_autovac() - + * + * Called from autovacuum.c to report startup of an autovacuum process. + * We are called before InitPostgres is done, so can't rely on MyDatabaseId; + * the db OID must be passed in, instead. + * ---------- + */ +void +pgstat_report_autovac(Oid dboid) +{ + PgStat_MsgAutovacStart msg; + + if (pgStatSock == PGINVALID_SOCKET) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_AUTOVAC_START); + msg.m_databaseid = dboid; + msg.m_start_time = GetCurrentTimestamp(); + + pgstat_send(&msg, sizeof(msg)); +} + + +/* --------- + * pgstat_report_vacuum() - + * + * Tell the collector about the table we just vacuumed. + * --------- + */ +void +pgstat_report_vacuum(Oid tableoid, bool shared, + PgStat_Counter livetuples, PgStat_Counter deadtuples) +{ + PgStat_MsgVacuum msg; + + if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_VACUUM); + msg.m_databaseid = shared ? 
InvalidOid : MyDatabaseId; + msg.m_tableoid = tableoid; + msg.m_autovacuum = IsAutoVacuumWorkerProcess(); + msg.m_vacuumtime = GetCurrentTimestamp(); + msg.m_live_tuples = livetuples; + msg.m_dead_tuples = deadtuples; + pgstat_send(&msg, sizeof(msg)); +} + +/* -------- + * pgstat_report_analyze() - + * + * Tell the collector about the table we just analyzed. + * + * Caller must provide new live- and dead-tuples estimates, as well as a + * flag indicating whether to reset the changes_since_analyze counter. + * -------- + */ +void +pgstat_report_analyze(Relation rel, + PgStat_Counter livetuples, PgStat_Counter deadtuples, + bool resetcounter) +{ + PgStat_MsgAnalyze msg; + + if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + return; + + /* + * Unlike VACUUM, ANALYZE might be running inside a transaction that has + * already inserted and/or deleted rows in the target table. ANALYZE will + * have counted such rows as live or dead respectively. Because we will + * report our counts of such rows at transaction end, we should subtract + * off these counts from what we send to the collector now, else they'll + * be double-counted after commit. (This approach also ensures that the + * collector ends up with the right numbers if we abort instead of + * committing.) + * + * Waste no time on partitioned tables, though. + */ + if (rel->pgstat_info != NULL && + rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + { + PgStat_TableXactStatus *trans; + + for (trans = rel->pgstat_info->trans; trans; trans = trans->upper) + { + livetuples -= trans->tuples_inserted - trans->tuples_deleted; + deadtuples -= trans->tuples_updated + trans->tuples_deleted; + } + /* count stuff inserted by already-aborted subxacts, too */ + deadtuples -= rel->pgstat_info->t_counts.t_delta_dead_tuples; + /* Since ANALYZE's counts are estimates, we could have underflowed */ + livetuples = Max(livetuples, 0); + deadtuples = Max(deadtuples, 0); + } + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ANALYZE); + msg.m_databaseid = rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId; + msg.m_tableoid = RelationGetRelid(rel); + msg.m_autovacuum = IsAutoVacuumWorkerProcess(); + msg.m_resetcounter = resetcounter; + msg.m_analyzetime = GetCurrentTimestamp(); + msg.m_live_tuples = livetuples; + msg.m_dead_tuples = deadtuples; + pgstat_send(&msg, sizeof(msg)); +} + +/* -------- + * pgstat_report_recovery_conflict() - + * + * Tell the collector about a Hot Standby recovery conflict. + * -------- + */ +void +pgstat_report_recovery_conflict(int reason) +{ + PgStat_MsgRecoveryConflict msg; + + if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_RECOVERYCONFLICT); + msg.m_databaseid = MyDatabaseId; + msg.m_reason = reason; + pgstat_send(&msg, sizeof(msg)); +} + +/* -------- + * pgstat_report_deadlock() - + * + * Tell the collector about a deadlock detected. + * -------- + */ +void +pgstat_report_deadlock(void) +{ + PgStat_MsgDeadlock msg; + + if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DEADLOCK); + msg.m_databaseid = MyDatabaseId; + pgstat_send(&msg, sizeof(msg)); +} + + + +/* -------- + * pgstat_report_checksum_failures_in_db() - + * + * Tell the collector about one or more checksum failures. 
+ * -------- + */ +void +pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount) +{ + PgStat_MsgChecksumFailure msg; + + if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CHECKSUMFAILURE); + msg.m_databaseid = dboid; + msg.m_failurecount = failurecount; + msg.m_failure_time = GetCurrentTimestamp(); + + pgstat_send(&msg, sizeof(msg)); +} + +/* -------- + * pgstat_report_checksum_failure() - + * + * Tell the collector about a checksum failure. + * -------- + */ +void +pgstat_report_checksum_failure(void) +{ + pgstat_report_checksum_failures_in_db(MyDatabaseId, 1); +} + +/* -------- + * pgstat_report_tempfile() - + * + * Tell the collector about a temporary file. + * -------- + */ +void +pgstat_report_tempfile(size_t filesize) +{ + PgStat_MsgTempFile msg; + + if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_TEMPFILE); + msg.m_databaseid = MyDatabaseId; + msg.m_filesize = filesize; + pgstat_send(&msg, sizeof(msg)); +} + +/* -------- + * pgstat_report_connect() - + * + * Tell the collector about a new connection. + * -------- + */ +void +pgstat_report_connect(Oid dboid) +{ + PgStat_MsgConnect msg; + + if (!pgstat_should_report_connstat()) + return; + + pgLastSessionReportTime = MyStartTimestamp; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CONNECT); + msg.m_databaseid = MyDatabaseId; + pgstat_send(&msg, sizeof(PgStat_MsgConnect)); +} + +/* -------- + * pgstat_report_disconnect() - + * + * Tell the collector about a disconnect. + * -------- + */ +static void +pgstat_report_disconnect(Oid dboid) +{ + PgStat_MsgDisconnect msg; + + if (!pgstat_should_report_connstat()) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DISCONNECT); + msg.m_databaseid = MyDatabaseId; + msg.m_cause = pgStatSessionEndCause; + pgstat_send(&msg, sizeof(PgStat_MsgDisconnect)); +} + +/* -------- + * pgstat_should_report_connstats() - + * + * We report session statistics only for normal backend processes. Parallel + * workers run in parallel, so they don't contribute to session times, even + * though they use CPU time. Walsender processes could be considered here, + * but they have different session characteristics from normal backends (for + * example, they are always "active"), so they would skew session statistics. + * ---------- + */ +static bool +pgstat_should_report_connstat(void) +{ + return MyBackendType == B_BACKEND; +} + +/* ---------- + * pgstat_report_replslot() - + * + * Tell the collector about replication slot statistics. + * ---------- + */ +void +pgstat_report_replslot(const PgStat_StatReplSlotEntry *repSlotStat) +{ + PgStat_MsgReplSlot msg; + + /* + * Prepare and send the message + */ + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_REPLSLOT); + namestrcpy(&msg.m_slotname, NameStr(repSlotStat->slotname)); + msg.m_create = false; + msg.m_drop = false; + msg.m_spill_txns = repSlotStat->spill_txns; + msg.m_spill_count = repSlotStat->spill_count; + msg.m_spill_bytes = repSlotStat->spill_bytes; + msg.m_stream_txns = repSlotStat->stream_txns; + msg.m_stream_count = repSlotStat->stream_count; + msg.m_stream_bytes = repSlotStat->stream_bytes; + msg.m_total_txns = repSlotStat->total_txns; + msg.m_total_bytes = repSlotStat->total_bytes; + pgstat_send(&msg, sizeof(PgStat_MsgReplSlot)); +} + +/* ---------- + * pgstat_report_replslot_create() - + * + * Tell the collector about creating the replication slot. 
+ * ---------- + */ +void +pgstat_report_replslot_create(const char *slotname) +{ + PgStat_MsgReplSlot msg; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_REPLSLOT); + namestrcpy(&msg.m_slotname, slotname); + msg.m_create = true; + msg.m_drop = false; + pgstat_send(&msg, sizeof(PgStat_MsgReplSlot)); +} + +/* ---------- + * pgstat_report_replslot_drop() - + * + * Tell the collector about dropping the replication slot. + * ---------- + */ +void +pgstat_report_replslot_drop(const char *slotname) +{ + PgStat_MsgReplSlot msg; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_REPLSLOT); + namestrcpy(&msg.m_slotname, slotname); + msg.m_create = false; + msg.m_drop = true; + pgstat_send(&msg, sizeof(PgStat_MsgReplSlot)); +} + +/* ---------- + * pgstat_ping() - + * + * Send some junk data to the collector to increase traffic. + * ---------- + */ +void +pgstat_ping(void) +{ + PgStat_MsgDummy msg; + + if (pgStatSock == PGINVALID_SOCKET) + return; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_DUMMY); + pgstat_send(&msg, sizeof(msg)); +} + +/* ---------- + * pgstat_send_inquiry() - + * + * Notify collector that we need fresh data. + * ---------- + */ +static void +pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time, Oid databaseid) +{ + PgStat_MsgInquiry msg; + + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY); + msg.clock_time = clock_time; + msg.cutoff_time = cutoff_time; + msg.databaseid = databaseid; + pgstat_send(&msg, sizeof(msg)); +} + + +/* + * Initialize function call usage data. + * Called by the executor before invoking a function. + */ +void +pgstat_init_function_usage(FunctionCallInfo fcinfo, + PgStat_FunctionCallUsage *fcu) +{ + PgStat_BackendFunctionEntry *htabent; + bool found; + + if (pgstat_track_functions <= fcinfo->flinfo->fn_stats) + { + /* stats not wanted */ + fcu->fs = NULL; + return; + } + + if (!pgStatFunctions) + { + /* First time through - initialize function stat table */ + HASHCTL hash_ctl; + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(PgStat_BackendFunctionEntry); + pgStatFunctions = hash_create("Function stat entries", + PGSTAT_FUNCTION_HASH_SIZE, + &hash_ctl, + HASH_ELEM | HASH_BLOBS); + } + + /* Get the stats entry for this function, create if necessary */ + htabent = hash_search(pgStatFunctions, &fcinfo->flinfo->fn_oid, + HASH_ENTER, &found); + if (!found) + MemSet(&htabent->f_counts, 0, sizeof(PgStat_FunctionCounts)); + + fcu->fs = &htabent->f_counts; + + /* save stats for this function, later used to compensate for recursion */ + fcu->save_f_total_time = htabent->f_counts.f_total_time; + + /* save current backend-wide total time */ + fcu->save_total = total_func_time; + + /* get clock time as of function start */ + INSTR_TIME_SET_CURRENT(fcu->f_start); +} + +/* + * find_funcstat_entry - find any existing PgStat_BackendFunctionEntry entry + * for specified function + * + * If no entry, return NULL, don't create a new one + */ +PgStat_BackendFunctionEntry * +find_funcstat_entry(Oid func_id) +{ + if (pgStatFunctions == NULL) + return NULL; + + return (PgStat_BackendFunctionEntry *) hash_search(pgStatFunctions, + (void *) &func_id, + HASH_FIND, NULL); +} + +/* + * Calculate function call usage and update stat counters. + * Called by the executor after invoking a function. + * + * In the case of a set-returning function that runs in value-per-call mode, + * we will see multiple pgstat_init_function_usage/pgstat_end_function_usage + * calls for what the user considers a single call of the function. 
The + * finalize flag should be TRUE on the last call. + */ +void +pgstat_end_function_usage(PgStat_FunctionCallUsage *fcu, bool finalize) +{ + PgStat_FunctionCounts *fs = fcu->fs; + instr_time f_total; + instr_time f_others; + instr_time f_self; + + /* stats not wanted? */ + if (fs == NULL) + return; + + /* total elapsed time in this function call */ + INSTR_TIME_SET_CURRENT(f_total); + INSTR_TIME_SUBTRACT(f_total, fcu->f_start); + + /* self usage: elapsed minus anything already charged to other calls */ + f_others = total_func_time; + INSTR_TIME_SUBTRACT(f_others, fcu->save_total); + f_self = f_total; + INSTR_TIME_SUBTRACT(f_self, f_others); + + /* update backend-wide total time */ + INSTR_TIME_ADD(total_func_time, f_self); + + /* + * Compute the new f_total_time as the total elapsed time added to the + * pre-call value of f_total_time. This is necessary to avoid + * double-counting any time taken by recursive calls of myself. (We do + * not need any similar kluge for self time, since that already excludes + * any recursive calls.) + */ + INSTR_TIME_ADD(f_total, fcu->save_f_total_time); + + /* update counters in function stats table */ + if (finalize) + fs->f_numcalls++; + fs->f_total_time = f_total; + INSTR_TIME_ADD(fs->f_self_time, f_self); + + /* indicate that we have something to send */ + have_function_stats = true; +} + + +/* ---------- + * pgstat_initstats() - + * + * Initialize a relcache entry to count access statistics. + * Called whenever a relation is opened. + * + * We assume that a relcache entry's pgstat_info field is zeroed by + * relcache.c when the relcache entry is made; thereafter it is long-lived + * data. We can avoid repeated searches of the TabStatus arrays when the + * same relation is touched repeatedly within a transaction. + * ---------- + */ +void +pgstat_initstats(Relation rel) +{ + Oid rel_id = rel->rd_id; + char relkind = rel->rd_rel->relkind; + + /* + * We only count stats for relations with storage and partitioned tables + */ + if (!RELKIND_HAS_STORAGE(relkind) && relkind != RELKIND_PARTITIONED_TABLE) + { + rel->pgstat_info = NULL; + return; + } + + if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) + { + /* We're not counting at all */ + rel->pgstat_info = NULL; + return; + } + + /* + * If we already set up this relation in the current transaction, nothing + * to do. + */ + if (rel->pgstat_info != NULL && + rel->pgstat_info->t_id == rel_id) + return; + + /* Else find or make the PgStat_TableStatus entry, and update link */ + rel->pgstat_info = get_tabstat_entry(rel_id, rel->rd_rel->relisshared); +} + +/* + * get_tabstat_entry - find or create a PgStat_TableStatus entry for rel + */ +static PgStat_TableStatus * +get_tabstat_entry(Oid rel_id, bool isshared) +{ + TabStatHashEntry *hash_entry; + PgStat_TableStatus *entry; + TabStatusArray *tsa; + bool found; + + /* + * Create hash table if we don't have it already. + */ + if (pgStatTabHash == NULL) + { + HASHCTL ctl; + + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(TabStatHashEntry); + + pgStatTabHash = hash_create("pgstat TabStatusArray lookup hash table", + TABSTAT_QUANTUM, + &ctl, + HASH_ELEM | HASH_BLOBS); + } + + /* + * Find an entry or create a new one. + */ + hash_entry = hash_search(pgStatTabHash, &rel_id, HASH_ENTER, &found); + if (!found) + { + /* initialize new entry with null pointer */ + hash_entry->tsa_entry = NULL; + } + + /* + * If entry is already valid, we're done. 
+ */ + if (hash_entry->tsa_entry) + return hash_entry->tsa_entry; + + /* + * Locate the first pgStatTabList entry with free space, making a new list + * entry if needed. Note that we could get an OOM failure here, but if so + * we have left the hashtable and the list in a consistent state. + */ + if (pgStatTabList == NULL) + { + /* Set up first pgStatTabList entry */ + pgStatTabList = (TabStatusArray *) + MemoryContextAllocZero(TopMemoryContext, + sizeof(TabStatusArray)); + } + + tsa = pgStatTabList; + while (tsa->tsa_used >= TABSTAT_QUANTUM) + { + if (tsa->tsa_next == NULL) + tsa->tsa_next = (TabStatusArray *) + MemoryContextAllocZero(TopMemoryContext, + sizeof(TabStatusArray)); + tsa = tsa->tsa_next; + } + + /* + * Allocate a PgStat_TableStatus entry within this list entry. We assume + * the entry was already zeroed, either at creation or after last use. + */ + entry = &tsa->tsa_entries[tsa->tsa_used++]; + entry->t_id = rel_id; + entry->t_shared = isshared; + + /* + * Now we can fill the entry in pgStatTabHash. + */ + hash_entry->tsa_entry = entry; + + return entry; +} + +/* + * find_tabstat_entry - find any existing PgStat_TableStatus entry for rel + * + * If no entry, return NULL, don't create a new one + * + * Note: if we got an error in the most recent execution of pgstat_report_stat, + * it's possible that an entry exists but there's no hashtable entry for it. + * That's okay, we'll treat this case as "doesn't exist". + */ +PgStat_TableStatus * +find_tabstat_entry(Oid rel_id) +{ + TabStatHashEntry *hash_entry; + + /* If hashtable doesn't exist, there are no entries at all */ + if (!pgStatTabHash) + return NULL; + + hash_entry = hash_search(pgStatTabHash, &rel_id, HASH_FIND, NULL); + if (!hash_entry) + return NULL; + + /* Note that this step could also return NULL, but that's correct */ + return hash_entry->tsa_entry; +} + +/* + * get_tabstat_stack_level - add a new (sub)transaction stack entry if needed + */ +static PgStat_SubXactStatus * +get_tabstat_stack_level(int nest_level) +{ + PgStat_SubXactStatus *xact_state; + + xact_state = pgStatXactStack; + if (xact_state == NULL || xact_state->nest_level != nest_level) + { + xact_state = (PgStat_SubXactStatus *) + MemoryContextAlloc(TopTransactionContext, + sizeof(PgStat_SubXactStatus)); + xact_state->nest_level = nest_level; + xact_state->prev = pgStatXactStack; + xact_state->first = NULL; + pgStatXactStack = xact_state; + } + return xact_state; +} + +/* + * add_tabstat_xact_level - add a new (sub)transaction state record + */ +static void +add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level) +{ + PgStat_SubXactStatus *xact_state; + PgStat_TableXactStatus *trans; + + /* + * If this is the first rel to be modified at the current nest level, we + * first have to push a transaction stack entry. 
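+ * For example (a sketch of the resulting links, not additional code): after
+ * modifying rel A and then rel B at nest level 2, pgStatXactStack points at
+ * the single level-2 PgStat_SubXactStatus, whose 'first' list holds B's and
+ * then A's PgStat_TableXactStatus records, while each rel's
+ * pgstat_info->trans points at its own level-2 record.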
+ */ + xact_state = get_tabstat_stack_level(nest_level); + + /* Now make a per-table stack entry */ + trans = (PgStat_TableXactStatus *) + MemoryContextAllocZero(TopTransactionContext, + sizeof(PgStat_TableXactStatus)); + trans->nest_level = nest_level; + trans->upper = pgstat_info->trans; + trans->parent = pgstat_info; + trans->next = xact_state->first; + xact_state->first = trans; + pgstat_info->trans = trans; +} + +/* + * pgstat_count_heap_insert - count a tuple insertion of n tuples + */ +void +pgstat_count_heap_insert(Relation rel, PgStat_Counter n) +{ + PgStat_TableStatus *pgstat_info = rel->pgstat_info; + + if (pgstat_info != NULL) + { + /* We have to log the effect at the proper transactional level */ + int nest_level = GetCurrentTransactionNestLevel(); + + if (pgstat_info->trans == NULL || + pgstat_info->trans->nest_level != nest_level) + add_tabstat_xact_level(pgstat_info, nest_level); + + pgstat_info->trans->tuples_inserted += n; + } +} + +/* + * pgstat_count_heap_update - count a tuple update + */ +void +pgstat_count_heap_update(Relation rel, bool hot) +{ + PgStat_TableStatus *pgstat_info = rel->pgstat_info; + + if (pgstat_info != NULL) + { + /* We have to log the effect at the proper transactional level */ + int nest_level = GetCurrentTransactionNestLevel(); + + if (pgstat_info->trans == NULL || + pgstat_info->trans->nest_level != nest_level) + add_tabstat_xact_level(pgstat_info, nest_level); + + pgstat_info->trans->tuples_updated++; + + /* t_tuples_hot_updated is nontransactional, so just advance it */ + if (hot) + pgstat_info->t_counts.t_tuples_hot_updated++; + } +} + +/* + * pgstat_count_heap_delete - count a tuple deletion + */ +void +pgstat_count_heap_delete(Relation rel) +{ + PgStat_TableStatus *pgstat_info = rel->pgstat_info; + + if (pgstat_info != NULL) + { + /* We have to log the effect at the proper transactional level */ + int nest_level = GetCurrentTransactionNestLevel(); + + if (pgstat_info->trans == NULL || + pgstat_info->trans->nest_level != nest_level) + add_tabstat_xact_level(pgstat_info, nest_level); + + pgstat_info->trans->tuples_deleted++; + } +} + +/* + * pgstat_truncate_save_counters + * + * Whenever a table is truncated, we save its i/u/d counters so that they can + * be cleared, and if the (sub)xact that executed the truncate later aborts, + * the counters can be restored to the saved (pre-truncate) values. Note we do + * this on the first truncate in any particular subxact level only. 
+ */ +static void +pgstat_truncate_save_counters(PgStat_TableXactStatus *trans) +{ + if (!trans->truncated) + { + trans->inserted_pre_trunc = trans->tuples_inserted; + trans->updated_pre_trunc = trans->tuples_updated; + trans->deleted_pre_trunc = trans->tuples_deleted; + trans->truncated = true; + } +} + +/* + * pgstat_truncate_restore_counters - restore counters when a truncate aborts + */ +static void +pgstat_truncate_restore_counters(PgStat_TableXactStatus *trans) +{ + if (trans->truncated) + { + trans->tuples_inserted = trans->inserted_pre_trunc; + trans->tuples_updated = trans->updated_pre_trunc; + trans->tuples_deleted = trans->deleted_pre_trunc; + } +} + +/* + * pgstat_count_truncate - update tuple counters due to truncate + */ +void +pgstat_count_truncate(Relation rel) +{ + PgStat_TableStatus *pgstat_info = rel->pgstat_info; + + if (pgstat_info != NULL) + { + /* We have to log the effect at the proper transactional level */ + int nest_level = GetCurrentTransactionNestLevel(); + + if (pgstat_info->trans == NULL || + pgstat_info->trans->nest_level != nest_level) + add_tabstat_xact_level(pgstat_info, nest_level); + + pgstat_truncate_save_counters(pgstat_info->trans); + pgstat_info->trans->tuples_inserted = 0; + pgstat_info->trans->tuples_updated = 0; + pgstat_info->trans->tuples_deleted = 0; + } +} + +/* + * pgstat_update_heap_dead_tuples - update dead-tuples count + * + * The semantics of this are that we are reporting the nontransactional + * recovery of "delta" dead tuples; so t_delta_dead_tuples decreases + * rather than increasing, and the change goes straight into the per-table + * counter, not into transactional state. + */ +void +pgstat_update_heap_dead_tuples(Relation rel, int delta) +{ + PgStat_TableStatus *pgstat_info = rel->pgstat_info; + + if (pgstat_info != NULL) + pgstat_info->t_counts.t_delta_dead_tuples -= delta; +} + + +/* ---------- + * AtEOXact_PgStat + * + * Called from access/transam/xact.c at top-level transaction commit/abort. + * ---------- + */ +void +AtEOXact_PgStat(bool isCommit, bool parallel) +{ + PgStat_SubXactStatus *xact_state; + + /* Don't count parallel worker transaction stats */ + if (!parallel) + { + /* + * Count transaction commit or abort. (We use counters, not just + * bools, in case the reporting message isn't sent right away.) + */ + if (isCommit) + pgStatXactCommit++; + else + pgStatXactRollback++; + } + + /* + * Transfer transactional insert/update counts into the base tabstat + * entries. We don't bother to free any of the transactional state, since + * it's all in TopTransactionContext and will go away anyway. 
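+ *
+ * The per-table arithmetic in the loop below can be restated compactly.
+ * The sketch is illustrative only, not PostgreSQL code (Delta and
+ * fold_delta are invented names); it just condenses how the live/dead/
+ * changed deltas are derived on commit versus abort:
+ *
+ *    typedef struct { long ins, upd, del; } Delta;
+ *
+ *    static void
+ *    fold_delta(Delta d, int committed,
+ *               long *live, long *dead, long *changed)
+ *    {
+ *        if (committed)
+ *        {
+ *            *live += d.ins - d.del;        // inserts add, deletes remove
+ *            *dead += d.upd + d.del;        // superseded versions are dead
+ *            *changed += d.ins + d.upd + d.del;
+ *        }
+ *        else
+ *            *dead += d.ins + d.upd;        // aborted work leaves dead tuples
+ *    }
+ *
+ * For example, a committed transaction with 5 inserts and 1 delete adds
+ * 4 live tuples, 1 dead tuple and 6 changed-tuple events; had the same
+ * transaction aborted, it would add 5 dead tuples instead.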
+ */ + xact_state = pgStatXactStack; + if (xact_state != NULL) + { + PgStat_TableXactStatus *trans; + + Assert(xact_state->nest_level == 1); + Assert(xact_state->prev == NULL); + for (trans = xact_state->first; trans != NULL; trans = trans->next) + { + PgStat_TableStatus *tabstat; + + Assert(trans->nest_level == 1); + Assert(trans->upper == NULL); + tabstat = trans->parent; + Assert(tabstat->trans == trans); + /* restore pre-truncate stats (if any) in case of aborted xact */ + if (!isCommit) + pgstat_truncate_restore_counters(trans); + /* count attempted actions regardless of commit/abort */ + tabstat->t_counts.t_tuples_inserted += trans->tuples_inserted; + tabstat->t_counts.t_tuples_updated += trans->tuples_updated; + tabstat->t_counts.t_tuples_deleted += trans->tuples_deleted; + if (isCommit) + { + tabstat->t_counts.t_truncated = trans->truncated; + if (trans->truncated) + { + /* forget live/dead stats seen by backend thus far */ + tabstat->t_counts.t_delta_live_tuples = 0; + tabstat->t_counts.t_delta_dead_tuples = 0; + } + /* insert adds a live tuple, delete removes one */ + tabstat->t_counts.t_delta_live_tuples += + trans->tuples_inserted - trans->tuples_deleted; + /* update and delete each create a dead tuple */ + tabstat->t_counts.t_delta_dead_tuples += + trans->tuples_updated + trans->tuples_deleted; + /* insert, update, delete each count as one change event */ + tabstat->t_counts.t_changed_tuples += + trans->tuples_inserted + trans->tuples_updated + + trans->tuples_deleted; + } + else + { + /* inserted tuples are dead, deleted tuples are unaffected */ + tabstat->t_counts.t_delta_dead_tuples += + trans->tuples_inserted + trans->tuples_updated; + /* an aborted xact generates no changed_tuple events */ + } + tabstat->trans = NULL; + } + } + pgStatXactStack = NULL; + + /* Make sure any stats snapshot is thrown away */ + pgstat_clear_snapshot(); +} + +/* ---------- + * AtEOSubXact_PgStat + * + * Called from access/transam/xact.c at subtransaction commit/abort. + * ---------- + */ +void +AtEOSubXact_PgStat(bool isCommit, int nestDepth) +{ + PgStat_SubXactStatus *xact_state; + + /* + * Transfer transactional insert/update counts into the next higher + * subtransaction state. 
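+ *
+ * In the simple (no-truncate) case that transfer is a plain element-wise
+ * merge of the child's deltas into its parent's.  A minimal sketch,
+ * illustrative only and not PostgreSQL code (Counts and merge_up are
+ * invented names):
+ *
+ *    typedef struct { long ins, upd, del; } Counts;
+ *
+ *    static void
+ *    merge_up(Counts *parent, const Counts *child, int child_truncated)
+ *    {
+ *        if (child_truncated)
+ *            *parent = *child;     // truncate supersedes the parent's counts
+ *        else
+ *        {
+ *            parent->ins += child->ins;
+ *            parent->upd += child->upd;
+ *            parent->del += child->del;
+ *        }
+ *    }
+ *
+ * The code below additionally saves the parent's pre-truncate values before
+ * replacing them, so they can be put back if the parent itself later aborts.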
+ */ + xact_state = pgStatXactStack; + if (xact_state != NULL && + xact_state->nest_level >= nestDepth) + { + PgStat_TableXactStatus *trans; + PgStat_TableXactStatus *next_trans; + + /* delink xact_state from stack immediately to simplify reuse case */ + pgStatXactStack = xact_state->prev; + + for (trans = xact_state->first; trans != NULL; trans = next_trans) + { + PgStat_TableStatus *tabstat; + + next_trans = trans->next; + Assert(trans->nest_level == nestDepth); + tabstat = trans->parent; + Assert(tabstat->trans == trans); + if (isCommit) + { + if (trans->upper && trans->upper->nest_level == nestDepth - 1) + { + if (trans->truncated) + { + /* propagate the truncate status one level up */ + pgstat_truncate_save_counters(trans->upper); + /* replace upper xact stats with ours */ + trans->upper->tuples_inserted = trans->tuples_inserted; + trans->upper->tuples_updated = trans->tuples_updated; + trans->upper->tuples_deleted = trans->tuples_deleted; + } + else + { + trans->upper->tuples_inserted += trans->tuples_inserted; + trans->upper->tuples_updated += trans->tuples_updated; + trans->upper->tuples_deleted += trans->tuples_deleted; + } + tabstat->trans = trans->upper; + pfree(trans); + } + else + { + /* + * When there isn't an immediate parent state, we can just + * reuse the record instead of going through a + * palloc/pfree pushup (this works since it's all in + * TopTransactionContext anyway). We have to re-link it + * into the parent level, though, and that might mean + * pushing a new entry into the pgStatXactStack. + */ + PgStat_SubXactStatus *upper_xact_state; + + upper_xact_state = get_tabstat_stack_level(nestDepth - 1); + trans->next = upper_xact_state->first; + upper_xact_state->first = trans; + trans->nest_level = nestDepth - 1; + } + } + else + { + /* + * On abort, update top-level tabstat counts, then forget the + * subtransaction + */ + + /* first restore values obliterated by truncate */ + pgstat_truncate_restore_counters(trans); + /* count attempted actions regardless of commit/abort */ + tabstat->t_counts.t_tuples_inserted += trans->tuples_inserted; + tabstat->t_counts.t_tuples_updated += trans->tuples_updated; + tabstat->t_counts.t_tuples_deleted += trans->tuples_deleted; + /* inserted tuples are dead, deleted tuples are unaffected */ + tabstat->t_counts.t_delta_dead_tuples += + trans->tuples_inserted + trans->tuples_updated; + tabstat->trans = trans->upper; + pfree(trans); + } + } + pfree(xact_state); + } +} + + +/* + * AtPrepare_PgStat + * Save the transactional stats state at 2PC transaction prepare. + * + * In this phase we just generate 2PC records for all the pending + * transaction-dependent stats work. 
+ */ +void +AtPrepare_PgStat(void) +{ + PgStat_SubXactStatus *xact_state; + + xact_state = pgStatXactStack; + if (xact_state != NULL) + { + PgStat_TableXactStatus *trans; + + Assert(xact_state->nest_level == 1); + Assert(xact_state->prev == NULL); + for (trans = xact_state->first; trans != NULL; trans = trans->next) + { + PgStat_TableStatus *tabstat; + TwoPhasePgStatRecord record; + + Assert(trans->nest_level == 1); + Assert(trans->upper == NULL); + tabstat = trans->parent; + Assert(tabstat->trans == trans); + + record.tuples_inserted = trans->tuples_inserted; + record.tuples_updated = trans->tuples_updated; + record.tuples_deleted = trans->tuples_deleted; + record.inserted_pre_trunc = trans->inserted_pre_trunc; + record.updated_pre_trunc = trans->updated_pre_trunc; + record.deleted_pre_trunc = trans->deleted_pre_trunc; + record.t_id = tabstat->t_id; + record.t_shared = tabstat->t_shared; + record.t_truncated = trans->truncated; + + RegisterTwoPhaseRecord(TWOPHASE_RM_PGSTAT_ID, 0, + &record, sizeof(TwoPhasePgStatRecord)); + } + } +} + +/* + * PostPrepare_PgStat + * Clean up after successful PREPARE. + * + * All we need do here is unlink the transaction stats state from the + * nontransactional state. The nontransactional action counts will be + * reported to the stats collector immediately, while the effects on live + * and dead tuple counts are preserved in the 2PC state file. + * + * Note: AtEOXact_PgStat is not called during PREPARE. + */ +void +PostPrepare_PgStat(void) +{ + PgStat_SubXactStatus *xact_state; + + /* + * We don't bother to free any of the transactional state, since it's all + * in TopTransactionContext and will go away anyway. + */ + xact_state = pgStatXactStack; + if (xact_state != NULL) + { + PgStat_TableXactStatus *trans; + + for (trans = xact_state->first; trans != NULL; trans = trans->next) + { + PgStat_TableStatus *tabstat; + + tabstat = trans->parent; + tabstat->trans = NULL; + } + } + pgStatXactStack = NULL; + + /* Make sure any stats snapshot is thrown away */ + pgstat_clear_snapshot(); +} + +/* + * 2PC processing routine for COMMIT PREPARED case. + * + * Load the saved counts into our local pgstats state. + */ +void +pgstat_twophase_postcommit(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata; + PgStat_TableStatus *pgstat_info; + + /* Find or create a tabstat entry for the rel */ + pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared); + + /* Same math as in AtEOXact_PgStat, commit case */ + pgstat_info->t_counts.t_tuples_inserted += rec->tuples_inserted; + pgstat_info->t_counts.t_tuples_updated += rec->tuples_updated; + pgstat_info->t_counts.t_tuples_deleted += rec->tuples_deleted; + pgstat_info->t_counts.t_truncated = rec->t_truncated; + if (rec->t_truncated) + { + /* forget live/dead stats seen by backend thus far */ + pgstat_info->t_counts.t_delta_live_tuples = 0; + pgstat_info->t_counts.t_delta_dead_tuples = 0; + } + pgstat_info->t_counts.t_delta_live_tuples += + rec->tuples_inserted - rec->tuples_deleted; + pgstat_info->t_counts.t_delta_dead_tuples += + rec->tuples_updated + rec->tuples_deleted; + pgstat_info->t_counts.t_changed_tuples += + rec->tuples_inserted + rec->tuples_updated + + rec->tuples_deleted; +} + +/* + * 2PC processing routine for ROLLBACK PREPARED case. + * + * Load the saved counts into our local pgstats state, but treat them + * as aborted. 
+ */ +void +pgstat_twophase_postabort(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata; + PgStat_TableStatus *pgstat_info; + + /* Find or create a tabstat entry for the rel */ + pgstat_info = get_tabstat_entry(rec->t_id, rec->t_shared); + + /* Same math as in AtEOXact_PgStat, abort case */ + if (rec->t_truncated) + { + rec->tuples_inserted = rec->inserted_pre_trunc; + rec->tuples_updated = rec->updated_pre_trunc; + rec->tuples_deleted = rec->deleted_pre_trunc; + } + pgstat_info->t_counts.t_tuples_inserted += rec->tuples_inserted; + pgstat_info->t_counts.t_tuples_updated += rec->tuples_updated; + pgstat_info->t_counts.t_tuples_deleted += rec->tuples_deleted; + pgstat_info->t_counts.t_delta_dead_tuples += + rec->tuples_inserted + rec->tuples_updated; +} + + +/* ---------- + * pgstat_fetch_stat_dbentry() - + * + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one database or NULL. NULL doesn't mean + * that the database doesn't exist, it is just not yet known by the + * collector, so the caller is better off to report ZERO instead. + * ---------- + */ +PgStat_StatDBEntry * +pgstat_fetch_stat_dbentry(Oid dbid) +{ + /* + * If not done for this transaction, read the statistics collector stats + * file into some hash tables. + */ + backend_read_statsfile(); + + /* + * Lookup the requested database; return NULL if not found + */ + return (PgStat_StatDBEntry *) hash_search(pgStatDBHash, + (void *) &dbid, + HASH_FIND, NULL); +} + + +/* ---------- + * pgstat_fetch_stat_tabentry() - + * + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one table or NULL. NULL doesn't mean + * that the table doesn't exist, it is just not yet known by the + * collector, so the caller is better off to report ZERO instead. + * ---------- + */ +PgStat_StatTabEntry * +pgstat_fetch_stat_tabentry(Oid relid) +{ + Oid dbid; + PgStat_StatDBEntry *dbentry; + PgStat_StatTabEntry *tabentry; + + /* + * If not done for this transaction, read the statistics collector stats + * file into some hash tables. + */ + backend_read_statsfile(); + + /* + * Lookup our database, then look in its table hash table. + */ + dbid = MyDatabaseId; + dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash, + (void *) &dbid, + HASH_FIND, NULL); + if (dbentry != NULL && dbentry->tables != NULL) + { + tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables, + (void *) &relid, + HASH_FIND, NULL); + if (tabentry) + return tabentry; + } + + /* + * If we didn't find it, maybe it's a shared table. + */ + dbid = InvalidOid; + dbentry = (PgStat_StatDBEntry *) hash_search(pgStatDBHash, + (void *) &dbid, + HASH_FIND, NULL); + if (dbentry != NULL && dbentry->tables != NULL) + { + tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables, + (void *) &relid, + HASH_FIND, NULL); + if (tabentry) + return tabentry; + } + + return NULL; +} + + +/* ---------- + * pgstat_fetch_stat_funcentry() - + * + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one function or NULL. + * ---------- + */ +PgStat_StatFuncEntry * +pgstat_fetch_stat_funcentry(Oid func_id) +{ + PgStat_StatDBEntry *dbentry; + PgStat_StatFuncEntry *funcentry = NULL; + + /* load the stats file if needed */ + backend_read_statsfile(); + + /* Lookup our database, then find the requested function. 
*/ + dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId); + if (dbentry != NULL && dbentry->functions != NULL) + { + funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions, + (void *) &func_id, + HASH_FIND, NULL); + } + + return funcentry; +} + + +/* + * --------- + * pgstat_fetch_stat_archiver() - + * + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the archiver statistics struct. + * --------- + */ +PgStat_ArchiverStats * +pgstat_fetch_stat_archiver(void) +{ + backend_read_statsfile(); + + return &archiverStats; +} + + +/* + * --------- + * pgstat_fetch_global() - + * + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the global statistics struct. + * --------- + */ +PgStat_GlobalStats * +pgstat_fetch_global(void) +{ + backend_read_statsfile(); + + return &globalStats; +} + +/* + * --------- + * pgstat_fetch_stat_wal() - + * + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the WAL statistics struct. + * --------- + */ +PgStat_WalStats * +pgstat_fetch_stat_wal(void) +{ + backend_read_statsfile(); + + return &walStats; +} + +/* + * --------- + * pgstat_fetch_slru() - + * + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the slru statistics struct. + * --------- + */ +PgStat_SLRUStats * +pgstat_fetch_slru(void) +{ + backend_read_statsfile(); + + return slruStats; +} + +/* + * --------- + * pgstat_fetch_replslot() - + * + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the replication slot statistics struct. + * --------- + */ +PgStat_StatReplSlotEntry * +pgstat_fetch_replslot(NameData slotname) +{ + backend_read_statsfile(); + + return pgstat_get_replslot_entry(slotname, false); +} + +/* + * Shut down a single backend's statistics reporting at process exit. + * + * Flush any remaining statistics counts out to the collector. + * Without this, operations triggered during backend exit (such as + * temp table deletions) won't be counted. + */ +static void +pgstat_shutdown_hook(int code, Datum arg) +{ + /* + * If we got as far as discovering our own database ID, we can report what + * we did to the collector. Otherwise, we'd be sending an invalid + * database ID, so forget it. (This means that accesses to pg_database + * during failed backend starts might never get counted.) + */ + if (OidIsValid(MyDatabaseId)) + pgstat_report_stat(true); +} + +/* ---------- + * pgstat_initialize() - + * + * Initialize pgstats state, and set up our on-proc-exit hook. + * Called from InitPostgres and AuxiliaryProcessMain. + * + * NOTE: MyDatabaseId isn't set yet; so the shutdown hook has to be careful. + * ---------- + */ +void +pgstat_initialize(void) +{ + /* + * Initialize prevWalUsage with pgWalUsage so that pgstat_send_wal() can + * calculate how much pgWalUsage counters are increased by substracting + * prevWalUsage from pgWalUsage. 
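+ *
+ * This is the usual snapshot-and-diff reporting of monotonically growing
+ * counters: keep a copy of the counters as of the last report and send only
+ * the difference.  A standalone sketch, illustrative only and not
+ * PostgreSQL code (Usage, current, previous and take_delta are invented
+ * names):
+ *
+ *    typedef struct { long long records, bytes; } Usage;
+ *
+ *    static Usage current;     // bumped continuously by this process
+ *    static Usage previous;    // value captured at the last report
+ *
+ *    static Usage
+ *    take_delta(void)
+ *    {
+ *        Usage delta;
+ *
+ *        delta.records = current.records - previous.records;
+ *        delta.bytes = current.bytes - previous.bytes;
+ *        previous = current;   // next report starts from here
+ *        return delta;
+ *    }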
+ */ + prevWalUsage = pgWalUsage; + + /* Set up a process-exit hook to clean up */ + on_shmem_exit(pgstat_shutdown_hook, 0); +} + +/* ------------------------------------------------------------ + * Local support functions follow + * ------------------------------------------------------------ + */ + + +/* ---------- + * pgstat_setheader() - + * + * Set common header fields in a statistics message + * ---------- + */ +static void +pgstat_setheader(PgStat_MsgHdr *hdr, StatMsgType mtype) +{ + hdr->m_type = mtype; +} + + +/* ---------- + * pgstat_send() - + * + * Send out one statistics message to the collector + * ---------- + */ +static void +pgstat_send(void *msg, int len) +{ + int rc; + + if (pgStatSock == PGINVALID_SOCKET) + return; + + ((PgStat_MsgHdr *) msg)->m_size = len; + + /* We'll retry after EINTR, but ignore all other failures */ + do + { + rc = send(pgStatSock, msg, len, 0); + } while (rc < 0 && errno == EINTR); + +#ifdef USE_ASSERT_CHECKING + /* In debug builds, log send failures ... */ + if (rc < 0) + elog(LOG, "could not send to statistics collector: %m"); +#endif +} + +/* ---------- + * pgstat_send_archiver() - + * + * Tell the collector about the WAL file that we successfully + * archived or failed to archive. + * ---------- + */ +void +pgstat_send_archiver(const char *xlog, bool failed) +{ + PgStat_MsgArchiver msg; + + /* + * Prepare and send the message + */ + pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ARCHIVER); + msg.m_failed = failed; + strlcpy(msg.m_xlog, xlog, sizeof(msg.m_xlog)); + msg.m_timestamp = GetCurrentTimestamp(); + pgstat_send(&msg, sizeof(msg)); +} + +/* ---------- + * pgstat_send_bgwriter() - + * + * Send bgwriter statistics to the collector + * ---------- + */ +void +pgstat_send_bgwriter(void) +{ + /* We assume this initializes to zeroes */ + static const PgStat_MsgBgWriter all_zeroes; + + /* + * This function can be called even if nothing at all has happened. In + * this case, avoid sending a completely empty message to the stats + * collector. + */ + if (memcmp(&BgWriterStats, &all_zeroes, sizeof(PgStat_MsgBgWriter)) == 0) + return; + + /* + * Prepare and send the message + */ + pgstat_setheader(&BgWriterStats.m_hdr, PGSTAT_MTYPE_BGWRITER); + pgstat_send(&BgWriterStats, sizeof(BgWriterStats)); + + /* + * Clear out the statistics buffer, so it can be re-used. + */ + MemSet(&BgWriterStats, 0, sizeof(BgWriterStats)); +} + +/* ---------- + * pgstat_send_wal() - + * + * Send WAL statistics to the collector. + * + * If 'force' is not set, WAL stats message is only sent if enough time has + * passed since last one was sent to reach PGSTAT_STAT_INTERVAL. + * ---------- + */ +void +pgstat_send_wal(bool force) +{ + static TimestampTz sendTime = 0; + + /* + * This function can be called even if nothing at all has happened. In + * this case, avoid sending a completely empty message to the stats + * collector. + * + * Check wal_records counter to determine whether any WAL activity has + * happened since last time. Note that other WalUsage counters don't need + * to be checked because they are incremented always together with + * wal_records counter. + * + * m_wal_buffers_full also doesn't need to be checked because it's + * incremented only when at least one WAL record is generated (i.e., + * wal_records counter is incremented). But for safely, we assert that + * m_wal_buffers_full is always zero when no WAL record is generated + * + * This function can be called by a process like walwriter that normally + * generates no WAL records. 
To determine whether any WAL activity has + * happened at that process since the last time, the numbers of WAL writes + * and syncs are also checked. + */ + if (pgWalUsage.wal_records == prevWalUsage.wal_records && + WalStats.m_wal_write == 0 && WalStats.m_wal_sync == 0) + { + Assert(WalStats.m_wal_buffers_full == 0); + return; + } + + if (!force) + { + TimestampTz now = GetCurrentTimestamp(); + + /* + * Don't send a message unless it's been at least PGSTAT_STAT_INTERVAL + * msec since we last sent one to avoid overloading the stats + * collector. + */ + if (!TimestampDifferenceExceeds(sendTime, now, PGSTAT_STAT_INTERVAL)) + return; + sendTime = now; + } + + /* + * Set the counters related to generated WAL data if the counters were + * updated. + */ + if (pgWalUsage.wal_records != prevWalUsage.wal_records) + { + WalUsage walusage; + + /* + * Calculate how much WAL usage counters were increased by + * substracting the previous counters from the current ones. Fill the + * results in WAL stats message. + */ + MemSet(&walusage, 0, sizeof(WalUsage)); + WalUsageAccumDiff(&walusage, &pgWalUsage, &prevWalUsage); + + WalStats.m_wal_records = walusage.wal_records; + WalStats.m_wal_fpi = walusage.wal_fpi; + WalStats.m_wal_bytes = walusage.wal_bytes; + + /* + * Save the current counters for the subsequent calculation of WAL + * usage. + */ + prevWalUsage = pgWalUsage; + } + + /* + * Prepare and send the message + */ + pgstat_setheader(&WalStats.m_hdr, PGSTAT_MTYPE_WAL); + pgstat_send(&WalStats, sizeof(WalStats)); + + /* + * Clear out the statistics buffer, so it can be re-used. + */ + MemSet(&WalStats, 0, sizeof(WalStats)); +} + +/* ---------- + * pgstat_send_slru() - + * + * Send SLRU statistics to the collector + * ---------- + */ +static void +pgstat_send_slru(void) +{ + /* We assume this initializes to zeroes */ + static const PgStat_MsgSLRU all_zeroes; + + for (int i = 0; i < SLRU_NUM_ELEMENTS; i++) + { + /* + * This function can be called even if nothing at all has happened. In + * this case, avoid sending a completely empty message to the stats + * collector. + */ + if (memcmp(&SLRUStats[i], &all_zeroes, sizeof(PgStat_MsgSLRU)) == 0) + continue; + + /* set the SLRU type before each send */ + SLRUStats[i].m_index = i; + + /* + * Prepare and send the message + */ + pgstat_setheader(&SLRUStats[i].m_hdr, PGSTAT_MTYPE_SLRU); + pgstat_send(&SLRUStats[i], sizeof(PgStat_MsgSLRU)); + + /* + * Clear out the statistics buffer, so it can be re-used. + */ + MemSet(&SLRUStats[i], 0, sizeof(PgStat_MsgSLRU)); + } +} + + +/* ---------- + * PgstatCollectorMain() - + * + * Start up the statistics collector process. This is the body of the + * postmaster child process. + * + * The argc/argv parameters are valid only in EXEC_BACKEND case. + * ---------- + */ +NON_EXEC_STATIC void +PgstatCollectorMain(int argc, char *argv[]) +{ + int len; + PgStat_Msg msg; + int wr; + WaitEvent event; + WaitEventSet *wes; + + /* + * Ignore all signals usually bound to some action in the postmaster, + * except SIGHUP and SIGQUIT. Note we don't need a SIGUSR1 handler to + * support latch operations, because we only use a local latch. 
+ */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGINT, SIG_IGN); + pqsignal(SIGTERM, SIG_IGN); + pqsignal(SIGQUIT, SignalHandlerForShutdownRequest); + pqsignal(SIGALRM, SIG_IGN); + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, SIG_IGN); + pqsignal(SIGUSR2, SIG_IGN); + /* Reset some signals that are accepted by postmaster but not here */ + pqsignal(SIGCHLD, SIG_DFL); + PG_SETMASK(&UnBlockSig); + + MyBackendType = B_STATS_COLLECTOR; + init_ps_display(NULL); + + /* + * Read in existing stats files or initialize the stats to zero. + */ + pgStatRunningInCollector = true; + pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true); + + /* Prepare to wait for our latch or data in our socket. */ + wes = CreateWaitEventSet(CurrentMemoryContext, 3); + AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); + AddWaitEventToSet(wes, WL_POSTMASTER_DEATH, PGINVALID_SOCKET, NULL, NULL); + AddWaitEventToSet(wes, WL_SOCKET_READABLE, pgStatSock, NULL, NULL); + + /* + * Loop to process messages until we get SIGQUIT or detect ungraceful + * death of our parent postmaster. + * + * For performance reasons, we don't want to do ResetLatch/WaitLatch after + * every message; instead, do that only after a recv() fails to obtain a + * message. (This effectively means that if backends are sending us stuff + * like mad, we won't notice postmaster death until things slack off a + * bit; which seems fine.) To do that, we have an inner loop that + * iterates as long as recv() succeeds. We do check ConfigReloadPending + * inside the inner loop, which means that such interrupts will get + * serviced but the latch won't get cleared until next time there is a + * break in the action. + */ + for (;;) + { + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + /* + * Quit if we get SIGQUIT from the postmaster. + */ + if (ShutdownRequestPending) + break; + + /* + * Inner loop iterates as long as we keep getting messages, or until + * ShutdownRequestPending becomes set. + */ + while (!ShutdownRequestPending) + { + /* + * Reload configuration if we got SIGHUP from the postmaster. + */ + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + /* + * Write the stats file(s) if a new request has arrived that is + * not satisfied by existing file(s). + */ + if (pgstat_write_statsfile_needed()) + pgstat_write_statsfiles(false, false); + + /* + * Try to receive and process a message. This will not block, + * since the socket is set to non-blocking mode. + * + * XXX On Windows, we have to force pgwin32_recv to cooperate, + * despite the previous use of pg_set_noblock() on the socket. + * This is extremely broken and should be fixed someday. + */ +#ifdef WIN32 + pgwin32_noblock = 1; +#endif + + len = recv(pgStatSock, (char *) &msg, + sizeof(PgStat_Msg), 0); + +#ifdef WIN32 + pgwin32_noblock = 0; +#endif + + if (len < 0) + { + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) + break; /* out of inner loop */ + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("could not read statistics message: %m"))); + } + + /* + * We ignore messages that are smaller than our common header + */ + if (len < sizeof(PgStat_MsgHdr)) + continue; + + /* + * The received length must match the length in the header + */ + if (msg.msg_hdr.m_size != len) + continue; + + /* + * O.K. - we accept this message. Process it. 
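+ *
+ * Together with the two length checks just above, the dispatch below boils
+ * down to: discard datagrams shorter than the common header or whose
+ * self-declared size disagrees with what recv() returned, then branch on
+ * the message type.  A standalone sketch, illustrative only and not
+ * PostgreSQL code (Hdr and classify are invented names):
+ *
+ *    #include <string.h>
+ *
+ *    typedef struct { int type; int size; } Hdr;
+ *
+ *    static int
+ *    classify(const char *buf, int len)
+ *    {
+ *        Hdr hdr;
+ *
+ *        if (len < (int) sizeof(Hdr))
+ *            return -1;            // shorter than the common header
+ *        memcpy(&hdr, buf, sizeof(Hdr));
+ *        if (hdr.size != len)
+ *            return -1;            // header length disagrees with datagram
+ *        return hdr.type;          // caller switches on this
+ *    }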
+ */ + switch (msg.msg_hdr.m_type) + { + case PGSTAT_MTYPE_DUMMY: + break; + + case PGSTAT_MTYPE_INQUIRY: + pgstat_recv_inquiry(&msg.msg_inquiry, len); + break; + + case PGSTAT_MTYPE_TABSTAT: + pgstat_recv_tabstat(&msg.msg_tabstat, len); + break; + + case PGSTAT_MTYPE_TABPURGE: + pgstat_recv_tabpurge(&msg.msg_tabpurge, len); + break; + + case PGSTAT_MTYPE_DROPDB: + pgstat_recv_dropdb(&msg.msg_dropdb, len); + break; + + case PGSTAT_MTYPE_RESETCOUNTER: + pgstat_recv_resetcounter(&msg.msg_resetcounter, len); + break; + + case PGSTAT_MTYPE_RESETSHAREDCOUNTER: + pgstat_recv_resetsharedcounter(&msg.msg_resetsharedcounter, + len); + break; + + case PGSTAT_MTYPE_RESETSINGLECOUNTER: + pgstat_recv_resetsinglecounter(&msg.msg_resetsinglecounter, + len); + break; + + case PGSTAT_MTYPE_RESETSLRUCOUNTER: + pgstat_recv_resetslrucounter(&msg.msg_resetslrucounter, + len); + break; + + case PGSTAT_MTYPE_RESETREPLSLOTCOUNTER: + pgstat_recv_resetreplslotcounter(&msg.msg_resetreplslotcounter, + len); + break; + + case PGSTAT_MTYPE_AUTOVAC_START: + pgstat_recv_autovac(&msg.msg_autovacuum_start, len); + break; + + case PGSTAT_MTYPE_VACUUM: + pgstat_recv_vacuum(&msg.msg_vacuum, len); + break; + + case PGSTAT_MTYPE_ANALYZE: + pgstat_recv_analyze(&msg.msg_analyze, len); + break; + + case PGSTAT_MTYPE_ARCHIVER: + pgstat_recv_archiver(&msg.msg_archiver, len); + break; + + case PGSTAT_MTYPE_BGWRITER: + pgstat_recv_bgwriter(&msg.msg_bgwriter, len); + break; + + case PGSTAT_MTYPE_WAL: + pgstat_recv_wal(&msg.msg_wal, len); + break; + + case PGSTAT_MTYPE_SLRU: + pgstat_recv_slru(&msg.msg_slru, len); + break; + + case PGSTAT_MTYPE_FUNCSTAT: + pgstat_recv_funcstat(&msg.msg_funcstat, len); + break; + + case PGSTAT_MTYPE_FUNCPURGE: + pgstat_recv_funcpurge(&msg.msg_funcpurge, len); + break; + + case PGSTAT_MTYPE_RECOVERYCONFLICT: + pgstat_recv_recoveryconflict(&msg.msg_recoveryconflict, + len); + break; + + case PGSTAT_MTYPE_DEADLOCK: + pgstat_recv_deadlock(&msg.msg_deadlock, len); + break; + + case PGSTAT_MTYPE_TEMPFILE: + pgstat_recv_tempfile(&msg.msg_tempfile, len); + break; + + case PGSTAT_MTYPE_CHECKSUMFAILURE: + pgstat_recv_checksum_failure(&msg.msg_checksumfailure, + len); + break; + + case PGSTAT_MTYPE_REPLSLOT: + pgstat_recv_replslot(&msg.msg_replslot, len); + break; + + case PGSTAT_MTYPE_CONNECT: + pgstat_recv_connect(&msg.msg_connect, len); + break; + + case PGSTAT_MTYPE_DISCONNECT: + pgstat_recv_disconnect(&msg.msg_disconnect, len); + break; + + default: + break; + } + } /* end of inner message-processing loop */ + + /* Sleep until there's something to do */ +#ifndef WIN32 + wr = WaitEventSetWait(wes, -1L, &event, 1, WAIT_EVENT_PGSTAT_MAIN); +#else + + /* + * Windows, at least in its Windows Server 2003 R2 incarnation, + * sometimes loses FD_READ events. Waking up and retrying the recv() + * fixes that, so don't sleep indefinitely. This is a crock of the + * first water, but until somebody wants to debug exactly what's + * happening there, this is the best we can do. The two-second + * timeout matches our pre-9.2 behavior, and needs to be short enough + * to not provoke "using stale statistics" complaints from + * backend_read_statsfile. + */ + wr = WaitEventSetWait(wes, 2 * 1000L /* msec */ , &event, 1, + WAIT_EVENT_PGSTAT_MAIN); +#endif + + /* + * Emergency bailout if postmaster has died. This is to avoid the + * necessity for manual cleanup of all postmaster children. 
+ */ + if (wr == 1 && event.events == WL_POSTMASTER_DEATH) + break; + } /* end of outer loop */ + + /* + * Save the final stats to reuse at next startup. + */ + pgstat_write_statsfiles(true, true); + + FreeWaitEventSet(wes); + + exit(0); +} + +/* + * Subroutine to clear stats in a database entry + * + * Tables and functions hashes are initialized to empty. + */ +static void +reset_dbentry_counters(PgStat_StatDBEntry *dbentry) +{ + HASHCTL hash_ctl; + + dbentry->n_xact_commit = 0; + dbentry->n_xact_rollback = 0; + dbentry->n_blocks_fetched = 0; + dbentry->n_blocks_hit = 0; + dbentry->n_tuples_returned = 0; + dbentry->n_tuples_fetched = 0; + dbentry->n_tuples_inserted = 0; + dbentry->n_tuples_updated = 0; + dbentry->n_tuples_deleted = 0; + dbentry->last_autovac_time = 0; + dbentry->n_conflict_tablespace = 0; + dbentry->n_conflict_lock = 0; + dbentry->n_conflict_snapshot = 0; + dbentry->n_conflict_bufferpin = 0; + dbentry->n_conflict_startup_deadlock = 0; + dbentry->n_temp_files = 0; + dbentry->n_temp_bytes = 0; + dbentry->n_deadlocks = 0; + dbentry->n_checksum_failures = 0; + dbentry->last_checksum_failure = 0; + dbentry->n_block_read_time = 0; + dbentry->n_block_write_time = 0; + dbentry->n_sessions = 0; + dbentry->total_session_time = 0; + dbentry->total_active_time = 0; + dbentry->total_idle_in_xact_time = 0; + dbentry->n_sessions_abandoned = 0; + dbentry->n_sessions_fatal = 0; + dbentry->n_sessions_killed = 0; + + dbentry->stat_reset_timestamp = GetCurrentTimestamp(); + dbentry->stats_timestamp = 0; + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(PgStat_StatTabEntry); + dbentry->tables = hash_create("Per-database table", + PGSTAT_TAB_HASH_SIZE, + &hash_ctl, + HASH_ELEM | HASH_BLOBS); + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry); + dbentry->functions = hash_create("Per-database function", + PGSTAT_FUNCTION_HASH_SIZE, + &hash_ctl, + HASH_ELEM | HASH_BLOBS); +} + +/* + * Lookup the hash table entry for the specified database. If no hash + * table entry exists, initialize it, if the create parameter is true. + * Else, return NULL. + */ +static PgStat_StatDBEntry * +pgstat_get_db_entry(Oid databaseid, bool create) +{ + PgStat_StatDBEntry *result; + bool found; + HASHACTION action = (create ? HASH_ENTER : HASH_FIND); + + /* Lookup or create the hash table entry for this database */ + result = (PgStat_StatDBEntry *) hash_search(pgStatDBHash, + &databaseid, + action, &found); + + if (!create && !found) + return NULL; + + /* + * If not found, initialize the new one. This creates empty hash tables + * for tables and functions, too. + */ + if (!found) + reset_dbentry_counters(result); + + return result; +} + + +/* + * Lookup the hash table entry for the specified table. If no hash + * table entry exists, initialize it, if the create parameter is true. + * Else, return NULL. + */ +static PgStat_StatTabEntry * +pgstat_get_tab_entry(PgStat_StatDBEntry *dbentry, Oid tableoid, bool create) +{ + PgStat_StatTabEntry *result; + bool found; + HASHACTION action = (create ? HASH_ENTER : HASH_FIND); + + /* Lookup or create the hash table entry for this table */ + result = (PgStat_StatTabEntry *) hash_search(dbentry->tables, + &tableoid, + action, &found); + + if (!create && !found) + return NULL; + + /* If not found, initialize the new one. 
*/ + if (!found) + { + result->numscans = 0; + result->tuples_returned = 0; + result->tuples_fetched = 0; + result->tuples_inserted = 0; + result->tuples_updated = 0; + result->tuples_deleted = 0; + result->tuples_hot_updated = 0; + result->n_live_tuples = 0; + result->n_dead_tuples = 0; + result->changes_since_analyze = 0; + result->inserts_since_vacuum = 0; + result->blocks_fetched = 0; + result->blocks_hit = 0; + result->vacuum_timestamp = 0; + result->vacuum_count = 0; + result->autovac_vacuum_timestamp = 0; + result->autovac_vacuum_count = 0; + result->analyze_timestamp = 0; + result->analyze_count = 0; + result->autovac_analyze_timestamp = 0; + result->autovac_analyze_count = 0; + } + + return result; +} + + +/* ---------- + * pgstat_write_statsfiles() - + * Write the global statistics file, as well as requested DB files. + * + * 'permanent' specifies writing to the permanent files not temporary ones. + * When true (happens only when the collector is shutting down), also remove + * the temporary files so that backends starting up under a new postmaster + * can't read old data before the new collector is ready. + * + * When 'allDbs' is false, only the requested databases (listed in + * pending_write_requests) will be written; otherwise, all databases + * will be written. + * ---------- + */ +static void +pgstat_write_statsfiles(bool permanent, bool allDbs) +{ + HASH_SEQ_STATUS hstat; + PgStat_StatDBEntry *dbentry; + FILE *fpout; + int32 format_id; + const char *tmpfile = permanent ? PGSTAT_STAT_PERMANENT_TMPFILE : pgstat_stat_tmpname; + const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; + int rc; + + elog(DEBUG2, "writing stats file \"%s\"", statfile); + + /* + * Open the statistics temp file to write out the current values. + */ + fpout = AllocateFile(tmpfile, PG_BINARY_W); + if (fpout == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open temporary statistics file \"%s\": %m", + tmpfile))); + return; + } + + /* + * Set the timestamp of the stats file. + */ + globalStats.stats_timestamp = GetCurrentTimestamp(); + + /* + * Write the file header --- currently just a format ID. + */ + format_id = PGSTAT_FILE_FORMAT_ID; + rc = fwrite(&format_id, sizeof(format_id), 1, fpout); + (void) rc; /* we'll check for error with ferror */ + + /* + * Write global stats struct + */ + rc = fwrite(&globalStats, sizeof(globalStats), 1, fpout); + (void) rc; /* we'll check for error with ferror */ + + /* + * Write archiver stats struct + */ + rc = fwrite(&archiverStats, sizeof(archiverStats), 1, fpout); + (void) rc; /* we'll check for error with ferror */ + + /* + * Write WAL stats struct + */ + rc = fwrite(&walStats, sizeof(walStats), 1, fpout); + (void) rc; /* we'll check for error with ferror */ + + /* + * Write SLRU stats struct + */ + rc = fwrite(slruStats, sizeof(slruStats), 1, fpout); + (void) rc; /* we'll check for error with ferror */ + + /* + * Walk through the database table. + */ + hash_seq_init(&hstat, pgStatDBHash); + while ((dbentry = (PgStat_StatDBEntry *) hash_seq_search(&hstat)) != NULL) + { + /* + * Write out the table and function stats for this DB into the + * appropriate per-DB stat file, if required. + */ + if (allDbs || pgstat_db_requested(dbentry->databaseid)) + { + /* Make DB's timestamp consistent with the global stats */ + dbentry->stats_timestamp = globalStats.stats_timestamp; + + pgstat_write_db_statsfile(dbentry, permanent); + } + + /* + * Write out the DB entry. 
We don't write the tables or functions + * pointers, since they're of no use to any other process. + */ + fputc('D', fpout); + rc = fwrite(dbentry, offsetof(PgStat_StatDBEntry, tables), 1, fpout); + (void) rc; /* we'll check for error with ferror */ + } + + /* + * Write replication slot stats struct + */ + if (replSlotStatHash) + { + PgStat_StatReplSlotEntry *slotent; + + hash_seq_init(&hstat, replSlotStatHash); + while ((slotent = (PgStat_StatReplSlotEntry *) hash_seq_search(&hstat)) != NULL) + { + fputc('R', fpout); + rc = fwrite(slotent, sizeof(PgStat_StatReplSlotEntry), 1, fpout); + (void) rc; /* we'll check for error with ferror */ + } + } + + /* + * No more output to be done. Close the temp file and replace the old + * pgstat.stat with it. The ferror() check replaces testing for error + * after each individual fputc or fwrite above. + */ + fputc('E', fpout); + + if (ferror(fpout)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write temporary statistics file \"%s\": %m", + tmpfile))); + FreeFile(fpout); + unlink(tmpfile); + } + else if (FreeFile(fpout) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not close temporary statistics file \"%s\": %m", + tmpfile))); + unlink(tmpfile); + } + else if (rename(tmpfile, statfile) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m", + tmpfile, statfile))); + unlink(tmpfile); + } + + if (permanent) + unlink(pgstat_stat_filename); + + /* + * Now throw away the list of requests. Note that requests sent after we + * started the write are still waiting on the network socket. + */ + list_free(pending_write_requests); + pending_write_requests = NIL; +} + +/* + * return the filename for a DB stat file; filename is the output buffer, + * of length len. + */ +static void +get_dbstat_filename(bool permanent, bool tempname, Oid databaseid, + char *filename, int len) +{ + int printed; + + /* NB -- pgstat_reset_remove_files knows about the pattern this uses */ + printed = snprintf(filename, len, "%s/db_%u.%s", + permanent ? PGSTAT_STAT_PERMANENT_DIRECTORY : + pgstat_stat_directory, + databaseid, + tempname ? "tmp" : "stat"); + if (printed >= len) + elog(ERROR, "overlength pgstat path"); +} + +/* ---------- + * pgstat_write_db_statsfile() - + * Write the stat file for a single database. + * + * If writing to the permanent file (happens when the collector is + * shutting down only), remove the temporary file so that backends + * starting up under a new postmaster can't read the old data before + * the new collector is ready. + * ---------- + */ +static void +pgstat_write_db_statsfile(PgStat_StatDBEntry *dbentry, bool permanent) +{ + HASH_SEQ_STATUS tstat; + HASH_SEQ_STATUS fstat; + PgStat_StatTabEntry *tabentry; + PgStat_StatFuncEntry *funcentry; + FILE *fpout; + int32 format_id; + Oid dbid = dbentry->databaseid; + int rc; + char tmpfile[MAXPGPATH]; + char statfile[MAXPGPATH]; + + get_dbstat_filename(permanent, true, dbid, tmpfile, MAXPGPATH); + get_dbstat_filename(permanent, false, dbid, statfile, MAXPGPATH); + + elog(DEBUG2, "writing stats file \"%s\"", statfile); + + /* + * Open the statistics temp file to write out the current values. + */ + fpout = AllocateFile(tmpfile, PG_BINARY_W); + if (fpout == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open temporary statistics file \"%s\": %m", + tmpfile))); + return; + } + + /* + * Write the file header --- currently just a format ID. 
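+ *
+ * The overall shape of each stats file is: a format-ID header, a sequence
+ * of single-letter-tagged fixed-size records, and a terminating 'E',
+ * written to a temp file that is renamed over the live file only if no
+ * write error was detected.  A condensed standalone sketch, illustrative
+ * only and not PostgreSQL code (EXAMPLE_FORMAT_ID, write_counts and the
+ * file-name parameters are invented, and only one record type is shown):
+ *
+ *    #include <stdio.h>
+ *
+ *    #define EXAMPLE_FORMAT_ID 0x01A5BCA7
+ *
+ *    static int
+ *    write_counts(const char *tmppath, const char *livepath,
+ *                 const long *counts, int n)
+ *    {
+ *        FILE *fp = fopen(tmppath, "wb");
+ *        int   id = EXAMPLE_FORMAT_ID;
+ *
+ *        if (fp == NULL)
+ *            return -1;
+ *        fwrite(&id, sizeof(id), 1, fp);
+ *        for (int i = 0; i < n; i++)
+ *        {
+ *            fputc('T', fp);
+ *            fwrite(&counts[i], sizeof(counts[i]), 1, fp);
+ *        }
+ *        fputc('E', fp);
+ *        if (ferror(fp))           // one check replaces per-write checks
+ *        {
+ *            fclose(fp);
+ *            remove(tmppath);
+ *            return -1;
+ *        }
+ *        if (fclose(fp) != 0 || rename(tmppath, livepath) != 0)
+ *        {
+ *            remove(tmppath);
+ *            return -1;
+ *        }
+ *        return 0;                 // readers only ever see a complete file
+ *    }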
+ */ + format_id = PGSTAT_FILE_FORMAT_ID; + rc = fwrite(&format_id, sizeof(format_id), 1, fpout); + (void) rc; /* we'll check for error with ferror */ + + /* + * Walk through the database's access stats per table. + */ + hash_seq_init(&tstat, dbentry->tables); + while ((tabentry = (PgStat_StatTabEntry *) hash_seq_search(&tstat)) != NULL) + { + fputc('T', fpout); + rc = fwrite(tabentry, sizeof(PgStat_StatTabEntry), 1, fpout); + (void) rc; /* we'll check for error with ferror */ + } + + /* + * Walk through the database's function stats table. + */ + hash_seq_init(&fstat, dbentry->functions); + while ((funcentry = (PgStat_StatFuncEntry *) hash_seq_search(&fstat)) != NULL) + { + fputc('F', fpout); + rc = fwrite(funcentry, sizeof(PgStat_StatFuncEntry), 1, fpout); + (void) rc; /* we'll check for error with ferror */ + } + + /* + * No more output to be done. Close the temp file and replace the old + * pgstat.stat with it. The ferror() check replaces testing for error + * after each individual fputc or fwrite above. + */ + fputc('E', fpout); + + if (ferror(fpout)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write temporary statistics file \"%s\": %m", + tmpfile))); + FreeFile(fpout); + unlink(tmpfile); + } + else if (FreeFile(fpout) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not close temporary statistics file \"%s\": %m", + tmpfile))); + unlink(tmpfile); + } + else if (rename(tmpfile, statfile) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m", + tmpfile, statfile))); + unlink(tmpfile); + } + + if (permanent) + { + get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH); + + elog(DEBUG2, "removing temporary stats file \"%s\"", statfile); + unlink(statfile); + } +} + +/* ---------- + * pgstat_read_statsfiles() - + * + * Reads in some existing statistics collector files and returns the + * databases hash table that is the top level of the data. + * + * If 'onlydb' is not InvalidOid, it means we only want data for that DB + * plus the shared catalogs ("DB 0"). We'll still populate the DB hash + * table for all databases, but we don't bother even creating table/function + * hash tables for other databases. + * + * 'permanent' specifies reading from the permanent files not temporary ones. + * When true (happens only when the collector is starting up), remove the + * files after reading; the in-memory status is now authoritative, and the + * files would be out of date in case somebody else reads them. + * + * If a 'deep' read is requested, table/function stats are read, otherwise + * the table/function hash tables remain empty. + * ---------- + */ +static HTAB * +pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep) +{ + PgStat_StatDBEntry *dbentry; + PgStat_StatDBEntry dbbuf; + HASHCTL hash_ctl; + HTAB *dbhash; + FILE *fpin; + int32 format_id; + bool found; + const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; + int i; + + /* + * The tables will live in pgStatLocalContext. + */ + pgstat_setup_memcxt(); + + /* + * Create the DB hashtable + */ + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(PgStat_StatDBEntry); + hash_ctl.hcxt = pgStatLocalContext; + dbhash = hash_create("Databases hash", PGSTAT_DB_HASH_SIZE, &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* + * Clear out global, archiver, WAL and SLRU statistics so they start from + * zero in case we can't load an existing statsfile. 
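+ *
+ * Zeroing first and only then attempting the reads means a missing or
+ * truncated file degrades to "all counters are zero" rather than to
+ * garbage.  A minimal standalone sketch of that idiom, illustrative only
+ * and not PostgreSQL code (load_or_zero is an invented name):
+ *
+ *    #include <stdio.h>
+ *    #include <string.h>
+ *
+ *    static int
+ *    load_or_zero(void *dst, size_t sz, FILE *fp)
+ *    {
+ *        if (fp == NULL || fread(dst, 1, sz, fp) != sz)
+ *        {
+ *            memset(dst, 0, sz);   // short read: fall back to zeroes
+ *            return 0;
+ *        }
+ *        return 1;                 // struct fully populated from the file
+ *    }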
+ */ + memset(&globalStats, 0, sizeof(globalStats)); + memset(&archiverStats, 0, sizeof(archiverStats)); + memset(&walStats, 0, sizeof(walStats)); + memset(&slruStats, 0, sizeof(slruStats)); + + /* + * Set the current timestamp (will be kept only in case we can't load an + * existing statsfile). + */ + globalStats.stat_reset_timestamp = GetCurrentTimestamp(); + archiverStats.stat_reset_timestamp = globalStats.stat_reset_timestamp; + walStats.stat_reset_timestamp = globalStats.stat_reset_timestamp; + + /* + * Set the same reset timestamp for all SLRU items too. + */ + for (i = 0; i < SLRU_NUM_ELEMENTS; i++) + slruStats[i].stat_reset_timestamp = globalStats.stat_reset_timestamp; + + /* + * Try to open the stats file. If it doesn't exist, the backends simply + * return zero for anything and the collector simply starts from scratch + * with empty counters. + * + * ENOENT is a possibility if the stats collector is not running or has + * not yet written the stats file the first time. Any other failure + * condition is suspicious. + */ + if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL) + { + if (errno != ENOENT) + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errcode_for_file_access(), + errmsg("could not open statistics file \"%s\": %m", + statfile))); + return dbhash; + } + + /* + * Verify it's of the expected format. + */ + if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) || + format_id != PGSTAT_FILE_FORMAT_ID) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + goto done; + } + + /* + * Read global stats struct + */ + if (fread(&globalStats, 1, sizeof(globalStats), fpin) != sizeof(globalStats)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + memset(&globalStats, 0, sizeof(globalStats)); + goto done; + } + + /* + * In the collector, disregard the timestamp we read from the permanent + * stats file; we should be willing to write a temp stats file immediately + * upon the first request from any backend. This only matters if the old + * file's timestamp is less than PGSTAT_STAT_INTERVAL ago, but that's not + * an unusual scenario. + */ + if (pgStatRunningInCollector) + globalStats.stats_timestamp = 0; + + /* + * Read archiver stats struct + */ + if (fread(&archiverStats, 1, sizeof(archiverStats), fpin) != sizeof(archiverStats)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + memset(&archiverStats, 0, sizeof(archiverStats)); + goto done; + } + + /* + * Read WAL stats struct + */ + if (fread(&walStats, 1, sizeof(walStats), fpin) != sizeof(walStats)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + memset(&walStats, 0, sizeof(walStats)); + goto done; + } + + /* + * Read SLRU stats struct + */ + if (fread(slruStats, 1, sizeof(slruStats), fpin) != sizeof(slruStats)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + memset(&slruStats, 0, sizeof(slruStats)); + goto done; + } + + /* + * We found an existing collector stats file. Read it and put all the + * hashtable entries into place. + */ + for (;;) + { + switch (fgetc(fpin)) + { + /* + * 'D' A PgStat_StatDBEntry struct describing a database + * follows. 
+ */ + case 'D': + if (fread(&dbbuf, 1, offsetof(PgStat_StatDBEntry, tables), + fpin) != offsetof(PgStat_StatDBEntry, tables)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", + statfile))); + goto done; + } + + /* + * Add to the DB hash + */ + dbentry = (PgStat_StatDBEntry *) hash_search(dbhash, + (void *) &dbbuf.databaseid, + HASH_ENTER, + &found); + if (found) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", + statfile))); + goto done; + } + + memcpy(dbentry, &dbbuf, sizeof(PgStat_StatDBEntry)); + dbentry->tables = NULL; + dbentry->functions = NULL; + + /* + * In the collector, disregard the timestamp we read from the + * permanent stats file; we should be willing to write a temp + * stats file immediately upon the first request from any + * backend. + */ + if (pgStatRunningInCollector) + dbentry->stats_timestamp = 0; + + /* + * Don't create tables/functions hashtables for uninteresting + * databases. + */ + if (onlydb != InvalidOid) + { + if (dbbuf.databaseid != onlydb && + dbbuf.databaseid != InvalidOid) + break; + } + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(PgStat_StatTabEntry); + hash_ctl.hcxt = pgStatLocalContext; + dbentry->tables = hash_create("Per-database table", + PGSTAT_TAB_HASH_SIZE, + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(PgStat_StatFuncEntry); + hash_ctl.hcxt = pgStatLocalContext; + dbentry->functions = hash_create("Per-database function", + PGSTAT_FUNCTION_HASH_SIZE, + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* + * If requested, read the data from the database-specific + * file. Otherwise we just leave the hashtables empty. + */ + if (deep) + pgstat_read_db_statsfile(dbentry->databaseid, + dbentry->tables, + dbentry->functions, + permanent); + + break; + + /* + * 'R' A PgStat_StatReplSlotEntry struct describing a + * replication slot follows. + */ + case 'R': + { + PgStat_StatReplSlotEntry slotbuf; + PgStat_StatReplSlotEntry *slotent; + + if (fread(&slotbuf, 1, sizeof(PgStat_StatReplSlotEntry), fpin) + != sizeof(PgStat_StatReplSlotEntry)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", + statfile))); + goto done; + } + + /* Create hash table if we don't have it already. */ + if (replSlotStatHash == NULL) + { + HASHCTL hash_ctl; + + hash_ctl.keysize = sizeof(NameData); + hash_ctl.entrysize = sizeof(PgStat_StatReplSlotEntry); + hash_ctl.hcxt = pgStatLocalContext; + replSlotStatHash = hash_create("Replication slots hash", + PGSTAT_REPLSLOT_HASH_SIZE, + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + } + + slotent = (PgStat_StatReplSlotEntry *) hash_search(replSlotStatHash, + (void *) &slotbuf.slotname, + HASH_ENTER, NULL); + memcpy(slotent, &slotbuf, sizeof(PgStat_StatReplSlotEntry)); + break; + } + + case 'E': + goto done; + + default: + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", + statfile))); + goto done; + } + } + +done: + FreeFile(fpin); + + /* If requested to read the permanent file, also get rid of it. */ + if (permanent) + { + elog(DEBUG2, "removing permanent stats file \"%s\"", statfile); + unlink(statfile); + } + + return dbhash; +} + + +/* ---------- + * pgstat_read_db_statsfile() - + * + * Reads in the existing statistics collector file for the given database, + * filling the passed-in tables and functions hash tables. 
+ * + * As in pgstat_read_statsfiles, if the permanent file is requested, it is + * removed after reading. + * + * Note: this code has the ability to skip storing per-table or per-function + * data, if NULL is passed for the corresponding hashtable. That's not used + * at the moment though. + * ---------- + */ +static void +pgstat_read_db_statsfile(Oid databaseid, HTAB *tabhash, HTAB *funchash, + bool permanent) +{ + PgStat_StatTabEntry *tabentry; + PgStat_StatTabEntry tabbuf; + PgStat_StatFuncEntry funcbuf; + PgStat_StatFuncEntry *funcentry; + FILE *fpin; + int32 format_id; + bool found; + char statfile[MAXPGPATH]; + + get_dbstat_filename(permanent, false, databaseid, statfile, MAXPGPATH); + + /* + * Try to open the stats file. If it doesn't exist, the backends simply + * return zero for anything and the collector simply starts from scratch + * with empty counters. + * + * ENOENT is a possibility if the stats collector is not running or has + * not yet written the stats file the first time. Any other failure + * condition is suspicious. + */ + if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL) + { + if (errno != ENOENT) + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errcode_for_file_access(), + errmsg("could not open statistics file \"%s\": %m", + statfile))); + return; + } + + /* + * Verify it's of the expected format. + */ + if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) || + format_id != PGSTAT_FILE_FORMAT_ID) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + goto done; + } + + /* + * We found an existing collector stats file. Read it and put all the + * hashtable entries into place. + */ + for (;;) + { + switch (fgetc(fpin)) + { + /* + * 'T' A PgStat_StatTabEntry follows. + */ + case 'T': + if (fread(&tabbuf, 1, sizeof(PgStat_StatTabEntry), + fpin) != sizeof(PgStat_StatTabEntry)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", + statfile))); + goto done; + } + + /* + * Skip if table data not wanted. + */ + if (tabhash == NULL) + break; + + tabentry = (PgStat_StatTabEntry *) hash_search(tabhash, + (void *) &tabbuf.tableid, + HASH_ENTER, &found); + + if (found) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", + statfile))); + goto done; + } + + memcpy(tabentry, &tabbuf, sizeof(tabbuf)); + break; + + /* + * 'F' A PgStat_StatFuncEntry follows. + */ + case 'F': + if (fread(&funcbuf, 1, sizeof(PgStat_StatFuncEntry), + fpin) != sizeof(PgStat_StatFuncEntry)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", + statfile))); + goto done; + } + + /* + * Skip if function data not wanted. + */ + if (funchash == NULL) + break; + + funcentry = (PgStat_StatFuncEntry *) hash_search(funchash, + (void *) &funcbuf.functionid, + HASH_ENTER, &found); + + if (found) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", + statfile))); + goto done; + } + + memcpy(funcentry, &funcbuf, sizeof(funcbuf)); + break; + + /* + * 'E' The EOF marker of a complete stats file. + */ + case 'E': + goto done; + + default: + ereport(pgStatRunningInCollector ? 
LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", + statfile))); + goto done; + } + } + +done: + FreeFile(fpin); + + if (permanent) + { + elog(DEBUG2, "removing permanent stats file \"%s\"", statfile); + unlink(statfile); + } +} + +/* ---------- + * pgstat_read_db_statsfile_timestamp() - + * + * Attempt to determine the timestamp of the last db statfile write. + * Returns true if successful; the timestamp is stored in *ts. The caller must + * rely on timestamp stored in *ts iff the function returns true. + * + * This needs to be careful about handling databases for which no stats file + * exists, such as databases without a stat entry or those not yet written: + * + * - if there's a database entry in the global file, return the corresponding + * stats_timestamp value. + * + * - if there's no db stat entry (e.g. for a new or inactive database), + * there's no stats_timestamp value, but also nothing to write so we return + * the timestamp of the global statfile. + * ---------- + */ +static bool +pgstat_read_db_statsfile_timestamp(Oid databaseid, bool permanent, + TimestampTz *ts) +{ + PgStat_StatDBEntry dbentry; + PgStat_GlobalStats myGlobalStats; + PgStat_ArchiverStats myArchiverStats; + PgStat_WalStats myWalStats; + PgStat_SLRUStats mySLRUStats[SLRU_NUM_ELEMENTS]; + PgStat_StatReplSlotEntry myReplSlotStats; + FILE *fpin; + int32 format_id; + const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename; + + /* + * Try to open the stats file. As above, anything but ENOENT is worthy of + * complaining about. + */ + if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL) + { + if (errno != ENOENT) + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errcode_for_file_access(), + errmsg("could not open statistics file \"%s\": %m", + statfile))); + return false; + } + + /* + * Verify it's of the expected format. + */ + if (fread(&format_id, 1, sizeof(format_id), fpin) != sizeof(format_id) || + format_id != PGSTAT_FILE_FORMAT_ID) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + FreeFile(fpin); + return false; + } + + /* + * Read global stats struct + */ + if (fread(&myGlobalStats, 1, sizeof(myGlobalStats), + fpin) != sizeof(myGlobalStats)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + FreeFile(fpin); + return false; + } + + /* + * Read archiver stats struct + */ + if (fread(&myArchiverStats, 1, sizeof(myArchiverStats), + fpin) != sizeof(myArchiverStats)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + FreeFile(fpin); + return false; + } + + /* + * Read WAL stats struct + */ + if (fread(&myWalStats, 1, sizeof(myWalStats), fpin) != sizeof(myWalStats)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + FreeFile(fpin); + return false; + } + + /* + * Read SLRU stats struct + */ + if (fread(mySLRUStats, 1, sizeof(mySLRUStats), fpin) != sizeof(mySLRUStats)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", statfile))); + FreeFile(fpin); + return false; + } + + /* By default, we're going to return the timestamp of the global file. */ + *ts = myGlobalStats.stats_timestamp; + + /* + * We found an existing collector stats file. Read it and look for a + * record for the requested database. If found, use its timestamp. 
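+ * (Only 'D' records are interpreted; 'R' replication slot records are
+ * read and discarded just to keep the stream in sync, and 'E' ends the
+ * scan.)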
+ */ + for (;;) + { + switch (fgetc(fpin)) + { + /* + * 'D' A PgStat_StatDBEntry struct describing a database + * follows. + */ + case 'D': + if (fread(&dbentry, 1, offsetof(PgStat_StatDBEntry, tables), + fpin) != offsetof(PgStat_StatDBEntry, tables)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", + statfile))); + FreeFile(fpin); + return false; + } + + /* + * If this is the DB we're looking for, save its timestamp and + * we're done. + */ + if (dbentry.databaseid == databaseid) + { + *ts = dbentry.stats_timestamp; + goto done; + } + + break; + + /* + * 'R' A PgStat_StatReplSlotEntry struct describing a + * replication slot follows. + */ + case 'R': + if (fread(&myReplSlotStats, 1, sizeof(PgStat_StatReplSlotEntry), fpin) + != sizeof(PgStat_StatReplSlotEntry)) + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", + statfile))); + FreeFile(fpin); + return false; + } + break; + + case 'E': + goto done; + + default: + { + ereport(pgStatRunningInCollector ? LOG : WARNING, + (errmsg("corrupted statistics file \"%s\"", + statfile))); + FreeFile(fpin); + return false; + } + } + } + +done: + FreeFile(fpin); + return true; +} + +/* + * If not already done, read the statistics collector stats file into + * some hash tables. The results will be kept until pgstat_clear_snapshot() + * is called (typically, at end of transaction). + */ +static void +backend_read_statsfile(void) +{ + TimestampTz min_ts = 0; + TimestampTz ref_ts = 0; + Oid inquiry_db; + int count; + + /* already read it? */ + if (pgStatDBHash) + return; + Assert(!pgStatRunningInCollector); + + /* + * In a normal backend, we check staleness of the data for our own DB, and + * so we send MyDatabaseId in inquiry messages. In the autovac launcher, + * check staleness of the shared-catalog data, and send InvalidOid in + * inquiry messages so as not to force writing unnecessary data. + */ + if (IsAutoVacuumLauncherProcess()) + inquiry_db = InvalidOid; + else + inquiry_db = MyDatabaseId; + + /* + * Loop until fresh enough stats file is available or we ran out of time. + * The stats inquiry message is sent repeatedly in case collector drops + * it; but not every single time, as that just swamps the collector. + */ + for (count = 0; count < PGSTAT_POLL_LOOP_COUNT; count++) + { + bool ok; + TimestampTz file_ts = 0; + TimestampTz cur_ts; + + CHECK_FOR_INTERRUPTS(); + + ok = pgstat_read_db_statsfile_timestamp(inquiry_db, false, &file_ts); + + cur_ts = GetCurrentTimestamp(); + /* Calculate min acceptable timestamp, if we didn't already */ + if (count == 0 || cur_ts < ref_ts) + { + /* + * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL + * msec before now. This indirectly ensures that the collector + * needn't write the file more often than PGSTAT_STAT_INTERVAL. In + * an autovacuum worker, however, we want a lower delay to avoid + * using stale data, so we use PGSTAT_RETRY_DELAY (since the + * number of workers is low, this shouldn't be a problem). + * + * We don't recompute min_ts after sleeping, except in the + * unlikely case that cur_ts went backwards. So we might end up + * accepting a file a bit older than PGSTAT_STAT_INTERVAL. In + * practice that shouldn't happen, though, as long as the sleep + * time is less than PGSTAT_STAT_INTERVAL; and we don't want to + * tell the collector that our cutoff time is less than what we'd + * actually accept. 
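+ *
+ * As a concrete illustration, assuming the usual defaults of 500 msec for
+ * PGSTAT_STAT_INTERVAL and 10 msec for PGSTAT_RETRY_DELAY: a regular
+ * backend asking at time T accepts a file stamped T - 500 msec or later,
+ * while an autovacuum worker insists on T - 10 msec or later.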
+ */ + ref_ts = cur_ts; + if (IsAutoVacuumWorkerProcess()) + min_ts = TimestampTzPlusMilliseconds(ref_ts, + -PGSTAT_RETRY_DELAY); + else + min_ts = TimestampTzPlusMilliseconds(ref_ts, + -PGSTAT_STAT_INTERVAL); + } + + /* + * If the file timestamp is actually newer than cur_ts, we must have + * had a clock glitch (system time went backwards) or there is clock + * skew between our processor and the stats collector's processor. + * Accept the file, but send an inquiry message anyway to make + * pgstat_recv_inquiry do a sanity check on the collector's time. + */ + if (ok && file_ts > cur_ts) + { + /* + * A small amount of clock skew between processors isn't terribly + * surprising, but a large difference is worth logging. We + * arbitrarily define "large" as 1000 msec. + */ + if (file_ts >= TimestampTzPlusMilliseconds(cur_ts, 1000)) + { + char *filetime; + char *mytime; + + /* Copy because timestamptz_to_str returns a static buffer */ + filetime = pstrdup(timestamptz_to_str(file_ts)); + mytime = pstrdup(timestamptz_to_str(cur_ts)); + ereport(LOG, + (errmsg("statistics collector's time %s is later than backend local time %s", + filetime, mytime))); + pfree(filetime); + pfree(mytime); + } + + pgstat_send_inquiry(cur_ts, min_ts, inquiry_db); + break; + } + + /* Normal acceptance case: file is not older than cutoff time */ + if (ok && file_ts >= min_ts) + break; + + /* Not there or too old, so kick the collector and wait a bit */ + if ((count % PGSTAT_INQ_LOOP_COUNT) == 0) + pgstat_send_inquiry(cur_ts, min_ts, inquiry_db); + + pg_usleep(PGSTAT_RETRY_DELAY * 1000L); + } + + if (count >= PGSTAT_POLL_LOOP_COUNT) + ereport(LOG, + (errmsg("using stale statistics instead of current ones " + "because stats collector is not responding"))); + + /* + * Autovacuum launcher wants stats about all databases, but a shallow read + * is sufficient. Regular backends want a deep read for just the tables + * they can see (MyDatabaseId + shared catalogs). + */ + if (IsAutoVacuumLauncherProcess()) + pgStatDBHash = pgstat_read_statsfiles(InvalidOid, false, false); + else + pgStatDBHash = pgstat_read_statsfiles(MyDatabaseId, false, true); +} + + +/* ---------- + * pgstat_setup_memcxt() - + * + * Create pgStatLocalContext, if not already done. + * ---------- + */ +static void +pgstat_setup_memcxt(void) +{ + if (!pgStatLocalContext) + pgStatLocalContext = AllocSetContextCreate(TopMemoryContext, + "Statistics snapshot", + ALLOCSET_SMALL_SIZES); +} + + +/* ---------- + * pgstat_clear_snapshot() - + * + * Discard any data collected in the current transaction. Any subsequent + * request will cause new snapshots to be read. + * + * This is also invoked during transaction commit or abort to discard + * the no-longer-wanted snapshot. + * ---------- + */ +void +pgstat_clear_snapshot(void) +{ + /* Release memory, if any was allocated */ + if (pgStatLocalContext) + MemoryContextDelete(pgStatLocalContext); + + /* Reset variables */ + pgStatLocalContext = NULL; + pgStatDBHash = NULL; + replSlotStatHash = NULL; + + /* + * Historically the backend_status.c facilities lived in this file, and + * were reset with the same function. For now keep it that way, and + * forward the reset request. + */ + pgstat_clear_backend_activity_snapshot(); +} + + +/* ---------- + * pgstat_recv_inquiry() - + * + * Process stat inquiry requests. 
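+ *
+ * An inquiry asks the collector to write a fresh stats file for
+ * msg->databaseid if what is on disk is older than the sender's
+ * cutoff_time; clock_time carries the sender's notion of "now" and is
+ * used below to sanity-check for clock skew.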
+ * ---------- + */ +static void +pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + + elog(DEBUG2, "received inquiry for database %u", msg->databaseid); + + /* + * If there's already a write request for this DB, there's nothing to do. + * + * Note that if a request is found, we return early and skip the below + * check for clock skew. This is okay, since the only way for a DB + * request to be present in the list is that we have been here since the + * last write round. It seems sufficient to check for clock skew once per + * write round. + */ + if (list_member_oid(pending_write_requests, msg->databaseid)) + return; + + /* + * Check to see if we last wrote this database at a time >= the requested + * cutoff time. If so, this is a stale request that was generated before + * we updated the DB file, and we don't need to do so again. + * + * If the requestor's local clock time is older than stats_timestamp, we + * should suspect a clock glitch, ie system time going backwards; though + * the more likely explanation is just delayed message receipt. It is + * worth expending a GetCurrentTimestamp call to be sure, since a large + * retreat in the system clock reading could otherwise cause us to neglect + * to update the stats file for a long time. + */ + dbentry = pgstat_get_db_entry(msg->databaseid, false); + if (dbentry == NULL) + { + /* + * We have no data for this DB. Enter a write request anyway so that + * the global stats will get updated. This is needed to prevent + * backend_read_statsfile from waiting for data that we cannot supply, + * in the case of a new DB that nobody has yet reported any stats for. + * See the behavior of pgstat_read_db_statsfile_timestamp. + */ + } + else if (msg->clock_time < dbentry->stats_timestamp) + { + TimestampTz cur_ts = GetCurrentTimestamp(); + + if (cur_ts < dbentry->stats_timestamp) + { + /* + * Sure enough, time went backwards. Force a new stats file write + * to get back in sync; but first, log a complaint. + */ + char *writetime; + char *mytime; + + /* Copy because timestamptz_to_str returns a static buffer */ + writetime = pstrdup(timestamptz_to_str(dbentry->stats_timestamp)); + mytime = pstrdup(timestamptz_to_str(cur_ts)); + ereport(LOG, + (errmsg("stats_timestamp %s is later than collector's time %s for database %u", + writetime, mytime, dbentry->databaseid))); + pfree(writetime); + pfree(mytime); + } + else + { + /* + * Nope, it's just an old request. Assuming msg's clock_time is + * >= its cutoff_time, it must be stale, so we can ignore it. + */ + return; + } + } + else if (msg->cutoff_time <= dbentry->stats_timestamp) + { + /* Stale request, ignore it */ + return; + } + + /* + * We need to write this DB, so create a request. + */ + pending_write_requests = lappend_oid(pending_write_requests, + msg->databaseid); +} + + +/* ---------- + * pgstat_recv_tabstat() - + * + * Count what the backend has done. + * ---------- + */ +static void +pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + PgStat_StatTabEntry *tabentry; + int i; + bool found; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + + /* + * Update database-wide stats. 
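+ * (A tabstat message carries the sender's transaction and timing counters
+ * plus a batch of per-table entries; both parts are folded in below.)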
+ */ + dbentry->n_xact_commit += (PgStat_Counter) (msg->m_xact_commit); + dbentry->n_xact_rollback += (PgStat_Counter) (msg->m_xact_rollback); + dbentry->n_block_read_time += msg->m_block_read_time; + dbentry->n_block_write_time += msg->m_block_write_time; + + dbentry->total_session_time += msg->m_session_time; + dbentry->total_active_time += msg->m_active_time; + dbentry->total_idle_in_xact_time += msg->m_idle_in_xact_time; + + /* + * Process all table entries in the message. + */ + for (i = 0; i < msg->m_nentries; i++) + { + PgStat_TableEntry *tabmsg = &(msg->m_entry[i]); + + tabentry = (PgStat_StatTabEntry *) hash_search(dbentry->tables, + (void *) &(tabmsg->t_id), + HASH_ENTER, &found); + + if (!found) + { + /* + * If it's a new table entry, initialize counters to the values we + * just got. + */ + tabentry->numscans = tabmsg->t_counts.t_numscans; + tabentry->tuples_returned = tabmsg->t_counts.t_tuples_returned; + tabentry->tuples_fetched = tabmsg->t_counts.t_tuples_fetched; + tabentry->tuples_inserted = tabmsg->t_counts.t_tuples_inserted; + tabentry->tuples_updated = tabmsg->t_counts.t_tuples_updated; + tabentry->tuples_deleted = tabmsg->t_counts.t_tuples_deleted; + tabentry->tuples_hot_updated = tabmsg->t_counts.t_tuples_hot_updated; + tabentry->n_live_tuples = tabmsg->t_counts.t_delta_live_tuples; + tabentry->n_dead_tuples = tabmsg->t_counts.t_delta_dead_tuples; + tabentry->changes_since_analyze = tabmsg->t_counts.t_changed_tuples; + tabentry->inserts_since_vacuum = tabmsg->t_counts.t_tuples_inserted; + tabentry->blocks_fetched = tabmsg->t_counts.t_blocks_fetched; + tabentry->blocks_hit = tabmsg->t_counts.t_blocks_hit; + + tabentry->vacuum_timestamp = 0; + tabentry->vacuum_count = 0; + tabentry->autovac_vacuum_timestamp = 0; + tabentry->autovac_vacuum_count = 0; + tabentry->analyze_timestamp = 0; + tabentry->analyze_count = 0; + tabentry->autovac_analyze_timestamp = 0; + tabentry->autovac_analyze_count = 0; + } + else + { + /* + * Otherwise add the values to the existing entry. + */ + tabentry->numscans += tabmsg->t_counts.t_numscans; + tabentry->tuples_returned += tabmsg->t_counts.t_tuples_returned; + tabentry->tuples_fetched += tabmsg->t_counts.t_tuples_fetched; + tabentry->tuples_inserted += tabmsg->t_counts.t_tuples_inserted; + tabentry->tuples_updated += tabmsg->t_counts.t_tuples_updated; + tabentry->tuples_deleted += tabmsg->t_counts.t_tuples_deleted; + tabentry->tuples_hot_updated += tabmsg->t_counts.t_tuples_hot_updated; + /* If table was truncated, first reset the live/dead counters */ + if (tabmsg->t_counts.t_truncated) + { + tabentry->n_live_tuples = 0; + tabentry->n_dead_tuples = 0; + tabentry->inserts_since_vacuum = 0; + } + tabentry->n_live_tuples += tabmsg->t_counts.t_delta_live_tuples; + tabentry->n_dead_tuples += tabmsg->t_counts.t_delta_dead_tuples; + tabentry->changes_since_analyze += tabmsg->t_counts.t_changed_tuples; + tabentry->inserts_since_vacuum += tabmsg->t_counts.t_tuples_inserted; + tabentry->blocks_fetched += tabmsg->t_counts.t_blocks_fetched; + tabentry->blocks_hit += tabmsg->t_counts.t_blocks_hit; + } + + /* Clamp n_live_tuples in case of negative delta_live_tuples */ + tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0); + /* Likewise for n_dead_tuples */ + tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0); + + /* + * Add per-table stats to the per-database entry, too. 
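+ * (These database-wide totals are what views such as pg_stat_database
+ * report.)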
+ */ + dbentry->n_tuples_returned += tabmsg->t_counts.t_tuples_returned; + dbentry->n_tuples_fetched += tabmsg->t_counts.t_tuples_fetched; + dbentry->n_tuples_inserted += tabmsg->t_counts.t_tuples_inserted; + dbentry->n_tuples_updated += tabmsg->t_counts.t_tuples_updated; + dbentry->n_tuples_deleted += tabmsg->t_counts.t_tuples_deleted; + dbentry->n_blocks_fetched += tabmsg->t_counts.t_blocks_fetched; + dbentry->n_blocks_hit += tabmsg->t_counts.t_blocks_hit; + } +} + + +/* ---------- + * pgstat_recv_tabpurge() - + * + * Arrange for dead table removal. + * ---------- + */ +static void +pgstat_recv_tabpurge(PgStat_MsgTabpurge *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + int i; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, false); + + /* + * No need to purge if we don't even know the database. + */ + if (!dbentry || !dbentry->tables) + return; + + /* + * Process all table entries in the message. + */ + for (i = 0; i < msg->m_nentries; i++) + { + /* Remove from hashtable if present; we don't care if it's not. */ + (void) hash_search(dbentry->tables, + (void *) &(msg->m_tableid[i]), + HASH_REMOVE, NULL); + } +} + + +/* ---------- + * pgstat_recv_dropdb() - + * + * Arrange for dead database removal + * ---------- + */ +static void +pgstat_recv_dropdb(PgStat_MsgDropdb *msg, int len) +{ + Oid dbid = msg->m_databaseid; + PgStat_StatDBEntry *dbentry; + + /* + * Lookup the database in the hashtable. + */ + dbentry = pgstat_get_db_entry(dbid, false); + + /* + * If found, remove it (along with the db statfile). + */ + if (dbentry) + { + char statfile[MAXPGPATH]; + + get_dbstat_filename(false, false, dbid, statfile, MAXPGPATH); + + elog(DEBUG2, "removing stats file \"%s\"", statfile); + unlink(statfile); + + if (dbentry->tables != NULL) + hash_destroy(dbentry->tables); + if (dbentry->functions != NULL) + hash_destroy(dbentry->functions); + + if (hash_search(pgStatDBHash, + (void *) &dbid, + HASH_REMOVE, NULL) == NULL) + ereport(ERROR, + (errmsg("database hash table corrupted during cleanup --- abort"))); + } +} + + +/* ---------- + * pgstat_recv_resetcounter() - + * + * Reset the statistics for the specified database. + * ---------- + */ +static void +pgstat_recv_resetcounter(PgStat_MsgResetcounter *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + + /* + * Lookup the database in the hashtable. Nothing to do if not there. + */ + dbentry = pgstat_get_db_entry(msg->m_databaseid, false); + + if (!dbentry) + return; + + /* + * We simply throw away all the database's table entries by recreating a + * new hash table for them. + */ + if (dbentry->tables != NULL) + hash_destroy(dbentry->tables); + if (dbentry->functions != NULL) + hash_destroy(dbentry->functions); + + dbentry->tables = NULL; + dbentry->functions = NULL; + + /* + * Reset database-level stats, too. This creates empty hash tables for + * tables and functions. + */ + reset_dbentry_counters(dbentry); +} + +/* ---------- + * pgstat_recv_resetsharedcounter() - + * + * Reset some shared statistics of the cluster. + * ---------- + */ +static void +pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len) +{ + if (msg->m_resettarget == RESET_BGWRITER) + { + /* Reset the global background writer statistics for the cluster. */ + memset(&globalStats, 0, sizeof(globalStats)); + globalStats.stat_reset_timestamp = GetCurrentTimestamp(); + } + else if (msg->m_resettarget == RESET_ARCHIVER) + { + /* Reset the archiver statistics for the cluster. 
*/ + memset(&archiverStats, 0, sizeof(archiverStats)); + archiverStats.stat_reset_timestamp = GetCurrentTimestamp(); + } + else if (msg->m_resettarget == RESET_WAL) + { + /* Reset the WAL statistics for the cluster. */ + memset(&walStats, 0, sizeof(walStats)); + walStats.stat_reset_timestamp = GetCurrentTimestamp(); + } + + /* + * Presumably the sender of this message validated the target, don't + * complain here if it's not valid + */ +} + +/* ---------- + * pgstat_recv_resetsinglecounter() - + * + * Reset a statistics for a single object + * ---------- + */ +static void +pgstat_recv_resetsinglecounter(PgStat_MsgResetsinglecounter *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, false); + + if (!dbentry) + return; + + /* Set the reset timestamp for the whole database */ + dbentry->stat_reset_timestamp = GetCurrentTimestamp(); + + /* Remove object if it exists, ignore it if not */ + if (msg->m_resettype == RESET_TABLE) + (void) hash_search(dbentry->tables, (void *) &(msg->m_objectid), + HASH_REMOVE, NULL); + else if (msg->m_resettype == RESET_FUNCTION) + (void) hash_search(dbentry->functions, (void *) &(msg->m_objectid), + HASH_REMOVE, NULL); +} + +/* ---------- + * pgstat_recv_resetslrucounter() - + * + * Reset some SLRU statistics of the cluster. + * ---------- + */ +static void +pgstat_recv_resetslrucounter(PgStat_MsgResetslrucounter *msg, int len) +{ + int i; + TimestampTz ts = GetCurrentTimestamp(); + + for (i = 0; i < SLRU_NUM_ELEMENTS; i++) + { + /* reset entry with the given index, or all entries (index is -1) */ + if ((msg->m_index == -1) || (msg->m_index == i)) + { + memset(&slruStats[i], 0, sizeof(slruStats[i])); + slruStats[i].stat_reset_timestamp = ts; + } + } +} + +/* ---------- + * pgstat_recv_resetreplslotcounter() - + * + * Reset some replication slot statistics of the cluster. + * ---------- + */ +static void +pgstat_recv_resetreplslotcounter(PgStat_MsgResetreplslotcounter *msg, + int len) +{ + PgStat_StatReplSlotEntry *slotent; + TimestampTz ts; + + /* Return if we don't have replication slot statistics */ + if (replSlotStatHash == NULL) + return; + + ts = GetCurrentTimestamp(); + if (msg->clearall) + { + HASH_SEQ_STATUS sstat; + + hash_seq_init(&sstat, replSlotStatHash); + while ((slotent = (PgStat_StatReplSlotEntry *) hash_seq_search(&sstat)) != NULL) + pgstat_reset_replslot(slotent, ts); + } + else + { + /* Get the slot statistics to reset */ + slotent = pgstat_get_replslot_entry(msg->m_slotname, false); + + /* + * Nothing to do if the given slot entry is not found. This could + * happen when the slot with the given name is removed and the + * corresponding statistics entry is also removed before receiving the + * reset message. + */ + if (!slotent) + return; + + /* Reset the stats for the requested replication slot */ + pgstat_reset_replslot(slotent, ts); + } +} + + +/* ---------- + * pgstat_recv_autovac() - + * + * Process an autovacuum signaling message. + * ---------- + */ +static void +pgstat_recv_autovac(PgStat_MsgAutovacStart *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + + /* + * Store the last autovacuum time in the database's hashtable entry. + */ + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + + dbentry->last_autovac_time = msg->m_start_time; +} + +/* ---------- + * pgstat_recv_vacuum() - + * + * Process a VACUUM message. 
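+ *
+ * The message reports the live/dead tuple counts observed at the end of
+ * the vacuum; m_autovacuum decides whether the autovacuum or the manual
+ * vacuum timestamp/count pair is updated.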
+ * ---------- + */ +static void +pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + PgStat_StatTabEntry *tabentry; + + /* + * Store the data in the table's hashtable entry. + */ + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + + tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true); + + tabentry->n_live_tuples = msg->m_live_tuples; + tabentry->n_dead_tuples = msg->m_dead_tuples; + + /* + * It is quite possible that a non-aggressive VACUUM ended up skipping + * various pages, however, we'll zero the insert counter here regardless. + * It's currently used only to track when we need to perform an "insert" + * autovacuum, which are mainly intended to freeze newly inserted tuples. + * Zeroing this may just mean we'll not try to vacuum the table again + * until enough tuples have been inserted to trigger another insert + * autovacuum. An anti-wraparound autovacuum will catch any persistent + * stragglers. + */ + tabentry->inserts_since_vacuum = 0; + + if (msg->m_autovacuum) + { + tabentry->autovac_vacuum_timestamp = msg->m_vacuumtime; + tabentry->autovac_vacuum_count++; + } + else + { + tabentry->vacuum_timestamp = msg->m_vacuumtime; + tabentry->vacuum_count++; + } +} + +/* ---------- + * pgstat_recv_analyze() - + * + * Process an ANALYZE message. + * ---------- + */ +static void +pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + PgStat_StatTabEntry *tabentry; + + /* + * Store the data in the table's hashtable entry. + */ + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + + tabentry = pgstat_get_tab_entry(dbentry, msg->m_tableoid, true); + + tabentry->n_live_tuples = msg->m_live_tuples; + tabentry->n_dead_tuples = msg->m_dead_tuples; + + /* + * If commanded, reset changes_since_analyze to zero. This forgets any + * changes that were committed while the ANALYZE was in progress, but we + * have no good way to estimate how many of those there were. + */ + if (msg->m_resetcounter) + tabentry->changes_since_analyze = 0; + + if (msg->m_autovacuum) + { + tabentry->autovac_analyze_timestamp = msg->m_analyzetime; + tabentry->autovac_analyze_count++; + } + else + { + tabentry->analyze_timestamp = msg->m_analyzetime; + tabentry->analyze_count++; + } +} + + +/* ---------- + * pgstat_recv_archiver() - + * + * Process a ARCHIVER message. + * ---------- + */ +static void +pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len) +{ + if (msg->m_failed) + { + /* Failed archival attempt */ + ++archiverStats.failed_count; + memcpy(archiverStats.last_failed_wal, msg->m_xlog, + sizeof(archiverStats.last_failed_wal)); + archiverStats.last_failed_timestamp = msg->m_timestamp; + } + else + { + /* Successful archival operation */ + ++archiverStats.archived_count; + memcpy(archiverStats.last_archived_wal, msg->m_xlog, + sizeof(archiverStats.last_archived_wal)); + archiverStats.last_archived_timestamp = msg->m_timestamp; + } +} + +/* ---------- + * pgstat_recv_bgwriter() - + * + * Process a BGWRITER message. 
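+ *
+ * Both the background writer and the checkpointer report through this
+ * message type; the counters are simply accumulated into globalStats.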
+ * ---------- + */ +static void +pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len) +{ + globalStats.timed_checkpoints += msg->m_timed_checkpoints; + globalStats.requested_checkpoints += msg->m_requested_checkpoints; + globalStats.checkpoint_write_time += msg->m_checkpoint_write_time; + globalStats.checkpoint_sync_time += msg->m_checkpoint_sync_time; + globalStats.buf_written_checkpoints += msg->m_buf_written_checkpoints; + globalStats.buf_written_clean += msg->m_buf_written_clean; + globalStats.maxwritten_clean += msg->m_maxwritten_clean; + globalStats.buf_written_backend += msg->m_buf_written_backend; + globalStats.buf_fsync_backend += msg->m_buf_fsync_backend; + globalStats.buf_alloc += msg->m_buf_alloc; +} + +/* ---------- + * pgstat_recv_wal() - + * + * Process a WAL message. + * ---------- + */ +static void +pgstat_recv_wal(PgStat_MsgWal *msg, int len) +{ + walStats.wal_records += msg->m_wal_records; + walStats.wal_fpi += msg->m_wal_fpi; + walStats.wal_bytes += msg->m_wal_bytes; + walStats.wal_buffers_full += msg->m_wal_buffers_full; + walStats.wal_write += msg->m_wal_write; + walStats.wal_sync += msg->m_wal_sync; + walStats.wal_write_time += msg->m_wal_write_time; + walStats.wal_sync_time += msg->m_wal_sync_time; +} + +/* ---------- + * pgstat_recv_slru() - + * + * Process a SLRU message. + * ---------- + */ +static void +pgstat_recv_slru(PgStat_MsgSLRU *msg, int len) +{ + slruStats[msg->m_index].blocks_zeroed += msg->m_blocks_zeroed; + slruStats[msg->m_index].blocks_hit += msg->m_blocks_hit; + slruStats[msg->m_index].blocks_read += msg->m_blocks_read; + slruStats[msg->m_index].blocks_written += msg->m_blocks_written; + slruStats[msg->m_index].blocks_exists += msg->m_blocks_exists; + slruStats[msg->m_index].flush += msg->m_flush; + slruStats[msg->m_index].truncate += msg->m_truncate; +} + +/* ---------- + * pgstat_recv_recoveryconflict() - + * + * Process a RECOVERYCONFLICT message. + * ---------- + */ +static void +pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + + switch (msg->m_reason) + { + case PROCSIG_RECOVERY_CONFLICT_DATABASE: + + /* + * Since we drop the information about the database as soon as it + * replicates, there is no point in counting these conflicts. + */ + break; + case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: + dbentry->n_conflict_tablespace++; + break; + case PROCSIG_RECOVERY_CONFLICT_LOCK: + dbentry->n_conflict_lock++; + break; + case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: + dbentry->n_conflict_snapshot++; + break; + case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: + dbentry->n_conflict_bufferpin++; + break; + case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: + dbentry->n_conflict_startup_deadlock++; + break; + } +} + +/* ---------- + * pgstat_recv_deadlock() - + * + * Process a DEADLOCK message. + * ---------- + */ +static void +pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + + dbentry->n_deadlocks++; +} + +/* ---------- + * pgstat_recv_checksum_failure() - + * + * Process a CHECKSUMFAILURE message. 
+ * ---------- + */ +static void +pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + + dbentry->n_checksum_failures += msg->m_failurecount; + dbentry->last_checksum_failure = msg->m_failure_time; +} + +/* ---------- + * pgstat_recv_replslot() - + * + * Process a REPLSLOT message. + * ---------- + */ +static void +pgstat_recv_replslot(PgStat_MsgReplSlot *msg, int len) +{ + if (msg->m_drop) + { + Assert(!msg->m_create); + + /* Remove the replication slot statistics with the given name */ + if (replSlotStatHash != NULL) + (void) hash_search(replSlotStatHash, + (void *) &(msg->m_slotname), + HASH_REMOVE, + NULL); + } + else + { + PgStat_StatReplSlotEntry *slotent; + + slotent = pgstat_get_replslot_entry(msg->m_slotname, true); + Assert(slotent); + + if (msg->m_create) + { + /* + * If the message for dropping the slot with the same name gets + * lost, slotent has stats for the old slot. So we initialize all + * counters at slot creation. + */ + pgstat_reset_replslot(slotent, 0); + } + else + { + /* Update the replication slot statistics */ + slotent->spill_txns += msg->m_spill_txns; + slotent->spill_count += msg->m_spill_count; + slotent->spill_bytes += msg->m_spill_bytes; + slotent->stream_txns += msg->m_stream_txns; + slotent->stream_count += msg->m_stream_count; + slotent->stream_bytes += msg->m_stream_bytes; + slotent->total_txns += msg->m_total_txns; + slotent->total_bytes += msg->m_total_bytes; + } + } +} + +/* ---------- + * pgstat_recv_connect() - + * + * Process a CONNECT message. + * ---------- + */ +static void +pgstat_recv_connect(PgStat_MsgConnect *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + dbentry->n_sessions++; +} + +/* ---------- + * pgstat_recv_disconnect() - + * + * Process a DISCONNECT message. + * ---------- + */ +static void +pgstat_recv_disconnect(PgStat_MsgDisconnect *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + + switch (msg->m_cause) + { + case DISCONNECT_NOT_YET: + case DISCONNECT_NORMAL: + /* we don't collect these */ + break; + case DISCONNECT_CLIENT_EOF: + dbentry->n_sessions_abandoned++; + break; + case DISCONNECT_FATAL: + dbentry->n_sessions_fatal++; + break; + case DISCONNECT_KILLED: + dbentry->n_sessions_killed++; + break; + } +} + +/* ---------- + * pgstat_recv_tempfile() - + * + * Process a TEMPFILE message. + * ---------- + */ +static void +pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + + dbentry->n_temp_bytes += msg->m_filesize; + dbentry->n_temp_files += 1; +} + +/* ---------- + * pgstat_recv_funcstat() - + * + * Count what the backend has done. + * ---------- + */ +static void +pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len) +{ + PgStat_FunctionEntry *funcmsg = &(msg->m_entry[0]); + PgStat_StatDBEntry *dbentry; + PgStat_StatFuncEntry *funcentry; + int i; + bool found; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, true); + + /* + * Process all function entries in the message. + */ + for (i = 0; i < msg->m_nentries; i++, funcmsg++) + { + funcentry = (PgStat_StatFuncEntry *) hash_search(dbentry->functions, + (void *) &(funcmsg->f_id), + HASH_ENTER, &found); + + if (!found) + { + /* + * If it's a new function entry, initialize counters to the values + * we just got. 
+ */ + funcentry->f_numcalls = funcmsg->f_numcalls; + funcentry->f_total_time = funcmsg->f_total_time; + funcentry->f_self_time = funcmsg->f_self_time; + } + else + { + /* + * Otherwise add the values to the existing entry. + */ + funcentry->f_numcalls += funcmsg->f_numcalls; + funcentry->f_total_time += funcmsg->f_total_time; + funcentry->f_self_time += funcmsg->f_self_time; + } + } +} + +/* ---------- + * pgstat_recv_funcpurge() - + * + * Arrange for dead function removal. + * ---------- + */ +static void +pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len) +{ + PgStat_StatDBEntry *dbentry; + int i; + + dbentry = pgstat_get_db_entry(msg->m_databaseid, false); + + /* + * No need to purge if we don't even know the database. + */ + if (!dbentry || !dbentry->functions) + return; + + /* + * Process all function entries in the message. + */ + for (i = 0; i < msg->m_nentries; i++) + { + /* Remove from hashtable if present; we don't care if it's not. */ + (void) hash_search(dbentry->functions, + (void *) &(msg->m_functionid[i]), + HASH_REMOVE, NULL); + } +} + +/* ---------- + * pgstat_write_statsfile_needed() - + * + * Do we need to write out any stats files? + * ---------- + */ +static bool +pgstat_write_statsfile_needed(void) +{ + if (pending_write_requests != NIL) + return true; + + /* Everything was written recently */ + return false; +} + +/* ---------- + * pgstat_db_requested() - + * + * Checks whether stats for a particular DB need to be written to a file. + * ---------- + */ +static bool +pgstat_db_requested(Oid databaseid) +{ + /* + * If any requests are outstanding at all, we should write the stats for + * shared catalogs (the "database" with OID 0). This ensures that + * backends will see up-to-date stats for shared catalogs, even though + * they send inquiry messages mentioning only their own DB. + */ + if (databaseid == InvalidOid && pending_write_requests != NIL) + return true; + + /* Search to see if there's an open request to write this database. */ + if (list_member_oid(pending_write_requests, databaseid)) + return true; + + return false; +} + +/* ---------- + * pgstat_replslot_entry + * + * Return the entry of replication slot stats with the given name. Return + * NULL if not found and the caller didn't request to create it. + * + * create tells whether to create the new slot entry if it is not found. + * ---------- + */ +static PgStat_StatReplSlotEntry * +pgstat_get_replslot_entry(NameData name, bool create) +{ + PgStat_StatReplSlotEntry *slotent; + bool found; + + if (replSlotStatHash == NULL) + { + HASHCTL hash_ctl; + + /* + * Quick return NULL if the hash table is empty and the caller didn't + * request to create the entry. + */ + if (!create) + return NULL; + + hash_ctl.keysize = sizeof(NameData); + hash_ctl.entrysize = sizeof(PgStat_StatReplSlotEntry); + replSlotStatHash = hash_create("Replication slots hash", + PGSTAT_REPLSLOT_HASH_SIZE, + &hash_ctl, + HASH_ELEM | HASH_BLOBS); + } + + slotent = (PgStat_StatReplSlotEntry *) hash_search(replSlotStatHash, + (void *) &name, + create ? HASH_ENTER : HASH_FIND, + &found); + + if (!slotent) + { + /* not found */ + Assert(!create && !found); + return NULL; + } + + /* initialize the entry */ + if (create && !found) + { + namestrcpy(&(slotent->slotname), NameStr(name)); + pgstat_reset_replslot(slotent, 0); + } + + return slotent; +} + +/* ---------- + * pgstat_reset_replslot + * + * Reset the given replication slot stats. 
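+ * Passing ts = 0 (as done when a slot entry is first created) leaves
+ * stat_reset_timestamp at zero instead of recording a reset time.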
+ * ---------- + */ +static void +pgstat_reset_replslot(PgStat_StatReplSlotEntry *slotent, TimestampTz ts) +{ + /* reset only counters. Don't clear slot name */ + slotent->spill_txns = 0; + slotent->spill_count = 0; + slotent->spill_bytes = 0; + slotent->stream_txns = 0; + slotent->stream_count = 0; + slotent->stream_bytes = 0; + slotent->total_txns = 0; + slotent->total_bytes = 0; + slotent->stat_reset_timestamp = ts; +} + +/* + * pgstat_slru_index + * + * Determine index of entry for a SLRU with a given name. If there's no exact + * match, returns index of the last "other" entry used for SLRUs defined in + * external projects. + */ +int +pgstat_slru_index(const char *name) +{ + int i; + + for (i = 0; i < SLRU_NUM_ELEMENTS; i++) + { + if (strcmp(slru_names[i], name) == 0) + return i; + } + + /* return index of the last entry (which is the "other" one) */ + return (SLRU_NUM_ELEMENTS - 1); +} + +/* + * pgstat_slru_name + * + * Returns SLRU name for an index. The index may be above SLRU_NUM_ELEMENTS, + * in which case this returns NULL. This allows writing code that does not + * know the number of entries in advance. + */ +const char * +pgstat_slru_name(int slru_idx) +{ + if (slru_idx < 0 || slru_idx >= SLRU_NUM_ELEMENTS) + return NULL; + + return slru_names[slru_idx]; +} + +/* + * slru_entry + * + * Returns pointer to entry with counters for given SLRU (based on the name + * stored in SlruCtl as lwlock tranche name). + */ +static inline PgStat_MsgSLRU * +slru_entry(int slru_idx) +{ + /* + * The postmaster should never register any SLRU statistics counts; if it + * did, the counts would be duplicated into child processes via fork(). + */ + Assert(IsUnderPostmaster || !IsPostmasterEnvironment); + + Assert((slru_idx >= 0) && (slru_idx < SLRU_NUM_ELEMENTS)); + + return &SLRUStats[slru_idx]; +} + +/* + * SLRU statistics count accumulation functions --- called from slru.c + */ + +void +pgstat_count_slru_page_zeroed(int slru_idx) +{ + slru_entry(slru_idx)->m_blocks_zeroed += 1; +} + +void +pgstat_count_slru_page_hit(int slru_idx) +{ + slru_entry(slru_idx)->m_blocks_hit += 1; +} + +void +pgstat_count_slru_page_exists(int slru_idx) +{ + slru_entry(slru_idx)->m_blocks_exists += 1; +} + +void +pgstat_count_slru_page_read(int slru_idx) +{ + slru_entry(slru_idx)->m_blocks_read += 1; +} + +void +pgstat_count_slru_page_written(int slru_idx) +{ + slru_entry(slru_idx)->m_blocks_written += 1; +} + +void +pgstat_count_slru_flush(int slru_idx) +{ + slru_entry(slru_idx)->m_flush += 1; +} + +void +pgstat_count_slru_truncate(int slru_idx) +{ + slru_entry(slru_idx)->m_truncate += 1; +} diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c new file mode 100644 index 0000000..ad35340 --- /dev/null +++ b/src/backend/postmaster/postmaster.c @@ -0,0 +1,6647 @@ +/*------------------------------------------------------------------------- + * + * postmaster.c + * This program acts as a clearing house for requests to the + * POSTGRES system. Frontend programs send a startup message + * to the Postmaster and the postmaster uses the info in the + * message to setup a backend process. + * + * The postmaster also manages system-wide operations such as + * startup and shutdown. The postmaster itself doesn't do those + * operations, mind you --- it just forks off a subprocess to do them + * at the right times. It also takes care of resetting the system + * if a backend crashes. 
+ * + * The postmaster process creates the shared memory and semaphore + * pools during startup, but as a rule does not touch them itself. + * In particular, it is not a member of the PGPROC array of backends + * and so it cannot participate in lock-manager operations. Keeping + * the postmaster away from shared memory operations makes it simpler + * and more reliable. The postmaster is almost always able to recover + * from crashes of individual backends by resetting shared memory; + * if it did much with shared memory then it would be prone to crashing + * along with the backends. + * + * When a request message is received, we now fork() immediately. + * The child process performs authentication of the request, and + * then becomes a backend if successful. This allows the auth code + * to be written in a simple single-threaded style (as opposed to the + * crufty "poor man's multitasking" code that used to be needed). + * More importantly, it ensures that blockages in non-multithreaded + * libraries like SSL or PAM cannot cause denial of service to other + * clients. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/postmaster/postmaster.c + * + * NOTES + * + * Initialization: + * The Postmaster sets up shared memory data structures + * for the backends. + * + * Synchronization: + * The Postmaster shares memory with the backends but should avoid + * touching shared memory, so as not to become stuck if a crashing + * backend screws up locks or shared memory. Likewise, the Postmaster + * should never block on messages from frontend clients. + * + * Garbage Collection: + * The Postmaster cleans up after backends if they have an emergency + * exit and/or core dump. + * + * Error Reporting: + * Use write_stderr() only for reporting "interactive" errors + * (essentially, bogus arguments on the command line). Once the + * postmaster is launched, use ereport(). 
+ * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <unistd.h> +#include <signal.h> +#include <time.h> +#include <sys/wait.h> +#include <ctype.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <fcntl.h> +#include <sys/param.h> +#include <netdb.h> +#include <limits.h> + +#ifdef HAVE_SYS_SELECT_H +#include <sys/select.h> +#endif + +#ifdef USE_BONJOUR +#include <dns_sd.h> +#endif + +#ifdef USE_SYSTEMD +#include <systemd/sd-daemon.h> +#endif + +#ifdef HAVE_PTHREAD_IS_THREADED_NP +#include <pthread.h> +#endif + +#include "access/transam.h" +#include "access/xlog.h" +#include "bootstrap/bootstrap.h" +#include "catalog/pg_control.h" +#include "common/file_perm.h" +#include "common/ip.h" +#include "common/string.h" +#include "lib/ilist.h" +#include "libpq/auth.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "libpq/pqsignal.h" +#include "pg_getopt.h" +#include "pgstat.h" +#include "port/pg_bswap.h" +#include "postmaster/autovacuum.h" +#include "postmaster/bgworker_internals.h" +#include "postmaster/fork_process.h" +#include "postmaster/interrupt.h" +#include "postmaster/pgarch.h" +#include "postmaster/postmaster.h" +#include "postmaster/syslogger.h" +#include "replication/logicallauncher.h" +#include "replication/walsender.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/pg_shmem.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/datetime.h" +#include "utils/memutils.h" +#include "utils/pidfile.h" +#include "utils/ps_status.h" +#include "utils/queryjumble.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" +#include "utils/varlena.h" + +#ifdef EXEC_BACKEND +#include "storage/spin.h" +#endif + + +/* + * Possible types of a backend. Beyond being the possible bkend_type values in + * struct bkend, these are OR-able request flag bits for SignalSomeChildren() + * and CountChildren(). + */ +#define BACKEND_TYPE_NORMAL 0x0001 /* normal backend */ +#define BACKEND_TYPE_AUTOVAC 0x0002 /* autovacuum worker process */ +#define BACKEND_TYPE_WALSND 0x0004 /* walsender process */ +#define BACKEND_TYPE_BGWORKER 0x0008 /* bgworker process */ +#define BACKEND_TYPE_ALL 0x000F /* OR of all the above */ + +/* + * List of active backends (or child processes anyway; we don't actually + * know whether a given child has become a backend or is still in the + * authorization phase). This is used mainly to keep track of how many + * children we have and send them appropriate signals when necessary. + * + * As shown in the above set of backend types, this list includes not only + * "normal" client sessions, but also autovacuum workers, walsenders, and + * background workers. (Note that at the time of launch, walsenders are + * labeled BACKEND_TYPE_NORMAL; we relabel them to BACKEND_TYPE_WALSND + * upon noticing they've changed their PMChildFlags entry. Hence that check + * must be done before any operation that needs to distinguish walsenders + * from normal backends.) + * + * Also, "dead_end" children are in it: these are children launched just for + * the purpose of sending a friendly rejection message to a would-be client. + * We must track them because they are attached to shared memory, but we know + * they will never become live backends. dead_end children are not assigned a + * PMChildSlot. dead_end children have bkend_type NORMAL. 
+ * + * "Special" children such as the startup, bgwriter and autovacuum launcher + * tasks are not in this list. They are tracked via StartupPID and other + * pid_t variables below. (Thus, there can't be more than one of any given + * "special" child process type. We use BackendList entries for any child + * process there can be more than one of.) + */ +typedef struct bkend +{ + pid_t pid; /* process id of backend */ + int32 cancel_key; /* cancel key for cancels for this backend */ + int child_slot; /* PMChildSlot for this backend, if any */ + int bkend_type; /* child process flavor, see above */ + bool dead_end; /* is it going to send an error and quit? */ + bool bgworker_notify; /* gets bgworker start/stop notifications */ + dlist_node elem; /* list link in BackendList */ +} Backend; + +static dlist_head BackendList = DLIST_STATIC_INIT(BackendList); + +#ifdef EXEC_BACKEND +static Backend *ShmemBackendArray; +#endif + +BackgroundWorker *MyBgworkerEntry = NULL; + + + +/* The socket number we are listening for connections on */ +int PostPortNumber; + +/* The directory names for Unix socket(s) */ +char *Unix_socket_directories; + +/* The TCP listen address(es) */ +char *ListenAddresses; + +/* + * ReservedBackends is the number of backends reserved for superuser use. + * This number is taken out of the pool size given by MaxConnections so + * number of backend slots available to non-superusers is + * (MaxConnections - ReservedBackends). Note what this really means is + * "if there are <= ReservedBackends connections available, only superusers + * can make new connections" --- pre-existing superuser connections don't + * count against the limit. + */ +int ReservedBackends; + +/* The socket(s) we're listening to. */ +#define MAXLISTEN 64 +static pgsocket ListenSocket[MAXLISTEN]; + +/* + * These globals control the behavior of the postmaster in case some + * backend dumps core. Normally, it kills all peers of the dead backend + * and reinitializes shared memory. By specifying -s or -n, we can have + * the postmaster stop (rather than kill) peers and not reinitialize + * shared data structures. (Reinit is currently dead code, though.) + */ +static bool Reinit = true; +static int SendStop = false; + +/* still more option variables */ +bool EnableSSL = false; + +int PreAuthDelay = 0; +int AuthenticationTimeout = 60; + +bool log_hostname; /* for ps display and logging */ +bool Log_connections = false; +bool Db_user_namespace = false; + +bool enable_bonjour = false; +char *bonjour_name; +bool restart_after_crash = true; +bool remove_temp_files_after_crash = true; + +/* PIDs of special child processes; 0 when not running */ +static pid_t StartupPID = 0, + BgWriterPID = 0, + CheckpointerPID = 0, + WalWriterPID = 0, + WalReceiverPID = 0, + AutoVacPID = 0, + PgArchPID = 0, + PgStatPID = 0, + SysLoggerPID = 0; + +/* Startup process's status */ +typedef enum +{ + STARTUP_NOT_RUNNING, + STARTUP_RUNNING, + STARTUP_SIGNALED, /* we sent it a SIGQUIT or SIGKILL */ + STARTUP_CRASHED +} StartupStatusEnum; + +static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING; + +/* Startup/shutdown state */ +#define NoShutdown 0 +#define SmartShutdown 1 +#define FastShutdown 2 +#define ImmediateShutdown 3 + +static int Shutdown = NoShutdown; + +static bool FatalError = false; /* T if recovering from backend crash */ + +/* + * We use a simple state machine to control startup, shutdown, and + * crash recovery (which is rather like shutdown followed by startup). 
+ * + * After doing all the postmaster initialization work, we enter PM_STARTUP + * state and the startup process is launched. The startup process begins by + * reading the control file and other preliminary initialization steps. + * In a normal startup, or after crash recovery, the startup process exits + * with exit code 0 and we switch to PM_RUN state. However, archive recovery + * is handled specially since it takes much longer and we would like to support + * hot standby during archive recovery. + * + * When the startup process is ready to start archive recovery, it signals the + * postmaster, and we switch to PM_RECOVERY state. The background writer and + * checkpointer are launched, while the startup process continues applying WAL. + * If Hot Standby is enabled, then, after reaching a consistent point in WAL + * redo, startup process signals us again, and we switch to PM_HOT_STANDBY + * state and begin accepting connections to perform read-only queries. When + * archive recovery is finished, the startup process exits with exit code 0 + * and we switch to PM_RUN state. + * + * Normal child backends can only be launched when we are in PM_RUN or + * PM_HOT_STANDBY state. (connsAllowed can also restrict launching.) + * In other states we handle connection requests by launching "dead_end" + * child processes, which will simply send the client an error message and + * quit. (We track these in the BackendList so that we can know when they + * are all gone; this is important because they're still connected to shared + * memory, and would interfere with an attempt to destroy the shmem segment, + * possibly leading to SHMALL failure when we try to make a new one.) + * In PM_WAIT_DEAD_END state we are waiting for all the dead_end children + * to drain out of the system, and therefore stop accepting connection + * requests at all until the last existing child has quit (which hopefully + * will not be very long). + * + * Notice that this state variable does not distinguish *why* we entered + * states later than PM_RUN --- Shutdown and FatalError must be consulted + * to find that out. FatalError is never true in PM_RECOVERY, PM_HOT_STANDBY, + * or PM_RUN states, nor in PM_SHUTDOWN states (because we don't enter those + * states when trying to recover from a crash). It can be true in PM_STARTUP + * state, because we don't clear it until we've successfully started WAL redo. + */ +typedef enum +{ + PM_INIT, /* postmaster starting */ + PM_STARTUP, /* waiting for startup subprocess */ + PM_RECOVERY, /* in archive recovery mode */ + PM_HOT_STANDBY, /* in hot standby mode */ + PM_RUN, /* normal "database is alive" state */ + PM_STOP_BACKENDS, /* need to stop remaining backends */ + PM_WAIT_BACKENDS, /* waiting for live backends to exit */ + PM_SHUTDOWN, /* waiting for checkpointer to do shutdown + * ckpt */ + PM_SHUTDOWN_2, /* waiting for archiver and walsenders to + * finish */ + PM_WAIT_DEAD_END, /* waiting for dead_end children to exit */ + PM_NO_CHILDREN /* all important children have exited */ +} PMState; + +static PMState pmState = PM_INIT; + +/* + * While performing a "smart shutdown", we restrict new connections but stay + * in PM_RUN or PM_HOT_STANDBY state until all the client backends are gone. + * connsAllowed is a sub-state indicator showing the active restriction. + * It is of no interest unless pmState is PM_RUN or PM_HOT_STANDBY. 
+ */ +typedef enum +{ + ALLOW_ALL_CONNS, /* normal not-shutting-down state */ + ALLOW_SUPERUSER_CONNS, /* only superusers can connect */ + ALLOW_NO_CONNS /* no new connections allowed, period */ +} ConnsAllowedState; + +static ConnsAllowedState connsAllowed = ALLOW_ALL_CONNS; + +/* Start time of SIGKILL timeout during immediate shutdown or child crash */ +/* Zero means timeout is not running */ +static time_t AbortStartTime = 0; + +/* Length of said timeout */ +#define SIGKILL_CHILDREN_AFTER_SECS 5 + +static bool ReachedNormalRunning = false; /* T if we've reached PM_RUN */ + +bool ClientAuthInProgress = false; /* T during new-client + * authentication */ + +bool redirection_done = false; /* stderr redirected for syslogger? */ + +/* received START_AUTOVAC_LAUNCHER signal */ +static volatile sig_atomic_t start_autovac_launcher = false; + +/* the launcher needs to be signaled to communicate some condition */ +static volatile bool avlauncher_needs_signal = false; + +/* received START_WALRECEIVER signal */ +static volatile sig_atomic_t WalReceiverRequested = false; + +/* set when there's a worker that needs to be started up */ +static volatile bool StartWorkerNeeded = true; +static volatile bool HaveCrashedWorker = false; + +#ifdef USE_SSL +/* Set when and if SSL has been initialized properly */ +static bool LoadedSSL = false; +#endif + +#ifdef USE_BONJOUR +static DNSServiceRef bonjour_sdref = NULL; +#endif + +/* + * postmaster.c - function prototypes + */ +static void CloseServerPorts(int status, Datum arg); +static void unlink_external_pid_file(int status, Datum arg); +static void getInstallationPaths(const char *argv0); +static void checkControlFile(void); +static Port *ConnCreate(int serverFd); +static void ConnFree(Port *port); +static void reset_shared(void); +static void SIGHUP_handler(SIGNAL_ARGS); +static void pmdie(SIGNAL_ARGS); +static void reaper(SIGNAL_ARGS); +static void sigusr1_handler(SIGNAL_ARGS); +static void process_startup_packet_die(SIGNAL_ARGS); +static void dummy_handler(SIGNAL_ARGS); +static void StartupPacketTimeoutHandler(void); +static void CleanupBackend(int pid, int exitstatus); +static bool CleanupBackgroundWorker(int pid, int exitstatus); +static void HandleChildCrash(int pid, int exitstatus, const char *procname); +static void LogChildExit(int lev, const char *procname, + int pid, int exitstatus); +static void PostmasterStateMachine(void); +static void BackendInitialize(Port *port); +static void BackendRun(Port *port) pg_attribute_noreturn(); +static void ExitPostmaster(int status) pg_attribute_noreturn(); +static int ServerLoop(void); +static int BackendStartup(Port *port); +static int ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done); +static void SendNegotiateProtocolVersion(List *unrecognized_protocol_options); +static void processCancelRequest(Port *port, void *pkt); +static int initMasks(fd_set *rmask); +static void report_fork_failure_to_client(Port *port, int errnum); +static CAC_state canAcceptConnections(int backend_type); +static bool RandomCancelKey(int32 *cancel_key); +static void signal_child(pid_t pid, int signal); +static bool SignalSomeChildren(int signal, int targets); +static void TerminateChildren(int signal); + +#define SignalChildren(sig) SignalSomeChildren(sig, BACKEND_TYPE_ALL) + +static int CountChildren(int target); +static bool assign_backendlist_entry(RegisteredBgWorker *rw); +static void maybe_start_bgworkers(void); +static bool CreateOptsFile(int argc, char *argv[], char *fullprogname); +static pid_t 
StartChildProcess(AuxProcType type); +static void StartAutovacuumWorker(void); +static void MaybeStartWalReceiver(void); +static void InitPostmasterDeathWatchHandle(void); + +/* + * Archiver is allowed to start up at the current postmaster state? + * + * If WAL archiving is enabled always, we are allowed to start archiver + * even during recovery. + */ +#define PgArchStartupAllowed() \ + (((XLogArchivingActive() && pmState == PM_RUN) || \ + (XLogArchivingAlways() && \ + (pmState == PM_RECOVERY || pmState == PM_HOT_STANDBY))) && \ + PgArchCanRestart()) + +#ifdef EXEC_BACKEND + +#ifdef WIN32 +#define WNOHANG 0 /* ignored, so any integer value will do */ + +static pid_t waitpid(pid_t pid, int *exitstatus, int options); +static void WINAPI pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired); + +static HANDLE win32ChildQueue; + +typedef struct +{ + HANDLE waitHandle; + HANDLE procHandle; + DWORD procId; +} win32_deadchild_waitinfo; +#endif /* WIN32 */ + +static pid_t backend_forkexec(Port *port); +static pid_t internal_forkexec(int argc, char *argv[], Port *port); + +/* Type for a socket that can be inherited to a client process */ +#ifdef WIN32 +typedef struct +{ + SOCKET origsocket; /* Original socket value, or PGINVALID_SOCKET + * if not a socket */ + WSAPROTOCOL_INFO wsainfo; +} InheritableSocket; +#else +typedef int InheritableSocket; +#endif + +/* + * Structure contains all variables passed to exec:ed backends + */ +typedef struct +{ + Port port; + InheritableSocket portsocket; + char DataDir[MAXPGPATH]; + pgsocket ListenSocket[MAXLISTEN]; + int32 MyCancelKey; + int MyPMChildSlot; +#ifndef WIN32 + unsigned long UsedShmemSegID; +#else + void *ShmemProtectiveRegion; + HANDLE UsedShmemSegID; +#endif + void *UsedShmemSegAddr; + slock_t *ShmemLock; + VariableCache ShmemVariableCache; + Backend *ShmemBackendArray; +#ifndef HAVE_SPINLOCKS + PGSemaphore *SpinlockSemaArray; +#endif + int NamedLWLockTrancheRequests; + NamedLWLockTranche *NamedLWLockTrancheArray; + LWLockPadded *MainLWLockArray; + slock_t *ProcStructLock; + PROC_HDR *ProcGlobal; + PGPROC *AuxiliaryProcs; + PGPROC *PreparedXactProcs; + PMSignalData *PMSignalState; + InheritableSocket pgStatSock; + pid_t PostmasterPid; + TimestampTz PgStartTime; + TimestampTz PgReloadTime; + pg_time_t first_syslogger_file_time; + bool redirection_done; + bool IsBinaryUpgrade; + bool query_id_enabled; + int max_safe_fds; + int MaxBackends; +#ifdef WIN32 + HANDLE PostmasterHandle; + HANDLE initial_signal_pipe; + HANDLE syslogPipe[2]; +#else + int postmaster_alive_fds[2]; + int syslogPipe[2]; +#endif + char my_exec_path[MAXPGPATH]; + char pkglib_path[MAXPGPATH]; +} BackendParameters; + +static void read_backend_variables(char *id, Port *port); +static void restore_backend_variables(BackendParameters *param, Port *port); + +#ifndef WIN32 +static bool save_backend_variables(BackendParameters *param, Port *port); +#else +static bool save_backend_variables(BackendParameters *param, Port *port, + HANDLE childProcess, pid_t childPid); +#endif + +static void ShmemBackendArrayAdd(Backend *bn); +static void ShmemBackendArrayRemove(Backend *bn); +#endif /* EXEC_BACKEND */ + +#define StartupDataBase() StartChildProcess(StartupProcess) +#define StartArchiver() StartChildProcess(ArchiverProcess) +#define StartBackgroundWriter() StartChildProcess(BgWriterProcess) +#define StartCheckpointer() StartChildProcess(CheckpointerProcess) +#define StartWalWriter() StartChildProcess(WalWriterProcess) +#define StartWalReceiver() 
StartChildProcess(WalReceiverProcess) + +/* Macros to check exit status of a child process */ +#define EXIT_STATUS_0(st) ((st) == 0) +#define EXIT_STATUS_1(st) (WIFEXITED(st) && WEXITSTATUS(st) == 1) +#define EXIT_STATUS_3(st) (WIFEXITED(st) && WEXITSTATUS(st) == 3) + +#ifndef WIN32 +/* + * File descriptors for pipe used to monitor if postmaster is alive. + * First is POSTMASTER_FD_WATCH, second is POSTMASTER_FD_OWN. + */ +int postmaster_alive_fds[2] = {-1, -1}; +#else +/* Process handle of postmaster used for the same purpose on Windows */ +HANDLE PostmasterHandle; +#endif + +/* + * Postmaster main entry point + */ +void +PostmasterMain(int argc, char *argv[]) +{ + int opt; + int status; + char *userDoption = NULL; + bool listen_addr_saved = false; + int i; + char *output_config_variable = NULL; + + InitProcessGlobals(); + + PostmasterPid = MyProcPid; + + IsPostmasterEnvironment = true; + + /* + * We should not be creating any files or directories before we check the + * data directory (see checkDataDir()), but just in case set the umask to + * the most restrictive (owner-only) permissions. + * + * checkDataDir() will reset the umask based on the data directory + * permissions. + */ + umask(PG_MODE_MASK_OWNER); + + /* + * By default, palloc() requests in the postmaster will be allocated in + * the PostmasterContext, which is space that can be recycled by backends. + * Allocated data that needs to be available to backends should be + * allocated in TopMemoryContext. + */ + PostmasterContext = AllocSetContextCreate(TopMemoryContext, + "Postmaster", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(PostmasterContext); + + /* Initialize paths to installation files */ + getInstallationPaths(argv[0]); + + /* + * Set up signal handlers for the postmaster process. + * + * In the postmaster, we use pqsignal_pm() rather than pqsignal() (which + * is used by all child processes and client processes). That has a + * couple of special behaviors: + * + * 1. Except on Windows, we tell sigaction() to block all signals for the + * duration of the signal handler. This is faster than our old approach + * of blocking/unblocking explicitly in the signal handler, and it should + * also prevent excessive stack consumption if signals arrive quickly. + * + * 2. We do not set the SA_RESTART flag. This is because signals will be + * blocked at all times except when ServerLoop is waiting for something to + * happen, and during that window, we want signals to exit the select(2) + * wait so that ServerLoop can respond if anything interesting happened. + * On some platforms, signals marked SA_RESTART would not cause the + * select() wait to end. + * + * Child processes will generally want SA_RESTART, so pqsignal() sets that + * flag. We expect children to set up their own handlers before + * unblocking signals. + * + * CAUTION: when changing this list, check for side-effects on the signal + * handling setup of child processes. See tcop/postgres.c, + * bootstrap/bootstrap.c, postmaster/bgwriter.c, postmaster/walwriter.c, + * postmaster/autovacuum.c, postmaster/pgarch.c, postmaster/pgstat.c, + * postmaster/syslogger.c, postmaster/bgworker.c and + * postmaster/checkpointer.c. 
+ */ + pqinitmask(); + PG_SETMASK(&BlockSig); + + pqsignal_pm(SIGHUP, SIGHUP_handler); /* reread config file and have + * children do same */ + pqsignal_pm(SIGINT, pmdie); /* send SIGTERM and shut down */ + pqsignal_pm(SIGQUIT, pmdie); /* send SIGQUIT and die */ + pqsignal_pm(SIGTERM, pmdie); /* wait for children and shut down */ + pqsignal_pm(SIGALRM, SIG_IGN); /* ignored */ + pqsignal_pm(SIGPIPE, SIG_IGN); /* ignored */ + pqsignal_pm(SIGUSR1, sigusr1_handler); /* message from child process */ + pqsignal_pm(SIGUSR2, dummy_handler); /* unused, reserve for children */ + pqsignal_pm(SIGCHLD, reaper); /* handle child termination */ + +#ifdef SIGURG + + /* + * Ignore SIGURG for now. Child processes may change this (see + * InitializeLatchSupport), but they will not receive any such signals + * until they wait on a latch. + */ + pqsignal_pm(SIGURG, SIG_IGN); /* ignored */ +#endif + + /* + * No other place in Postgres should touch SIGTTIN/SIGTTOU handling. We + * ignore those signals in a postmaster environment, so that there is no + * risk of a child process freezing up due to writing to stderr. But for + * a standalone backend, their default handling is reasonable. Hence, all + * child processes should just allow the inherited settings to stand. + */ +#ifdef SIGTTIN + pqsignal_pm(SIGTTIN, SIG_IGN); /* ignored */ +#endif +#ifdef SIGTTOU + pqsignal_pm(SIGTTOU, SIG_IGN); /* ignored */ +#endif + + /* ignore SIGXFSZ, so that ulimit violations work like disk full */ +#ifdef SIGXFSZ + pqsignal_pm(SIGXFSZ, SIG_IGN); /* ignored */ +#endif + + /* + * Options setup + */ + InitializeGUCOptions(); + + opterr = 1; + + /* + * Parse command-line options. CAUTION: keep this in sync with + * tcop/postgres.c (the option sets should not conflict) and with the + * common help() function in main/main.c. 
+ */ + while ((opt = getopt(argc, argv, "B:bc:C:D:d:EeFf:h:ijk:lN:nOPp:r:S:sTt:W:-:")) != -1) + { + switch (opt) + { + case 'B': + SetConfigOption("shared_buffers", optarg, PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'b': + /* Undocumented flag used for binary upgrades */ + IsBinaryUpgrade = true; + break; + + case 'C': + output_config_variable = strdup(optarg); + break; + + case 'D': + userDoption = strdup(optarg); + break; + + case 'd': + set_debug_options(atoi(optarg), PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'E': + SetConfigOption("log_statement", "all", PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'e': + SetConfigOption("datestyle", "euro", PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'F': + SetConfigOption("fsync", "false", PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'f': + if (!set_plan_disabling_options(optarg, PGC_POSTMASTER, PGC_S_ARGV)) + { + write_stderr("%s: invalid argument for option -f: \"%s\"\n", + progname, optarg); + ExitPostmaster(1); + } + break; + + case 'h': + SetConfigOption("listen_addresses", optarg, PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'i': + SetConfigOption("listen_addresses", "*", PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'j': + /* only used by interactive backend */ + break; + + case 'k': + SetConfigOption("unix_socket_directories", optarg, PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'l': + SetConfigOption("ssl", "true", PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'N': + SetConfigOption("max_connections", optarg, PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'n': + /* Don't reinit shared mem after abnormal exit */ + Reinit = false; + break; + + case 'O': + SetConfigOption("allow_system_table_mods", "true", PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'P': + SetConfigOption("ignore_system_indexes", "true", PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'p': + SetConfigOption("port", optarg, PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'r': + /* only used by single-user backend */ + break; + + case 'S': + SetConfigOption("work_mem", optarg, PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 's': + SetConfigOption("log_statement_stats", "true", PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'T': + + /* + * In the event that some backend dumps core, send SIGSTOP, + * rather than SIGQUIT, to all its peers. This lets the wily + * post_hacker collect core dumps from everyone. + */ + SendStop = true; + break; + + case 't': + { + const char *tmp = get_stats_option_name(optarg); + + if (tmp) + { + SetConfigOption(tmp, "true", PGC_POSTMASTER, PGC_S_ARGV); + } + else + { + write_stderr("%s: invalid argument for option -t: \"%s\"\n", + progname, optarg); + ExitPostmaster(1); + } + break; + } + + case 'W': + SetConfigOption("post_auth_delay", optarg, PGC_POSTMASTER, PGC_S_ARGV); + break; + + case 'c': + case '-': + { + char *name, + *value; + + ParseLongOption(optarg, &name, &value); + if (!value) + { + if (opt == '-') + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("--%s requires a value", + optarg))); + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("-c %s requires a value", + optarg))); + } + + SetConfigOption(name, value, PGC_POSTMASTER, PGC_S_ARGV); + free(name); + if (value) + free(value); + break; + } + + default: + write_stderr("Try \"%s --help\" for more information.\n", + progname); + ExitPostmaster(1); + } + } + + /* + * Postmaster accepts no non-option switch arguments. 
+ */ + if (optind < argc) + { + write_stderr("%s: invalid argument: \"%s\"\n", + progname, argv[optind]); + write_stderr("Try \"%s --help\" for more information.\n", + progname); + ExitPostmaster(1); + } + + /* + * Locate the proper configuration files and data directory, and read + * postgresql.conf for the first time. + */ + if (!SelectConfigFiles(userDoption, progname)) + ExitPostmaster(2); + + if (output_config_variable != NULL) + { + /* + * "-C guc" was specified, so print GUC's value and exit. No extra + * permission check is needed because the user is reading inside the + * data dir. + */ + const char *config_val = GetConfigOption(output_config_variable, + false, false); + + puts(config_val ? config_val : ""); + ExitPostmaster(0); + } + + /* Verify that DataDir looks reasonable */ + checkDataDir(); + + /* Check that pg_control exists */ + checkControlFile(); + + /* And switch working directory into it */ + ChangeToDataDir(); + + /* + * Check for invalid combinations of GUC settings. + */ + if (ReservedBackends >= MaxConnections) + { + write_stderr("%s: superuser_reserved_connections (%d) must be less than max_connections (%d)\n", + progname, + ReservedBackends, MaxConnections); + ExitPostmaster(1); + } + if (XLogArchiveMode > ARCHIVE_MODE_OFF && wal_level == WAL_LEVEL_MINIMAL) + ereport(ERROR, + (errmsg("WAL archival cannot be enabled when wal_level is \"minimal\""))); + if (max_wal_senders > 0 && wal_level == WAL_LEVEL_MINIMAL) + ereport(ERROR, + (errmsg("WAL streaming (max_wal_senders > 0) requires wal_level \"replica\" or \"logical\""))); + + /* + * Other one-time internal sanity checks can go here, if they are fast. + * (Put any slow processing further down, after postmaster.pid creation.) + */ + if (!CheckDateTokenTables()) + { + write_stderr("%s: invalid datetoken tables, please fix\n", progname); + ExitPostmaster(1); + } + + /* + * Now that we are done processing the postmaster arguments, reset + * getopt(3) library so that it will work correctly in subprocesses. + */ + optind = 1; +#ifdef HAVE_INT_OPTRESET + optreset = 1; /* some systems need this too */ +#endif + + /* For debugging: display postmaster environment */ + { + extern char **environ; + char **p; + + ereport(DEBUG3, + (errmsg_internal("%s: PostmasterMain: initial environment dump:", + progname))); + ereport(DEBUG3, + (errmsg_internal("-----------------------------------------"))); + for (p = environ; *p; ++p) + ereport(DEBUG3, + (errmsg_internal("\t%s", *p))); + ereport(DEBUG3, + (errmsg_internal("-----------------------------------------"))); + } + + /* + * Create lockfile for data directory. + * + * We want to do this before we try to grab the input sockets, because the + * data directory interlock is more reliable than the socket-file + * interlock (thanks to whoever decided to put socket files in /tmp :-(). + * For the same reason, it's best to grab the TCP socket(s) before the + * Unix socket(s). + * + * Also note that this internally sets up the on_proc_exit function that + * is responsible for removing both data directory and socket lockfiles; + * so it must happen before opening sockets so that at exit, the socket + * lockfiles go away after CloseServerPorts runs. + */ + CreateDataDirLockFile(true); + + /* + * Read the control file (for error checking and config info). + * + * Since we verify the control file's CRC, this has a useful side effect + * on machines where we need a run-time test for CRC support instructions. 
+ * The postmaster will do the test once at startup, and then its child + * processes will inherit the correct function pointer and not need to + * repeat the test. + */ + LocalProcessControlFile(false); + + /* + * Register the apply launcher. Since it registers a background worker, + * it needs to be called before InitializeMaxBackends(), and it's probably + * a good idea to call it before any modules had chance to take the + * background worker slots. + */ + ApplyLauncherRegister(); + + /* + * process any libraries that should be preloaded at postmaster start + */ + process_shared_preload_libraries(); + + /* + * Initialize SSL library, if specified. + */ +#ifdef USE_SSL + if (EnableSSL) + { + (void) secure_initialize(true); + LoadedSSL = true; + } +#endif + + /* + * Now that loadable modules have had their chance to register background + * workers, calculate MaxBackends. + */ + InitializeMaxBackends(); + + /* + * Set up shared memory and semaphores. + */ + reset_shared(); + + /* + * Estimate number of openable files. This must happen after setting up + * semaphores, because on some platforms semaphores count as open files. + */ + set_max_safe_fds(); + + /* + * Set reference point for stack-depth checking. + */ + (void) set_stack_base(); + + /* + * Initialize pipe (or process handle on Windows) that allows children to + * wake up from sleep on postmaster death. + */ + InitPostmasterDeathWatchHandle(); + +#ifdef WIN32 + + /* + * Initialize I/O completion port used to deliver list of dead children. + */ + win32ChildQueue = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 1); + if (win32ChildQueue == NULL) + ereport(FATAL, + (errmsg("could not create I/O completion port for child queue"))); +#endif + +#ifdef EXEC_BACKEND + /* Write out nondefault GUC settings for child processes to use */ + write_nondefault_variables(PGC_POSTMASTER); + + /* + * Clean out the temp directory used to transmit parameters to child + * processes (see internal_forkexec, below). We must do this before + * launching any child processes, else we have a race condition: we could + * remove a parameter file before the child can read it. It should be + * safe to do so now, because we verified earlier that there are no + * conflicting Postgres processes in this data directory. + */ + RemovePgTempFilesInDir(PG_TEMP_FILES_DIR, true, false); +#endif + + /* + * Forcibly remove the files signaling a standby promotion request. + * Otherwise, the existence of those files triggers a promotion too early, + * whether a user wants that or not. + * + * This removal of files is usually unnecessary because they can exist + * only during a few moments during a standby promotion. However there is + * a race condition: if pg_ctl promote is executed and creates the files + * during a promotion, the files can stay around even after the server is + * brought up to be the primary. Then, if a new standby starts by using + * the backup taken from the new primary, the files can exist at server + * startup and must be removed in order to avoid an unexpected promotion. + * + * Note that promotion signal files need to be removed before the startup + * process is invoked. Because, after that, they can be used by + * postmaster's SIGUSR1 signal handler. + */ + RemovePromoteSignalFiles(); + + /* Do the same for logrotate signal file */ + RemoveLogrotateSignalFiles(); + + /* Remove any outdated file holding the current log filenames. 
*/ + if (unlink(LOG_METAINFO_DATAFILE) < 0 && errno != ENOENT) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + LOG_METAINFO_DATAFILE))); + + /* + * If enabled, start up syslogger collection subprocess + */ + SysLoggerPID = SysLogger_Start(); + + /* + * Reset whereToSendOutput from DestDebug (its starting state) to + * DestNone. This stops ereport from sending log messages to stderr unless + * Log_destination permits. We don't do this until the postmaster is + * fully launched, since startup failures may as well be reported to + * stderr. + * + * If we are in fact disabling logging to stderr, first emit a log message + * saying so, to provide a breadcrumb trail for users who may not remember + * that their logging is configured to go somewhere else. + */ + if (!(Log_destination & LOG_DESTINATION_STDERR)) + ereport(LOG, + (errmsg("ending log output to stderr"), + errhint("Future log output will go to log destination \"%s\".", + Log_destination_string))); + + whereToSendOutput = DestNone; + + /* + * Report server startup in log. While we could emit this much earlier, + * it seems best to do so after starting the log collector, if we intend + * to use one. + */ + ereport(LOG, + (errmsg("starting %s", PG_VERSION_STR))); + + /* + * Establish input sockets. + * + * First, mark them all closed, and set up an on_proc_exit function that's + * charged with closing the sockets again at postmaster shutdown. + */ + for (i = 0; i < MAXLISTEN; i++) + ListenSocket[i] = PGINVALID_SOCKET; + + on_proc_exit(CloseServerPorts, 0); + + if (ListenAddresses) + { + char *rawstring; + List *elemlist; + ListCell *l; + int success = 0; + + /* Need a modifiable copy of ListenAddresses */ + rawstring = pstrdup(ListenAddresses); + + /* Parse string into list of hostnames */ + if (!SplitGUCList(rawstring, ',', &elemlist)) + { + /* syntax error in list */ + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list syntax in parameter \"%s\"", + "listen_addresses"))); + } + + foreach(l, elemlist) + { + char *curhost = (char *) lfirst(l); + + if (strcmp(curhost, "*") == 0) + status = StreamServerPort(AF_UNSPEC, NULL, + (unsigned short) PostPortNumber, + NULL, + ListenSocket, MAXLISTEN); + else + status = StreamServerPort(AF_UNSPEC, curhost, + (unsigned short) PostPortNumber, + NULL, + ListenSocket, MAXLISTEN); + + if (status == STATUS_OK) + { + success++; + /* record the first successful host addr in lockfile */ + if (!listen_addr_saved) + { + AddToDataDirLockFile(LOCK_FILE_LINE_LISTEN_ADDR, curhost); + listen_addr_saved = true; + } + } + else + ereport(WARNING, + (errmsg("could not create listen socket for \"%s\"", + curhost))); + } + + if (!success && elemlist != NIL) + ereport(FATAL, + (errmsg("could not create any TCP/IP sockets"))); + + list_free(elemlist); + pfree(rawstring); + } + +#ifdef USE_BONJOUR + /* Register for Bonjour only if we opened TCP socket(s) */ + if (enable_bonjour && ListenSocket[0] != PGINVALID_SOCKET) + { + DNSServiceErrorType err; + + /* + * We pass 0 for interface_index, which will result in registering on + * all "applicable" interfaces. It's not entirely clear from the + * DNS-SD docs whether this would be appropriate if we have bound to + * just a subset of the available network interfaces. 
+ */ + err = DNSServiceRegister(&bonjour_sdref, + 0, + 0, + bonjour_name, + "_postgresql._tcp.", + NULL, + NULL, + pg_hton16(PostPortNumber), + 0, + NULL, + NULL, + NULL); + if (err != kDNSServiceErr_NoError) + ereport(LOG, + (errmsg("DNSServiceRegister() failed: error code %ld", + (long) err))); + + /* + * We don't bother to read the mDNS daemon's reply, and we expect that + * it will automatically terminate our registration when the socket is + * closed at postmaster termination. So there's nothing more to be + * done here. However, the bonjour_sdref is kept around so that + * forked children can close their copies of the socket. + */ + } +#endif + +#ifdef HAVE_UNIX_SOCKETS + if (Unix_socket_directories) + { + char *rawstring; + List *elemlist; + ListCell *l; + int success = 0; + + /* Need a modifiable copy of Unix_socket_directories */ + rawstring = pstrdup(Unix_socket_directories); + + /* Parse string into list of directories */ + if (!SplitDirectoriesString(rawstring, ',', &elemlist)) + { + /* syntax error in list */ + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid list syntax in parameter \"%s\"", + "unix_socket_directories"))); + } + + foreach(l, elemlist) + { + char *socketdir = (char *) lfirst(l); + + status = StreamServerPort(AF_UNIX, NULL, + (unsigned short) PostPortNumber, + socketdir, + ListenSocket, MAXLISTEN); + + if (status == STATUS_OK) + { + success++; + /* record the first successful Unix socket in lockfile */ + if (success == 1) + AddToDataDirLockFile(LOCK_FILE_LINE_SOCKET_DIR, socketdir); + } + else + ereport(WARNING, + (errmsg("could not create Unix-domain socket in directory \"%s\"", + socketdir))); + } + + if (!success && elemlist != NIL) + ereport(FATAL, + (errmsg("could not create any Unix-domain sockets"))); + + list_free_deep(elemlist); + pfree(rawstring); + } +#endif + + /* + * check that we have some socket to listen on + */ + if (ListenSocket[0] == PGINVALID_SOCKET) + ereport(FATAL, + (errmsg("no socket created for listening"))); + + /* + * If no valid TCP ports, write an empty line for listen address, + * indicating the Unix socket must be used. Note that this line is not + * added to the lock file until there is a socket backing it. + */ + if (!listen_addr_saved) + AddToDataDirLockFile(LOCK_FILE_LINE_LISTEN_ADDR, ""); + + /* + * Record postmaster options. We delay this till now to avoid recording + * bogus options (eg, unusable port number). + */ + if (!CreateOptsFile(argc, argv, my_exec_path)) + ExitPostmaster(1); + + /* + * Write the external PID file if requested + */ + if (external_pid_file) + { + FILE *fpidfile = fopen(external_pid_file, "w"); + + if (fpidfile) + { + fprintf(fpidfile, "%d\n", MyProcPid); + fclose(fpidfile); + + /* Make PID file world readable */ + if (chmod(external_pid_file, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) != 0) + write_stderr("%s: could not change permissions of external PID file \"%s\": %s\n", + progname, external_pid_file, strerror(errno)); + } + else + write_stderr("%s: could not write external PID file \"%s\": %s\n", + progname, external_pid_file, strerror(errno)); + + on_proc_exit(unlink_external_pid_file, 0); + } + + /* + * Remove old temporary files. At this point there can be no other + * Postgres processes running in this directory, so this should be safe. + */ + RemovePgTempFiles(); + + /* + * Initialize stats collection subsystem (this does NOT start the + * collector process!) 
+ */ + pgstat_init(); + + /* + * Initialize the autovacuum subsystem (again, no process start yet) + */ + autovac_init(); + + /* + * Load configuration files for client authentication. + */ + if (!load_hba()) + { + /* + * It makes no sense to continue if we fail to load the HBA file, + * since there is no way to connect to the database in this case. + */ + ereport(FATAL, + (errmsg("could not load pg_hba.conf"))); + } + if (!load_ident()) + { + /* + * We can start up without the IDENT file, although it means that you + * cannot log in using any of the authentication methods that need a + * user name mapping. load_ident() already logged the details of error + * to the log. + */ + } + +#ifdef HAVE_PTHREAD_IS_THREADED_NP + + /* + * On macOS, libintl replaces setlocale() with a version that calls + * CFLocaleCopyCurrent() when its second argument is "" and every relevant + * environment variable is unset or empty. CFLocaleCopyCurrent() makes + * the process multithreaded. The postmaster calls sigprocmask() and + * calls fork() without an immediate exec(), both of which have undefined + * behavior in a multithreaded program. A multithreaded postmaster is the + * normal case on Windows, which offers neither fork() nor sigprocmask(). + */ + if (pthread_is_threaded_np() != 0) + ereport(FATAL, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("postmaster became multithreaded during startup"), + errhint("Set the LC_ALL environment variable to a valid locale."))); +#endif + + /* + * Remember postmaster startup time + */ + PgStartTime = GetCurrentTimestamp(); + + /* + * Report postmaster status in the postmaster.pid file, to allow pg_ctl to + * see what's happening. + */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STARTING); + + /* + * We're ready to rock and roll... + */ + StartupPID = StartupDataBase(); + Assert(StartupPID != 0); + StartupStatus = STARTUP_RUNNING; + pmState = PM_STARTUP; + + /* Some workers may be scheduled to start now */ + maybe_start_bgworkers(); + + status = ServerLoop(); + + /* + * ServerLoop probably shouldn't ever return, but if it does, close down. + */ + ExitPostmaster(status != STATUS_OK); + + abort(); /* not reached */ +} + + +/* + * on_proc_exit callback to close server's listen sockets + */ +static void +CloseServerPorts(int status, Datum arg) +{ + int i; + + /* + * First, explicitly close all the socket FDs. We used to just let this + * happen implicitly at postmaster exit, but it's better to close them + * before we remove the postmaster.pid lockfile; otherwise there's a race + * condition if a new postmaster wants to re-use the TCP port number. + */ + for (i = 0; i < MAXLISTEN; i++) + { + if (ListenSocket[i] != PGINVALID_SOCKET) + { + StreamClose(ListenSocket[i]); + ListenSocket[i] = PGINVALID_SOCKET; + } + } + + /* + * Next, remove any filesystem entries for Unix sockets. To avoid race + * conditions against incoming postmasters, this must happen after closing + * the sockets and before removing lock files. + */ + RemoveSocketFiles(); + + /* + * We don't do anything about socket lock files here; those will be + * removed in a later on_proc_exit callback. 
+ */ +} + +/* + * on_proc_exit callback to delete external_pid_file + */ +static void +unlink_external_pid_file(int status, Datum arg) +{ + if (external_pid_file) + unlink(external_pid_file); +} + + +/* + * Compute and check the directory paths to files that are part of the + * installation (as deduced from the postgres executable's own location) + */ +static void +getInstallationPaths(const char *argv0) +{ + DIR *pdir; + + /* Locate the postgres executable itself */ + if (find_my_exec(argv0, my_exec_path) < 0) + ereport(FATAL, + (errmsg("%s: could not locate my own executable path", argv0))); + +#ifdef EXEC_BACKEND + /* Locate executable backend before we change working directory */ + if (find_other_exec(argv0, "postgres", PG_BACKEND_VERSIONSTR, + postgres_exec_path) < 0) + ereport(FATAL, + (errmsg("%s: could not locate matching postgres executable", + argv0))); +#endif + + /* + * Locate the pkglib directory --- this has to be set early in case we try + * to load any modules from it in response to postgresql.conf entries. + */ + get_pkglib_path(my_exec_path, pkglib_path); + + /* + * Verify that there's a readable directory there; otherwise the Postgres + * installation is incomplete or corrupt. (A typical cause of this + * failure is that the postgres executable has been moved or hardlinked to + * some directory that's not a sibling of the installation lib/ + * directory.) + */ + pdir = AllocateDir(pkglib_path); + if (pdir == NULL) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open directory \"%s\": %m", + pkglib_path), + errhint("This may indicate an incomplete PostgreSQL installation, or that the file \"%s\" has been moved away from its proper location.", + my_exec_path))); + FreeDir(pdir); + + /* + * XXX is it worth similarly checking the share/ directory? If the lib/ + * directory is there, then share/ probably is too. + */ +} + +/* + * Check that pg_control exists in the correct location in the data directory. + * + * No attempt is made to validate the contents of pg_control here. This is + * just a sanity check to see if we are looking at a real data directory. + */ +static void +checkControlFile(void) +{ + char path[MAXPGPATH]; + FILE *fp; + + snprintf(path, sizeof(path), "%s/global/pg_control", DataDir); + + fp = AllocateFile(path, PG_BINARY_R); + if (fp == NULL) + { + write_stderr("%s: could not find the database system\n" + "Expected to find it in the directory \"%s\",\n" + "but could not open file \"%s\": %s\n", + progname, DataDir, path, strerror(errno)); + ExitPostmaster(2); + } + FreeFile(fp); +} + +/* + * Determine how long should we let ServerLoop sleep. + * + * In normal conditions we wait at most one minute, to ensure that the other + * background tasks handled by ServerLoop get done even when no requests are + * arriving. However, if there are background workers waiting to be started, + * we don't actually sleep so that they are quickly serviced. Other exception + * cases are as shown in the code. + */ +static void +DetermineSleepTime(struct timeval *timeout) +{ + TimestampTz next_wakeup = 0; + + /* + * Normal case: either there are no background workers at all, or we're in + * a shutdown sequence (during which we ignore bgworkers altogether). 
+ */ + if (Shutdown > NoShutdown || + (!StartWorkerNeeded && !HaveCrashedWorker)) + { + if (AbortStartTime != 0) + { + /* time left to abort; clamp to 0 in case it already expired */ + timeout->tv_sec = SIGKILL_CHILDREN_AFTER_SECS - + (time(NULL) - AbortStartTime); + timeout->tv_sec = Max(timeout->tv_sec, 0); + timeout->tv_usec = 0; + } + else + { + timeout->tv_sec = 60; + timeout->tv_usec = 0; + } + return; + } + + if (StartWorkerNeeded) + { + timeout->tv_sec = 0; + timeout->tv_usec = 0; + return; + } + + if (HaveCrashedWorker) + { + slist_mutable_iter siter; + + /* + * When there are crashed bgworkers, we sleep just long enough that + * they are restarted when they request to be. Scan the list to + * determine the minimum of all wakeup times according to most recent + * crash time and requested restart interval. + */ + slist_foreach_modify(siter, &BackgroundWorkerList) + { + RegisteredBgWorker *rw; + TimestampTz this_wakeup; + + rw = slist_container(RegisteredBgWorker, rw_lnode, siter.cur); + + if (rw->rw_crashed_at == 0) + continue; + + if (rw->rw_worker.bgw_restart_time == BGW_NEVER_RESTART + || rw->rw_terminate) + { + ForgetBackgroundWorker(&siter); + continue; + } + + this_wakeup = TimestampTzPlusMilliseconds(rw->rw_crashed_at, + 1000L * rw->rw_worker.bgw_restart_time); + if (next_wakeup == 0 || this_wakeup < next_wakeup) + next_wakeup = this_wakeup; + } + } + + if (next_wakeup != 0) + { + long secs; + int microsecs; + + TimestampDifference(GetCurrentTimestamp(), next_wakeup, + &secs, &microsecs); + timeout->tv_sec = secs; + timeout->tv_usec = microsecs; + + /* Ensure we don't exceed one minute */ + if (timeout->tv_sec > 60) + { + timeout->tv_sec = 60; + timeout->tv_usec = 0; + } + } + else + { + timeout->tv_sec = 60; + timeout->tv_usec = 0; + } +} + +/* + * Main idle loop of postmaster + * + * NB: Needs to be called with signals blocked + */ +static int +ServerLoop(void) +{ + fd_set readmask; + int nSockets; + time_t last_lockfile_recheck_time, + last_touch_time; + + last_lockfile_recheck_time = last_touch_time = time(NULL); + + nSockets = initMasks(&readmask); + + for (;;) + { + fd_set rmask; + int selres; + time_t now; + + /* + * Wait for a connection request to arrive. + * + * We block all signals except while sleeping. That makes it safe for + * signal handlers, which again block all signals while executing, to + * do nontrivial work. + * + * If we are in PM_WAIT_DEAD_END state, then we don't want to accept + * any new connections, so we don't call select(), and just sleep. + */ + memcpy((char *) &rmask, (char *) &readmask, sizeof(fd_set)); + + if (pmState == PM_WAIT_DEAD_END) + { + PG_SETMASK(&UnBlockSig); + + pg_usleep(100000L); /* 100 msec seems reasonable */ + selres = 0; + + PG_SETMASK(&BlockSig); + } + else + { + /* must set timeout each time; some OSes change it! */ + struct timeval timeout; + + /* Needs to run with blocked signals! */ + DetermineSleepTime(&timeout); + + PG_SETMASK(&UnBlockSig); + + selres = select(nSockets, &rmask, NULL, NULL, &timeout); + + PG_SETMASK(&BlockSig); + } + + /* Now check the select() result */ + if (selres < 0) + { + if (errno != EINTR && errno != EWOULDBLOCK) + { + ereport(LOG, + (errcode_for_socket_access(), + errmsg("select() failed in postmaster: %m"))); + return STATUS_ERROR; + } + } + + /* + * New connection pending on any of our sockets? If so, fork a child + * process to deal with it. 
+ */ + if (selres > 0) + { + int i; + + for (i = 0; i < MAXLISTEN; i++) + { + if (ListenSocket[i] == PGINVALID_SOCKET) + break; + if (FD_ISSET(ListenSocket[i], &rmask)) + { + Port *port; + + port = ConnCreate(ListenSocket[i]); + if (port) + { + BackendStartup(port); + + /* + * We no longer need the open socket or port structure + * in this process + */ + StreamClose(port->sock); + ConnFree(port); + } + } + } + } + + /* If we have lost the log collector, try to start a new one */ + if (SysLoggerPID == 0 && Logging_collector) + SysLoggerPID = SysLogger_Start(); + + /* + * If no background writer process is running, and we are not in a + * state that prevents it, start one. It doesn't matter if this + * fails, we'll just try again later. Likewise for the checkpointer. + */ + if (pmState == PM_RUN || pmState == PM_RECOVERY || + pmState == PM_HOT_STANDBY) + { + if (CheckpointerPID == 0) + CheckpointerPID = StartCheckpointer(); + if (BgWriterPID == 0) + BgWriterPID = StartBackgroundWriter(); + } + + /* + * Likewise, if we have lost the walwriter process, try to start a new + * one. But this is needed only in normal operation (else we cannot + * be writing any new WAL). + */ + if (WalWriterPID == 0 && pmState == PM_RUN) + WalWriterPID = StartWalWriter(); + + /* + * If we have lost the autovacuum launcher, try to start a new one. We + * don't want autovacuum to run in binary upgrade mode because + * autovacuum might update relfrozenxid for empty tables before the + * physical files are put in place. + */ + if (!IsBinaryUpgrade && AutoVacPID == 0 && + (AutoVacuumingActive() || start_autovac_launcher) && + pmState == PM_RUN) + { + AutoVacPID = StartAutoVacLauncher(); + if (AutoVacPID != 0) + start_autovac_launcher = false; /* signal processed */ + } + + /* If we have lost the stats collector, try to start a new one */ + if (PgStatPID == 0 && + (pmState == PM_RUN || pmState == PM_HOT_STANDBY)) + PgStatPID = pgstat_start(); + + /* If we have lost the archiver, try to start a new one. */ + if (PgArchPID == 0 && PgArchStartupAllowed()) + PgArchPID = StartArchiver(); + + /* If we need to signal the autovacuum launcher, do so now */ + if (avlauncher_needs_signal) + { + avlauncher_needs_signal = false; + if (AutoVacPID != 0) + kill(AutoVacPID, SIGUSR2); + } + + /* If we need to start a WAL receiver, try to do that now */ + if (WalReceiverRequested) + MaybeStartWalReceiver(); + + /* Get other worker processes running, if needed */ + if (StartWorkerNeeded || HaveCrashedWorker) + maybe_start_bgworkers(); + +#ifdef HAVE_PTHREAD_IS_THREADED_NP + + /* + * With assertions enabled, check regularly for appearance of + * additional threads. All builds check at start and exit. + */ + Assert(pthread_is_threaded_np() == 0); +#endif + + /* + * Lastly, check to see if it's time to do some things that we don't + * want to do every single time through the loop, because they're a + * bit expensive. Note that there's up to a minute of slop in when + * these tasks will be performed, since DetermineSleepTime() will let + * us sleep at most that long; except for SIGKILL timeout which has + * special-case logic there. + */ + now = time(NULL); + + /* + * If we already sent SIGQUIT to children and they are slow to shut + * down, it's time to send them SIGKILL. This doesn't happen + * normally, but under certain conditions backends can get stuck while + * shutting down. This is a last measure to get them unwedged. + * + * Note we also do this during recovery from a process crash. 
+ */ + if ((Shutdown >= ImmediateShutdown || (FatalError && !SendStop)) && + AbortStartTime != 0 && + (now - AbortStartTime) >= SIGKILL_CHILDREN_AFTER_SECS) + { + /* We were gentle with them before. Not anymore */ + ereport(LOG, + (errmsg("issuing SIGKILL to recalcitrant children"))); + TerminateChildren(SIGKILL); + /* reset flag so we don't SIGKILL again */ + AbortStartTime = 0; + } + + /* + * Once a minute, verify that postmaster.pid hasn't been removed or + * overwritten. If it has, we force a shutdown. This avoids having + * postmasters and child processes hanging around after their database + * is gone, and maybe causing problems if a new database cluster is + * created in the same place. It also provides some protection + * against a DBA foolishly removing postmaster.pid and manually + * starting a new postmaster. Data corruption is likely to ensue from + * that anyway, but we can minimize the damage by aborting ASAP. + */ + if (now - last_lockfile_recheck_time >= 1 * SECS_PER_MINUTE) + { + if (!RecheckDataDirLockFile()) + { + ereport(LOG, + (errmsg("performing immediate shutdown because data directory lock file is invalid"))); + kill(MyProcPid, SIGQUIT); + } + last_lockfile_recheck_time = now; + } + + /* + * Touch Unix socket and lock files every 58 minutes, to ensure that + * they are not removed by overzealous /tmp-cleaning tasks. We assume + * no one runs cleaners with cutoff times of less than an hour ... + */ + if (now - last_touch_time >= 58 * SECS_PER_MINUTE) + { + TouchSocketFiles(); + TouchSocketLockFiles(); + last_touch_time = now; + } + } +} + +/* + * Initialise the masks for select() for the ports we are listening on. + * Return the number of sockets to listen on. + */ +static int +initMasks(fd_set *rmask) +{ + int maxsock = -1; + int i; + + FD_ZERO(rmask); + + for (i = 0; i < MAXLISTEN; i++) + { + int fd = ListenSocket[i]; + + if (fd == PGINVALID_SOCKET) + break; + FD_SET(fd, rmask); + + if (fd > maxsock) + maxsock = fd; + } + + return maxsock + 1; +} + + +/* + * Read a client's startup packet and do something according to it. + * + * Returns STATUS_OK or STATUS_ERROR, or might call ereport(FATAL) and + * not return at all. + * + * (Note that ereport(FATAL) stuff is sent to the client, so only use it + * if that's what you want. Return STATUS_ERROR if you don't want to + * send anything to the client, which would typically be appropriate + * if we detect a communications failure.) + * + * Set ssl_done and/or gss_done when negotiation of an encrypted layer + * (currently, TLS or GSSAPI) is completed. A successful negotiation of either + * encryption layer sets both flags, but a rejected negotiation sets only the + * flag for that layer, since the client may wish to try the other one. We + * should make no assumption here about the order in which the client may make + * requests. + */ +static int +ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) +{ + int32 len; + char *buf; + ProtocolVersion proto; + MemoryContext oldcontext; + + pq_startmsgread(); + + /* + * Grab the first byte of the length word separately, so that we can tell + * whether we have no data at all or an incomplete packet. (This might + * sound inefficient, but it's not really, because of buffering in + * pqcomm.c.) + */ + if (pq_getbytes((char *) &len, 1) == EOF) + { + /* + * If we get no data at all, don't clutter the log with a complaint; + * such cases often occur for legitimate reasons. 
An example is that + * we might be here after responding to NEGOTIATE_SSL_CODE, and if the + * client didn't like our response, it'll probably just drop the + * connection. Service-monitoring software also often just opens and + * closes a connection without sending anything. (So do port + * scanners, which may be less benign, but it's not really our job to + * notice those.) + */ + return STATUS_ERROR; + } + + if (pq_getbytes(((char *) &len) + 1, 3) == EOF) + { + /* Got a partial length word, so bleat about that */ + if (!ssl_done && !gss_done) + ereport(COMMERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("incomplete startup packet"))); + return STATUS_ERROR; + } + + len = pg_ntoh32(len); + len -= 4; + + if (len < (int32) sizeof(ProtocolVersion) || + len > MAX_STARTUP_PACKET_LENGTH) + { + ereport(COMMERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid length of startup packet"))); + return STATUS_ERROR; + } + + /* + * Allocate space to hold the startup packet, plus one extra byte that's + * initialized to be zero. This ensures we will have null termination of + * all strings inside the packet. + */ + buf = palloc(len + 1); + buf[len] = '\0'; + + if (pq_getbytes(buf, len) == EOF) + { + ereport(COMMERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("incomplete startup packet"))); + return STATUS_ERROR; + } + pq_endmsgread(); + + /* + * The first field is either a protocol version number or a special + * request code. + */ + port->proto = proto = pg_ntoh32(*((ProtocolVersion *) buf)); + + if (proto == CANCEL_REQUEST_CODE) + { + processCancelRequest(port, buf); + /* Not really an error, but we don't want to proceed further */ + return STATUS_ERROR; + } + + if (proto == NEGOTIATE_SSL_CODE && !ssl_done) + { + char SSLok; + +#ifdef USE_SSL + /* No SSL when disabled or on Unix sockets */ + if (!LoadedSSL || IS_AF_UNIX(port->laddr.addr.ss_family)) + SSLok = 'N'; + else + SSLok = 'S'; /* Support for SSL */ +#else + SSLok = 'N'; /* No support for SSL */ +#endif + +retry1: + if (send(port->sock, &SSLok, 1, 0) != 1) + { + if (errno == EINTR) + goto retry1; /* if interrupted, just retry */ + ereport(COMMERROR, + (errcode_for_socket_access(), + errmsg("failed to send SSL negotiation response: %m"))); + return STATUS_ERROR; /* close the connection */ + } + +#ifdef USE_SSL + if (SSLok == 'S' && secure_open_server(port) == -1) + return STATUS_ERROR; +#endif + + /* + * At this point we should have no data already buffered. If we do, + * it was received before we performed the SSL handshake, so it wasn't + * encrypted and indeed may have been injected by a man-in-the-middle. + * We report this case to the client. 
+ */ + if (pq_buffer_has_data()) + ereport(FATAL, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("received unencrypted data after SSL request"), + errdetail("This could be either a client-software bug or evidence of an attempted man-in-the-middle attack."))); + + /* + * regular startup packet, cancel, etc packet should follow, but not + * another SSL negotiation request, and a GSS request should only + * follow if SSL was rejected (client may negotiate in either order) + */ + return ProcessStartupPacket(port, true, SSLok == 'S'); + } + else if (proto == NEGOTIATE_GSS_CODE && !gss_done) + { + char GSSok = 'N'; + +#ifdef ENABLE_GSS + /* No GSSAPI encryption when on Unix socket */ + if (!IS_AF_UNIX(port->laddr.addr.ss_family)) + GSSok = 'G'; +#endif + + while (send(port->sock, &GSSok, 1, 0) != 1) + { + if (errno == EINTR) + continue; + ereport(COMMERROR, + (errcode_for_socket_access(), + errmsg("failed to send GSSAPI negotiation response: %m"))); + return STATUS_ERROR; /* close the connection */ + } + +#ifdef ENABLE_GSS + if (GSSok == 'G' && secure_open_gssapi(port) == -1) + return STATUS_ERROR; +#endif + + /* + * At this point we should have no data already buffered. If we do, + * it was received before we performed the GSS handshake, so it wasn't + * encrypted and indeed may have been injected by a man-in-the-middle. + * We report this case to the client. + */ + if (pq_buffer_has_data()) + ereport(FATAL, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("received unencrypted data after GSSAPI encryption request"), + errdetail("This could be either a client-software bug or evidence of an attempted man-in-the-middle attack."))); + + /* + * regular startup packet, cancel, etc packet should follow, but not + * another GSS negotiation request, and an SSL request should only + * follow if GSS was rejected (client may negotiate in either order) + */ + return ProcessStartupPacket(port, GSSok == 'G', true); + } + + /* Could add additional special packet types here */ + + /* + * Set FrontendProtocol now so that ereport() knows what format to send if + * we fail during startup. + */ + FrontendProtocol = proto; + + /* Check that the major protocol version is in range. */ + if (PG_PROTOCOL_MAJOR(proto) < PG_PROTOCOL_MAJOR(PG_PROTOCOL_EARLIEST) || + PG_PROTOCOL_MAJOR(proto) > PG_PROTOCOL_MAJOR(PG_PROTOCOL_LATEST)) + ereport(FATAL, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("unsupported frontend protocol %u.%u: server supports %u.0 to %u.%u", + PG_PROTOCOL_MAJOR(proto), PG_PROTOCOL_MINOR(proto), + PG_PROTOCOL_MAJOR(PG_PROTOCOL_EARLIEST), + PG_PROTOCOL_MAJOR(PG_PROTOCOL_LATEST), + PG_PROTOCOL_MINOR(PG_PROTOCOL_LATEST)))); + + /* + * Now fetch parameters out of startup packet and save them into the Port + * structure. All data structures attached to the Port struct must be + * allocated in TopMemoryContext so that they will remain available in a + * running backend (even after PostmasterContext is destroyed). We need + * not worry about leaking this storage on failure, since we aren't in the + * postmaster process anymore. + */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + /* Handle protocol version 3 startup packet */ + { + int32 offset = sizeof(ProtocolVersion); + List *unrecognized_protocol_options = NIL; + + /* + * Scan packet body for name/option pairs. We can assume any string + * beginning within the packet body is null-terminated, thanks to + * zeroing extra byte above. 
+ */ + port->guc_options = NIL; + + while (offset < len) + { + char *nameptr = buf + offset; + int32 valoffset; + char *valptr; + + if (*nameptr == '\0') + break; /* found packet terminator */ + valoffset = offset + strlen(nameptr) + 1; + if (valoffset >= len) + break; /* missing value, will complain below */ + valptr = buf + valoffset; + + if (strcmp(nameptr, "database") == 0) + port->database_name = pstrdup(valptr); + else if (strcmp(nameptr, "user") == 0) + port->user_name = pstrdup(valptr); + else if (strcmp(nameptr, "options") == 0) + port->cmdline_options = pstrdup(valptr); + else if (strcmp(nameptr, "replication") == 0) + { + /* + * Due to backward compatibility concerns the replication + * parameter is a hybrid beast which allows the value to be + * either boolean or the string 'database'. The latter + * connects to a specific database which is e.g. required for + * logical decoding while. + */ + if (strcmp(valptr, "database") == 0) + { + am_walsender = true; + am_db_walsender = true; + } + else if (!parse_bool(valptr, &am_walsender)) + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": \"%s\"", + "replication", + valptr), + errhint("Valid values are: \"false\", 0, \"true\", 1, \"database\"."))); + } + else if (strncmp(nameptr, "_pq_.", 5) == 0) + { + /* + * Any option beginning with _pq_. is reserved for use as a + * protocol-level option, but at present no such options are + * defined. + */ + unrecognized_protocol_options = + lappend(unrecognized_protocol_options, pstrdup(nameptr)); + } + else + { + /* Assume it's a generic GUC option */ + port->guc_options = lappend(port->guc_options, + pstrdup(nameptr)); + port->guc_options = lappend(port->guc_options, + pstrdup(valptr)); + + /* + * Copy application_name to port if we come across it. This + * is done so we can log the application_name in the + * connection authorization message. Note that the GUC would + * be used but we haven't gone through GUC setup yet. + */ + if (strcmp(nameptr, "application_name") == 0) + { + char *tmp_app_name = pstrdup(valptr); + + pg_clean_ascii(tmp_app_name); + + port->application_name = tmp_app_name; + } + } + offset = valoffset + strlen(valptr) + 1; + } + + /* + * If we didn't find a packet terminator exactly at the end of the + * given packet length, complain. + */ + if (offset != len - 1) + ereport(FATAL, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("invalid startup packet layout: expected terminator as last byte"))); + + /* + * If the client requested a newer protocol version or if the client + * requested any protocol options we didn't recognize, let them know + * the newest minor protocol version we do support and the names of + * any unrecognized options. + */ + if (PG_PROTOCOL_MINOR(proto) > PG_PROTOCOL_MINOR(PG_PROTOCOL_LATEST) || + unrecognized_protocol_options != NIL) + SendNegotiateProtocolVersion(unrecognized_protocol_options); + } + + /* Check a user name was given. */ + if (port->user_name == NULL || port->user_name[0] == '\0') + ereport(FATAL, + (errcode(ERRCODE_INVALID_AUTHORIZATION_SPECIFICATION), + errmsg("no PostgreSQL user name specified in startup packet"))); + + /* The database defaults to the user name. */ + if (port->database_name == NULL || port->database_name[0] == '\0') + port->database_name = pstrdup(port->user_name); + + if (Db_user_namespace) + { + /* + * If user@, it is a global user, remove '@'. 
We only want to do this + * if there is an '@' at the end and no earlier in the user string or + * they may fake as a local user of another database attaching to this + * database. + */ + if (strchr(port->user_name, '@') == + port->user_name + strlen(port->user_name) - 1) + *strchr(port->user_name, '@') = '\0'; + else + { + /* Append '@' and dbname */ + port->user_name = psprintf("%s@%s", port->user_name, port->database_name); + } + } + + /* + * Truncate given database and user names to length of a Postgres name. + * This avoids lookup failures when overlength names are given. + */ + if (strlen(port->database_name) >= NAMEDATALEN) + port->database_name[NAMEDATALEN - 1] = '\0'; + if (strlen(port->user_name) >= NAMEDATALEN) + port->user_name[NAMEDATALEN - 1] = '\0'; + + if (am_walsender) + MyBackendType = B_WAL_SENDER; + else + MyBackendType = B_BACKEND; + + /* + * Normal walsender backends, e.g. for streaming replication, are not + * connected to a particular database. But walsenders used for logical + * replication need to connect to a specific database. We allow streaming + * replication commands to be issued even if connected to a database as it + * can make sense to first make a basebackup and then stream changes + * starting from that. + */ + if (am_walsender && !am_db_walsender) + port->database_name[0] = '\0'; + + /* + * Done putting stuff in TopMemoryContext. + */ + MemoryContextSwitchTo(oldcontext); + + /* + * If we're going to reject the connection due to database state, say so + * now instead of wasting cycles on an authentication exchange. (This also + * allows a pg_ping utility to be written.) + */ + switch (port->canAcceptConnections) + { + case CAC_STARTUP: + ereport(FATAL, + (errcode(ERRCODE_CANNOT_CONNECT_NOW), + errmsg("the database system is starting up"))); + break; + case CAC_NOTCONSISTENT: + if (EnableHotStandby) + ereport(FATAL, + (errcode(ERRCODE_CANNOT_CONNECT_NOW), + errmsg("the database system is not yet accepting connections"), + errdetail("Consistent recovery state has not been yet reached."))); + else + ereport(FATAL, + (errcode(ERRCODE_CANNOT_CONNECT_NOW), + errmsg("the database system is not accepting connections"), + errdetail("Hot standby mode is disabled."))); + break; + case CAC_SHUTDOWN: + ereport(FATAL, + (errcode(ERRCODE_CANNOT_CONNECT_NOW), + errmsg("the database system is shutting down"))); + break; + case CAC_RECOVERY: + ereport(FATAL, + (errcode(ERRCODE_CANNOT_CONNECT_NOW), + errmsg("the database system is in recovery mode"))); + break; + case CAC_TOOMANY: + ereport(FATAL, + (errcode(ERRCODE_TOO_MANY_CONNECTIONS), + errmsg("sorry, too many clients already"))); + break; + case CAC_SUPERUSER: + /* OK for now, will check in InitPostgres */ + break; + case CAC_OK: + break; + } + + return STATUS_OK; +} + +/* + * Send a NegotiateProtocolVersion to the client. This lets the client know + * that they have requested a newer minor protocol version than we are able + * to speak. We'll speak the highest version we know about; the client can, + * of course, abandon the connection if that's a problem. + * + * We also include in the response a list of protocol options we didn't + * understand. This allows clients to include optional parameters that might + * be present either in newer protocol versions or third-party protocol + * extensions without fear of having to reconnect if those options are not + * understood, while at the same time making certain that the client is aware + * of which options were actually accepted. 
+ */ +static void +SendNegotiateProtocolVersion(List *unrecognized_protocol_options) +{ + StringInfoData buf; + ListCell *lc; + + pq_beginmessage(&buf, 'v'); /* NegotiateProtocolVersion */ + pq_sendint32(&buf, PG_PROTOCOL_LATEST); + pq_sendint32(&buf, list_length(unrecognized_protocol_options)); + foreach(lc, unrecognized_protocol_options) + pq_sendstring(&buf, lfirst(lc)); + pq_endmessage(&buf); + + /* no need to flush, some other message will follow */ +} + +/* + * The client has sent a cancel request packet, not a normal + * start-a-new-connection packet. Perform the necessary processing. + * Nothing is sent back to the client. + */ +static void +processCancelRequest(Port *port, void *pkt) +{ + CancelRequestPacket *canc = (CancelRequestPacket *) pkt; + int backendPID; + int32 cancelAuthCode; + Backend *bp; + +#ifndef EXEC_BACKEND + dlist_iter iter; +#else + int i; +#endif + + backendPID = (int) pg_ntoh32(canc->backendPID); + cancelAuthCode = (int32) pg_ntoh32(canc->cancelAuthCode); + + /* + * See if we have a matching backend. In the EXEC_BACKEND case, we can no + * longer access the postmaster's own backend list, and must rely on the + * duplicate array in shared memory. + */ +#ifndef EXEC_BACKEND + dlist_foreach(iter, &BackendList) + { + bp = dlist_container(Backend, elem, iter.cur); +#else + for (i = MaxLivePostmasterChildren() - 1; i >= 0; i--) + { + bp = (Backend *) &ShmemBackendArray[i]; +#endif + if (bp->pid == backendPID) + { + if (bp->cancel_key == cancelAuthCode) + { + /* Found a match; signal that backend to cancel current op */ + ereport(DEBUG2, + (errmsg_internal("processing cancel request: sending SIGINT to process %d", + backendPID))); + signal_child(bp->pid, SIGINT); + } + else + /* Right PID, wrong key: no way, Jose */ + ereport(LOG, + (errmsg("wrong key in cancel request for process %d", + backendPID))); + return; + } +#ifndef EXEC_BACKEND /* make GNU Emacs 26.1 see brace balance */ + } +#else + } +#endif + + /* No matching backend */ + ereport(LOG, + (errmsg("PID %d in cancel request did not match any process", + backendPID))); +} + +/* + * canAcceptConnections --- check to see if database state allows connections + * of the specified type. backend_type can be BACKEND_TYPE_NORMAL, + * BACKEND_TYPE_AUTOVAC, or BACKEND_TYPE_BGWORKER. (Note that we don't yet + * know whether a NORMAL connection might turn into a walsender.) + */ +static CAC_state +canAcceptConnections(int backend_type) +{ + CAC_state result = CAC_OK; + + /* + * Can't start backends when in startup/shutdown/inconsistent recovery + * state. We treat autovac workers the same as user backends for this + * purpose. However, bgworkers are excluded from this test; we expect + * bgworker_should_start_now() decided whether the DB state allows them. + */ + if (pmState != PM_RUN && pmState != PM_HOT_STANDBY && + backend_type != BACKEND_TYPE_BGWORKER) + { + if (Shutdown > NoShutdown) + return CAC_SHUTDOWN; /* shutdown is pending */ + else if (!FatalError && pmState == PM_STARTUP) + return CAC_STARTUP; /* normal startup */ + else if (!FatalError && pmState == PM_RECOVERY) + return CAC_NOTCONSISTENT; /* not yet at consistent recovery + * state */ + else + return CAC_RECOVERY; /* else must be crash recovery */ + } + + /* + * "Smart shutdown" restrictions are applied only to normal connections, + * not to autovac workers or bgworkers. When only superusers can connect, + * we return CAC_SUPERUSER to indicate that superuserness must be checked + * later. 
Note that neither CAC_OK nor CAC_SUPERUSER can safely be + * returned until we have checked for too many children. + */ + if (connsAllowed != ALLOW_ALL_CONNS && + backend_type == BACKEND_TYPE_NORMAL) + { + if (connsAllowed == ALLOW_SUPERUSER_CONNS) + result = CAC_SUPERUSER; /* allow superusers only */ + else + return CAC_SHUTDOWN; /* shutdown is pending */ + } + + /* + * Don't start too many children. + * + * We allow more connections here than we can have backends because some + * might still be authenticating; they might fail auth, or some existing + * backend might exit before the auth cycle is completed. The exact + * MaxBackends limit is enforced when a new backend tries to join the + * shared-inval backend array. + * + * The limit here must match the sizes of the per-child-process arrays; + * see comments for MaxLivePostmasterChildren(). + */ + if (CountChildren(BACKEND_TYPE_ALL) >= MaxLivePostmasterChildren()) + result = CAC_TOOMANY; + + return result; +} + + +/* + * ConnCreate -- create a local connection data structure + * + * Returns NULL on failure, other than out-of-memory which is fatal. + */ +static Port * +ConnCreate(int serverFd) +{ + Port *port; + + if (!(port = (Port *) calloc(1, sizeof(Port)))) + { + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + ExitPostmaster(1); + } + + if (StreamConnection(serverFd, port) != STATUS_OK) + { + if (port->sock != PGINVALID_SOCKET) + StreamClose(port->sock); + ConnFree(port); + return NULL; + } + + return port; +} + + +/* + * ConnFree -- free a local connection data structure + * + * Caller has already closed the socket if any, so there's not much + * to do here. + */ +static void +ConnFree(Port *conn) +{ + free(conn); +} + + +/* + * ClosePostmasterPorts -- close all the postmaster's open sockets + * + * This is called during child process startup to release file descriptors + * that are not needed by that child process. The postmaster still has + * them open, of course. + * + * Note: we pass am_syslogger as a boolean because we don't want to set + * the global variable yet when this is called. + */ +void +ClosePostmasterPorts(bool am_syslogger) +{ + int i; + +#ifndef WIN32 + + /* + * Close the write end of postmaster death watch pipe. It's important to + * do this as early as possible, so that if postmaster dies, others won't + * think that it's still running because we're holding the pipe open. + */ + if (close(postmaster_alive_fds[POSTMASTER_FD_OWN]) != 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg_internal("could not close postmaster death monitoring pipe in child process: %m"))); + postmaster_alive_fds[POSTMASTER_FD_OWN] = -1; + /* Notify fd.c that we released one pipe FD. */ + ReleaseExternalFD(); +#endif + + /* + * Close the postmaster's listen sockets. These aren't tracked by fd.c, + * so we don't call ReleaseExternalFD() here. + */ + for (i = 0; i < MAXLISTEN; i++) + { + if (ListenSocket[i] != PGINVALID_SOCKET) + { + StreamClose(ListenSocket[i]); + ListenSocket[i] = PGINVALID_SOCKET; + } + } + + /* + * If using syslogger, close the read side of the pipe. We don't bother + * tracking this in fd.c, either. 
+ */ + if (!am_syslogger) + { +#ifndef WIN32 + if (syslogPipe[0] >= 0) + close(syslogPipe[0]); + syslogPipe[0] = -1; +#else + if (syslogPipe[0]) + CloseHandle(syslogPipe[0]); + syslogPipe[0] = 0; +#endif + } + +#ifdef USE_BONJOUR + /* If using Bonjour, close the connection to the mDNS daemon */ + if (bonjour_sdref) + close(DNSServiceRefSockFD(bonjour_sdref)); +#endif +} + + +/* + * InitProcessGlobals -- set MyProcPid, MyStartTime[stamp], random seeds + * + * Called early in the postmaster and every backend. + */ +void +InitProcessGlobals(void) +{ + unsigned int rseed; + + MyProcPid = getpid(); + MyStartTimestamp = GetCurrentTimestamp(); + MyStartTime = timestamptz_to_time_t(MyStartTimestamp); + + /* + * Set a different seed for random() in every process. We want something + * unpredictable, so if possible, use high-quality random bits for the + * seed. Otherwise, fall back to a seed based on timestamp and PID. + */ + if (!pg_strong_random(&rseed, sizeof(rseed))) + { + /* + * Since PIDs and timestamps tend to change more frequently in their + * least significant bits, shift the timestamp left to allow a larger + * total number of seeds in a given time period. Since that would + * leave only 20 bits of the timestamp that cycle every ~1 second, + * also mix in some higher bits. + */ + rseed = ((uint64) MyProcPid) ^ + ((uint64) MyStartTimestamp << 12) ^ + ((uint64) MyStartTimestamp >> 20); + } + srandom(rseed); +} + + +/* + * reset_shared -- reset shared memory and semaphores + */ +static void +reset_shared(void) +{ + /* + * Create or re-create shared memory and semaphores. + * + * Note: in each "cycle of life" we will normally assign the same IPC keys + * (if using SysV shmem and/or semas). This helps ensure that we will + * clean up dead IPC objects if the postmaster crashes and is restarted. + */ + CreateSharedMemoryAndSemaphores(); +} + + +/* + * SIGHUP -- reread config files, and tell children to do same + */ +static void +SIGHUP_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + /* + * We rely on the signal mechanism to have blocked all signals ... except + * on Windows, which lacks sigaction(), so we have to do it manually. 
+ */ +#ifdef WIN32 + PG_SETMASK(&BlockSig); +#endif + + if (Shutdown <= SmartShutdown) + { + ereport(LOG, + (errmsg("received SIGHUP, reloading configuration files"))); + ProcessConfigFile(PGC_SIGHUP); + SignalChildren(SIGHUP); + if (StartupPID != 0) + signal_child(StartupPID, SIGHUP); + if (BgWriterPID != 0) + signal_child(BgWriterPID, SIGHUP); + if (CheckpointerPID != 0) + signal_child(CheckpointerPID, SIGHUP); + if (WalWriterPID != 0) + signal_child(WalWriterPID, SIGHUP); + if (WalReceiverPID != 0) + signal_child(WalReceiverPID, SIGHUP); + if (AutoVacPID != 0) + signal_child(AutoVacPID, SIGHUP); + if (PgArchPID != 0) + signal_child(PgArchPID, SIGHUP); + if (SysLoggerPID != 0) + signal_child(SysLoggerPID, SIGHUP); + if (PgStatPID != 0) + signal_child(PgStatPID, SIGHUP); + + /* Reload authentication config files too */ + if (!load_hba()) + ereport(LOG, + /* translator: %s is a configuration file */ + (errmsg("%s was not reloaded", "pg_hba.conf"))); + + if (!load_ident()) + ereport(LOG, + (errmsg("%s was not reloaded", "pg_ident.conf"))); + +#ifdef USE_SSL + /* Reload SSL configuration as well */ + if (EnableSSL) + { + if (secure_initialize(false) == 0) + LoadedSSL = true; + else + ereport(LOG, + (errmsg("SSL configuration was not reloaded"))); + } + else + { + secure_destroy(); + LoadedSSL = false; + } +#endif + +#ifdef EXEC_BACKEND + /* Update the starting-point file for future children */ + write_nondefault_variables(PGC_SIGHUP); +#endif + } + +#ifdef WIN32 + PG_SETMASK(&UnBlockSig); +#endif + + errno = save_errno; +} + + +/* + * pmdie -- signal handler for processing various postmaster signals. + */ +static void +pmdie(SIGNAL_ARGS) +{ + int save_errno = errno; + + /* + * We rely on the signal mechanism to have blocked all signals ... except + * on Windows, which lacks sigaction(), so we have to do it manually. + */ +#ifdef WIN32 + PG_SETMASK(&BlockSig); +#endif + + ereport(DEBUG2, + (errmsg_internal("postmaster received signal %d", + postgres_signal_arg))); + + switch (postgres_signal_arg) + { + case SIGTERM: + + /* + * Smart Shutdown: + * + * Wait for children to end their work, then shut down. + */ + if (Shutdown >= SmartShutdown) + break; + Shutdown = SmartShutdown; + ereport(LOG, + (errmsg("received smart shutdown request"))); + + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STOPPING); +#ifdef USE_SYSTEMD + sd_notify(0, "STOPPING=1"); +#endif + + /* + * If we reached normal running, we have to wait for any online + * backup mode to end; otherwise go straight to waiting for client + * backends to exit. (The difference is that in the former state, + * we'll still let in new superuser clients, so that somebody can + * end the online backup mode.) If already in PM_STOP_BACKENDS or + * a later state, do not change it. + */ + if (pmState == PM_RUN) + connsAllowed = ALLOW_SUPERUSER_CONNS; + else if (pmState == PM_HOT_STANDBY) + connsAllowed = ALLOW_NO_CONNS; + else if (pmState == PM_STARTUP || pmState == PM_RECOVERY) + { + /* There should be no clients, so proceed to stop children */ + pmState = PM_STOP_BACKENDS; + } + + /* + * Now wait for online backup mode to end and backends to exit. If + * that is already the case, PostmasterStateMachine will take the + * next step. + */ + PostmasterStateMachine(); + break; + + case SIGINT: + + /* + * Fast Shutdown: + * + * Abort all children with SIGTERM (rollback active transactions + * and exit) and shut down when they are gone. 
+ */ + if (Shutdown >= FastShutdown) + break; + Shutdown = FastShutdown; + ereport(LOG, + (errmsg("received fast shutdown request"))); + + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STOPPING); +#ifdef USE_SYSTEMD + sd_notify(0, "STOPPING=1"); +#endif + + if (pmState == PM_STARTUP || pmState == PM_RECOVERY) + { + /* Just shut down background processes silently */ + pmState = PM_STOP_BACKENDS; + } + else if (pmState == PM_RUN || + pmState == PM_HOT_STANDBY) + { + /* Report that we're about to zap live client sessions */ + ereport(LOG, + (errmsg("aborting any active transactions"))); + pmState = PM_STOP_BACKENDS; + } + + /* + * PostmasterStateMachine will issue any necessary signals, or + * take the next step if no child processes need to be killed. + */ + PostmasterStateMachine(); + break; + + case SIGQUIT: + + /* + * Immediate Shutdown: + * + * abort all children with SIGQUIT, wait for them to exit, + * terminate remaining ones with SIGKILL, then exit without + * attempt to properly shut down the data base system. + */ + if (Shutdown >= ImmediateShutdown) + break; + Shutdown = ImmediateShutdown; + ereport(LOG, + (errmsg("received immediate shutdown request"))); + + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STOPPING); +#ifdef USE_SYSTEMD + sd_notify(0, "STOPPING=1"); +#endif + + /* tell children to shut down ASAP */ + SetQuitSignalReason(PMQUIT_FOR_STOP); + TerminateChildren(SIGQUIT); + pmState = PM_WAIT_BACKENDS; + + /* set stopwatch for them to die */ + AbortStartTime = time(NULL); + + /* + * Now wait for backends to exit. If there are none, + * PostmasterStateMachine will take the next step. + */ + PostmasterStateMachine(); + break; + } + +#ifdef WIN32 + PG_SETMASK(&UnBlockSig); +#endif + + errno = save_errno; +} + +/* + * Reaper -- signal handler to cleanup after a child process dies. + */ +static void +reaper(SIGNAL_ARGS) +{ + int save_errno = errno; + int pid; /* process id of dead child process */ + int exitstatus; /* its exit status */ + + /* + * We rely on the signal mechanism to have blocked all signals ... except + * on Windows, which lacks sigaction(), so we have to do it manually. + */ +#ifdef WIN32 + PG_SETMASK(&BlockSig); +#endif + + ereport(DEBUG4, + (errmsg_internal("reaping dead processes"))); + + while ((pid = waitpid(-1, &exitstatus, WNOHANG)) > 0) + { + /* + * Check if this child was a startup process. + */ + if (pid == StartupPID) + { + StartupPID = 0; + + /* + * Startup process exited in response to a shutdown request (or it + * completed normally regardless of the shutdown request). + */ + if (Shutdown > NoShutdown && + (EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus))) + { + StartupStatus = STARTUP_NOT_RUNNING; + pmState = PM_WAIT_BACKENDS; + /* PostmasterStateMachine logic does the rest */ + continue; + } + + if (EXIT_STATUS_3(exitstatus)) + { + ereport(LOG, + (errmsg("shutdown at recovery target"))); + StartupStatus = STARTUP_NOT_RUNNING; + Shutdown = Max(Shutdown, SmartShutdown); + TerminateChildren(SIGTERM); + pmState = PM_WAIT_BACKENDS; + /* PostmasterStateMachine logic does the rest */ + continue; + } + + /* + * Unexpected exit of startup process (including FATAL exit) + * during PM_STARTUP is treated as catastrophic. There are no + * other processes running yet, so we can just exit. 
+ */ + if (pmState == PM_STARTUP && + StartupStatus != STARTUP_SIGNALED && + !EXIT_STATUS_0(exitstatus)) + { + LogChildExit(LOG, _("startup process"), + pid, exitstatus); + ereport(LOG, + (errmsg("aborting startup due to startup process failure"))); + ExitPostmaster(1); + } + + /* + * After PM_STARTUP, any unexpected exit (including FATAL exit) of + * the startup process is catastrophic, so kill other children, + * and set StartupStatus so we don't try to reinitialize after + * they're gone. Exception: if StartupStatus is STARTUP_SIGNALED, + * then we previously sent the startup process a SIGQUIT; so + * that's probably the reason it died, and we do want to try to + * restart in that case. + * + * This stanza also handles the case where we sent a SIGQUIT + * during PM_STARTUP due to some dead_end child crashing: in that + * situation, if the startup process dies on the SIGQUIT, we need + * to transition to PM_WAIT_BACKENDS state which will allow + * PostmasterStateMachine to restart the startup process. (On the + * other hand, the startup process might complete normally, if we + * were too late with the SIGQUIT. In that case we'll fall + * through and commence normal operations.) + */ + if (!EXIT_STATUS_0(exitstatus)) + { + if (StartupStatus == STARTUP_SIGNALED) + { + StartupStatus = STARTUP_NOT_RUNNING; + if (pmState == PM_STARTUP) + pmState = PM_WAIT_BACKENDS; + } + else + StartupStatus = STARTUP_CRASHED; + HandleChildCrash(pid, exitstatus, + _("startup process")); + continue; + } + + /* + * Startup succeeded, commence normal operations + */ + StartupStatus = STARTUP_NOT_RUNNING; + FatalError = false; + AbortStartTime = 0; + ReachedNormalRunning = true; + pmState = PM_RUN; + connsAllowed = ALLOW_ALL_CONNS; + + /* + * Crank up the background tasks, if we didn't do that already + * when we entered consistent recovery state. It doesn't matter + * if this fails, we'll just try again later. + */ + if (CheckpointerPID == 0) + CheckpointerPID = StartCheckpointer(); + if (BgWriterPID == 0) + BgWriterPID = StartBackgroundWriter(); + if (WalWriterPID == 0) + WalWriterPID = StartWalWriter(); + + /* + * Likewise, start other special children as needed. In a restart + * situation, some of them may be alive already. + */ + if (!IsBinaryUpgrade && AutoVacuumingActive() && AutoVacPID == 0) + AutoVacPID = StartAutoVacLauncher(); + if (PgArchStartupAllowed() && PgArchPID == 0) + PgArchPID = StartArchiver(); + if (PgStatPID == 0) + PgStatPID = pgstat_start(); + + /* workers may be scheduled to start now */ + maybe_start_bgworkers(); + + /* at this point we are really open for business */ + ereport(LOG, + (errmsg("database system is ready to accept connections"))); + + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_READY); +#ifdef USE_SYSTEMD + sd_notify(0, "READY=1"); +#endif + + continue; + } + + /* + * Was it the bgwriter? Normal exit can be ignored; we'll start a new + * one at the next iteration of the postmaster's main loop, if + * necessary. Any other exit condition is treated as a crash. + */ + if (pid == BgWriterPID) + { + BgWriterPID = 0; + if (!EXIT_STATUS_0(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("background writer process")); + continue; + } + + /* + * Was it the checkpointer? + */ + if (pid == CheckpointerPID) + { + CheckpointerPID = 0; + if (EXIT_STATUS_0(exitstatus) && pmState == PM_SHUTDOWN) + { + /* + * OK, we saw normal exit of the checkpointer after it's been + * told to shut down. We expect that it wrote a shutdown + * checkpoint. 
(If for some reason it didn't, recovery will + * occur on next postmaster start.) + * + * At this point we should have no normal backend children + * left (else we'd not be in PM_SHUTDOWN state) but we might + * have dead_end children to wait for. + * + * If we have an archiver subprocess, tell it to do a last + * archive cycle and quit. Likewise, if we have walsender + * processes, tell them to send any remaining WAL and quit. + */ + Assert(Shutdown > NoShutdown); + + /* Waken archiver for the last time */ + if (PgArchPID != 0) + signal_child(PgArchPID, SIGUSR2); + + /* + * Waken walsenders for the last time. No regular backends + * should be around anymore. + */ + SignalChildren(SIGUSR2); + + pmState = PM_SHUTDOWN_2; + + /* + * We can also shut down the stats collector now; there's + * nothing left for it to do. + */ + if (PgStatPID != 0) + signal_child(PgStatPID, SIGQUIT); + } + else + { + /* + * Any unexpected exit of the checkpointer (including FATAL + * exit) is treated as a crash. + */ + HandleChildCrash(pid, exitstatus, + _("checkpointer process")); + } + + continue; + } + + /* + * Was it the wal writer? Normal exit can be ignored; we'll start a + * new one at the next iteration of the postmaster's main loop, if + * necessary. Any other exit condition is treated as a crash. + */ + if (pid == WalWriterPID) + { + WalWriterPID = 0; + if (!EXIT_STATUS_0(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("WAL writer process")); + continue; + } + + /* + * Was it the wal receiver? If exit status is zero (normal) or one + * (FATAL exit), we assume everything is all right just like normal + * backends. (If we need a new wal receiver, we'll start one at the + * next iteration of the postmaster's main loop.) + */ + if (pid == WalReceiverPID) + { + WalReceiverPID = 0; + if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("WAL receiver process")); + continue; + } + + /* + * Was it the autovacuum launcher? Normal exit can be ignored; we'll + * start a new one at the next iteration of the postmaster's main + * loop, if necessary. Any other exit condition is treated as a + * crash. + */ + if (pid == AutoVacPID) + { + AutoVacPID = 0; + if (!EXIT_STATUS_0(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("autovacuum launcher process")); + continue; + } + + /* + * Was it the archiver? If exit status is zero (normal) or one (FATAL + * exit), we assume everything is all right just like normal backends + * and just try to restart a new one so that we immediately retry + * archiving remaining files. (If fail, we'll try again in future + * cycles of the postmaster's main loop.) Unless we were waiting for + * it to shut down; don't restart it in that case, and + * PostmasterStateMachine() will advance to the next shutdown step. + */ + if (pid == PgArchPID) + { + PgArchPID = 0; + if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("archiver process")); + if (PgArchStartupAllowed()) + PgArchPID = StartArchiver(); + continue; + } + + /* + * Was it the statistics collector? If so, just try to start a new + * one; no need to force reset of the rest of the system. (If fail, + * we'll try again in future cycles of the main loop.) 
+ */ + if (pid == PgStatPID) + { + PgStatPID = 0; + if (!EXIT_STATUS_0(exitstatus)) + LogChildExit(LOG, _("statistics collector process"), + pid, exitstatus); + if (pmState == PM_RUN || pmState == PM_HOT_STANDBY) + PgStatPID = pgstat_start(); + continue; + } + + /* Was it the system logger? If so, try to start a new one */ + if (pid == SysLoggerPID) + { + SysLoggerPID = 0; + /* for safety's sake, launch new logger *first* */ + SysLoggerPID = SysLogger_Start(); + if (!EXIT_STATUS_0(exitstatus)) + LogChildExit(LOG, _("system logger process"), + pid, exitstatus); + continue; + } + + /* Was it one of our background workers? */ + if (CleanupBackgroundWorker(pid, exitstatus)) + { + /* have it be restarted */ + HaveCrashedWorker = true; + continue; + } + + /* + * Else do standard backend child cleanup. + */ + CleanupBackend(pid, exitstatus); + } /* loop over pending child-death reports */ + + /* + * After cleaning out the SIGCHLD queue, see if we have any state changes + * or actions to make. + */ + PostmasterStateMachine(); + + /* Done with signal handler */ +#ifdef WIN32 + PG_SETMASK(&UnBlockSig); +#endif + + errno = save_errno; +} + +/* + * Scan the bgworkers list and see if the given PID (which has just stopped + * or crashed) is in it. Handle its shutdown if so, and return true. If not a + * bgworker, return false. + * + * This is heavily based on CleanupBackend. One important difference is that + * we don't know yet that the dying process is a bgworker, so we must be silent + * until we're sure it is. + */ +static bool +CleanupBackgroundWorker(int pid, + int exitstatus) /* child's exit status */ +{ + char namebuf[MAXPGPATH]; + slist_mutable_iter iter; + + slist_foreach_modify(iter, &BackgroundWorkerList) + { + RegisteredBgWorker *rw; + + rw = slist_container(RegisteredBgWorker, rw_lnode, iter.cur); + + if (rw->rw_pid != pid) + continue; + +#ifdef WIN32 + /* see CleanupBackend */ + if (exitstatus == ERROR_WAIT_NO_CHILDREN) + exitstatus = 0; +#endif + + snprintf(namebuf, MAXPGPATH, _("background worker \"%s\""), + rw->rw_worker.bgw_type); + + + if (!EXIT_STATUS_0(exitstatus)) + { + /* Record timestamp, so we know when to restart the worker. */ + rw->rw_crashed_at = GetCurrentTimestamp(); + } + else + { + /* Zero exit status means terminate */ + rw->rw_crashed_at = 0; + rw->rw_terminate = true; + } + + /* + * Additionally, for shared-memory-connected workers, just like a + * backend, any exit status other than 0 or 1 is considered a crash + * and causes a system-wide restart. + */ + if ((rw->rw_worker.bgw_flags & BGWORKER_SHMEM_ACCESS) != 0) + { + if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus)) + { + HandleChildCrash(pid, exitstatus, namebuf); + return true; + } + } + + /* + * We must release the postmaster child slot whether this worker is + * connected to shared memory or not, but we only treat it as a crash + * if it is in fact connected. + */ + if (!ReleasePostmasterChildSlot(rw->rw_child_slot) && + (rw->rw_worker.bgw_flags & BGWORKER_SHMEM_ACCESS) != 0) + { + HandleChildCrash(pid, exitstatus, namebuf); + return true; + } + + /* Get it out of the BackendList and clear out remaining data */ + dlist_delete(&rw->rw_backend->elem); +#ifdef EXEC_BACKEND + ShmemBackendArrayRemove(rw->rw_backend); +#endif + + /* + * It's possible that this background worker started some OTHER + * background worker and asked to be notified when that worker started + * or stopped. If so, cancel any notifications destined for the + * now-dead backend. 
+ */ + if (rw->rw_backend->bgworker_notify) + BackgroundWorkerStopNotifications(rw->rw_pid); + free(rw->rw_backend); + rw->rw_backend = NULL; + rw->rw_pid = 0; + rw->rw_child_slot = 0; + ReportBackgroundWorkerExit(&iter); /* report child death */ + + LogChildExit(EXIT_STATUS_0(exitstatus) ? DEBUG1 : LOG, + namebuf, pid, exitstatus); + + return true; + } + + return false; +} + +/* + * CleanupBackend -- cleanup after terminated backend. + * + * Remove all local state associated with backend. + * + * If you change this, see also CleanupBackgroundWorker. + */ +static void +CleanupBackend(int pid, + int exitstatus) /* child's exit status. */ +{ + dlist_mutable_iter iter; + + LogChildExit(DEBUG2, _("server process"), pid, exitstatus); + + /* + * If a backend dies in an ugly way then we must signal all other backends + * to quickdie. If exit status is zero (normal) or one (FATAL exit), we + * assume everything is all right and proceed to remove the backend from + * the active backend list. + */ + +#ifdef WIN32 + + /* + * On win32, also treat ERROR_WAIT_NO_CHILDREN (128) as nonfatal case, + * since that sometimes happens under load when the process fails to start + * properly (long before it starts using shared memory). Microsoft reports + * it is related to mutex failure: + * http://archives.postgresql.org/pgsql-hackers/2010-09/msg00790.php + */ + if (exitstatus == ERROR_WAIT_NO_CHILDREN) + { + LogChildExit(LOG, _("server process"), pid, exitstatus); + exitstatus = 0; + } +#endif + + if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus)) + { + HandleChildCrash(pid, exitstatus, _("server process")); + return; + } + + dlist_foreach_modify(iter, &BackendList) + { + Backend *bp = dlist_container(Backend, elem, iter.cur); + + if (bp->pid == pid) + { + if (!bp->dead_end) + { + if (!ReleasePostmasterChildSlot(bp->child_slot)) + { + /* + * Uh-oh, the child failed to clean itself up. Treat as a + * crash after all. + */ + HandleChildCrash(pid, exitstatus, _("server process")); + return; + } +#ifdef EXEC_BACKEND + ShmemBackendArrayRemove(bp); +#endif + } + if (bp->bgworker_notify) + { + /* + * This backend may have been slated to receive SIGUSR1 when + * some background worker started or stopped. Cancel those + * notifications, as we don't want to signal PIDs that are not + * PostgreSQL backends. This gets skipped in the (probably + * very common) case where the backend has never requested any + * such notifications. + */ + BackgroundWorkerStopNotifications(bp->pid); + } + dlist_delete(iter.cur); + free(bp); + break; + } + } +} + +/* + * HandleChildCrash -- cleanup after failed backend, bgwriter, checkpointer, + * walwriter, autovacuum, archiver or background worker. + * + * The objectives here are to clean up our local state about the child + * process, and to signal all other remaining children to quickdie. + */ +static void +HandleChildCrash(int pid, int exitstatus, const char *procname) +{ + dlist_mutable_iter iter; + slist_iter siter; + Backend *bp; + bool take_action; + + /* + * We only log messages and send signals if this is the first process + * crash and we're not doing an immediate shutdown; otherwise, we're only + * here to update postmaster's idea of live processes. If we have already + * signaled children, nonzero exit status is to be expected, so don't + * clutter log. 
+ */ + take_action = !FatalError && Shutdown != ImmediateShutdown; + + if (take_action) + { + LogChildExit(LOG, procname, pid, exitstatus); + ereport(LOG, + (errmsg("terminating any other active server processes"))); + SetQuitSignalReason(PMQUIT_FOR_CRASH); + } + + /* Process background workers. */ + slist_foreach(siter, &BackgroundWorkerList) + { + RegisteredBgWorker *rw; + + rw = slist_container(RegisteredBgWorker, rw_lnode, siter.cur); + if (rw->rw_pid == 0) + continue; /* not running */ + if (rw->rw_pid == pid) + { + /* + * Found entry for freshly-dead worker, so remove it. + */ + (void) ReleasePostmasterChildSlot(rw->rw_child_slot); + dlist_delete(&rw->rw_backend->elem); +#ifdef EXEC_BACKEND + ShmemBackendArrayRemove(rw->rw_backend); +#endif + free(rw->rw_backend); + rw->rw_backend = NULL; + rw->rw_pid = 0; + rw->rw_child_slot = 0; + /* don't reset crashed_at */ + /* don't report child stop, either */ + /* Keep looping so we can signal remaining workers */ + } + else + { + /* + * This worker is still alive. Unless we did so already, tell it + * to commit hara-kiri. + * + * SIGQUIT is the special signal that says exit without proc_exit + * and let the user know what's going on. But if SendStop is set + * (-T on command line), then we send SIGSTOP instead, so that we + * can get core dumps from all backends by hand. + */ + if (take_action) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) rw->rw_pid))); + signal_child(rw->rw_pid, (SendStop ? SIGSTOP : SIGQUIT)); + } + } + } + + /* Process regular backends */ + dlist_foreach_modify(iter, &BackendList) + { + bp = dlist_container(Backend, elem, iter.cur); + + if (bp->pid == pid) + { + /* + * Found entry for freshly-dead backend, so remove it. + */ + if (!bp->dead_end) + { + (void) ReleasePostmasterChildSlot(bp->child_slot); +#ifdef EXEC_BACKEND + ShmemBackendArrayRemove(bp); +#endif + } + dlist_delete(iter.cur); + free(bp); + /* Keep looping so we can signal remaining backends */ + } + else + { + /* + * This backend is still alive. Unless we did so already, tell it + * to commit hara-kiri. + * + * SIGQUIT is the special signal that says exit without proc_exit + * and let the user know what's going on. But if SendStop is set + * (-T on command line), then we send SIGSTOP instead, so that we + * can get core dumps from all backends by hand. + * + * We could exclude dead_end children here, but at least in the + * SIGSTOP case it seems better to include them. + * + * Background workers were already processed above; ignore them + * here. + */ + if (bp->bkend_type == BACKEND_TYPE_BGWORKER) + continue; + + if (take_action) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) bp->pid))); + signal_child(bp->pid, (SendStop ? SIGSTOP : SIGQUIT)); + } + } + } + + /* Take care of the startup process too */ + if (pid == StartupPID) + { + StartupPID = 0; + /* Caller adjusts StartupStatus, so don't touch it here */ + } + else if (StartupPID != 0 && take_action) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) StartupPID))); + signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT)); + StartupStatus = STARTUP_SIGNALED; + } + + /* Take care of the bgwriter too */ + if (pid == BgWriterPID) + BgWriterPID = 0; + else if (BgWriterPID != 0 && take_action) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? 
"SIGSTOP" : "SIGQUIT"), + (int) BgWriterPID))); + signal_child(BgWriterPID, (SendStop ? SIGSTOP : SIGQUIT)); + } + + /* Take care of the checkpointer too */ + if (pid == CheckpointerPID) + CheckpointerPID = 0; + else if (CheckpointerPID != 0 && take_action) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) CheckpointerPID))); + signal_child(CheckpointerPID, (SendStop ? SIGSTOP : SIGQUIT)); + } + + /* Take care of the walwriter too */ + if (pid == WalWriterPID) + WalWriterPID = 0; + else if (WalWriterPID != 0 && take_action) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) WalWriterPID))); + signal_child(WalWriterPID, (SendStop ? SIGSTOP : SIGQUIT)); + } + + /* Take care of the walreceiver too */ + if (pid == WalReceiverPID) + WalReceiverPID = 0; + else if (WalReceiverPID != 0 && take_action) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) WalReceiverPID))); + signal_child(WalReceiverPID, (SendStop ? SIGSTOP : SIGQUIT)); + } + + /* Take care of the autovacuum launcher too */ + if (pid == AutoVacPID) + AutoVacPID = 0; + else if (AutoVacPID != 0 && take_action) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) AutoVacPID))); + signal_child(AutoVacPID, (SendStop ? SIGSTOP : SIGQUIT)); + } + + /* Take care of the archiver too */ + if (pid == PgArchPID) + PgArchPID = 0; + else if (PgArchPID != 0 && take_action) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) PgArchPID))); + signal_child(PgArchPID, (SendStop ? SIGSTOP : SIGQUIT)); + } + + /* + * Force a power-cycle of the pgstat process too. (This isn't absolutely + * necessary, but it seems like a good idea for robustness, and it + * simplifies the state-machine logic in the case where a shutdown request + * arrives during crash processing.) + */ + if (PgStatPID != 0 && take_action) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + "SIGQUIT", + (int) PgStatPID))); + signal_child(PgStatPID, SIGQUIT); + allow_immediate_pgstat_restart(); + } + + /* We do NOT restart the syslogger */ + + if (Shutdown != ImmediateShutdown) + FatalError = true; + + /* We now transit into a state of waiting for children to die */ + if (pmState == PM_RECOVERY || + pmState == PM_HOT_STANDBY || + pmState == PM_RUN || + pmState == PM_STOP_BACKENDS || + pmState == PM_SHUTDOWN) + pmState = PM_WAIT_BACKENDS; + + /* + * .. and if this doesn't happen quickly enough, now the clock is ticking + * for us to kill them without mercy. + */ + if (AbortStartTime == 0) + AbortStartTime = time(NULL); +} + +/* + * Log the death of a child process. + */ +static void +LogChildExit(int lev, const char *procname, int pid, int exitstatus) +{ + /* + * size of activity_buffer is arbitrary, but set equal to default + * track_activity_query_size + */ + char activity_buffer[1024]; + const char *activity = NULL; + + if (!EXIT_STATUS_0(exitstatus)) + activity = pgstat_get_crashed_backend_activity(pid, + activity_buffer, + sizeof(activity_buffer)); + + if (WIFEXITED(exitstatus)) + ereport(lev, + + /*------ + translator: %s is a noun phrase describing a child process, such as + "server process" */ + (errmsg("%s (PID %d) exited with exit code %d", + procname, pid, WEXITSTATUS(exitstatus)), + activity ? 
errdetail("Failed process was running: %s", activity) : 0)); + else if (WIFSIGNALED(exitstatus)) + { +#if defined(WIN32) + ereport(lev, + + /*------ + translator: %s is a noun phrase describing a child process, such as + "server process" */ + (errmsg("%s (PID %d) was terminated by exception 0x%X", + procname, pid, WTERMSIG(exitstatus)), + errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value."), + activity ? errdetail("Failed process was running: %s", activity) : 0)); +#else + ereport(lev, + + /*------ + translator: %s is a noun phrase describing a child process, such as + "server process" */ + (errmsg("%s (PID %d) was terminated by signal %d: %s", + procname, pid, WTERMSIG(exitstatus), + pg_strsignal(WTERMSIG(exitstatus))), + activity ? errdetail("Failed process was running: %s", activity) : 0)); +#endif + } + else + ereport(lev, + + /*------ + translator: %s is a noun phrase describing a child process, such as + "server process" */ + (errmsg("%s (PID %d) exited with unrecognized status %d", + procname, pid, exitstatus), + activity ? errdetail("Failed process was running: %s", activity) : 0)); +} + +/* + * Advance the postmaster's state machine and take actions as appropriate + * + * This is common code for pmdie(), reaper() and sigusr1_handler(), which + * receive the signals that might mean we need to change state. + */ +static void +PostmasterStateMachine(void) +{ + /* If we're doing a smart shutdown, try to advance that state. */ + if (pmState == PM_RUN || pmState == PM_HOT_STANDBY) + { + if (connsAllowed == ALLOW_SUPERUSER_CONNS) + { + /* + * ALLOW_SUPERUSER_CONNS state ends as soon as online backup mode + * is not active. + */ + if (!BackupInProgress()) + connsAllowed = ALLOW_NO_CONNS; + } + + if (connsAllowed == ALLOW_NO_CONNS) + { + /* + * ALLOW_NO_CONNS state ends when we have no normal client + * backends running. Then we're ready to stop other children. + */ + if (CountChildren(BACKEND_TYPE_NORMAL) == 0) + pmState = PM_STOP_BACKENDS; + } + } + + /* + * If we're ready to do so, signal child processes to shut down. (This + * isn't a persistent state, but treating it as a distinct pmState allows + * us to share this code across multiple shutdown code paths.) + */ + if (pmState == PM_STOP_BACKENDS) + { + /* + * Forget any pending requests for background workers, since we're no + * longer willing to launch any new workers. (If additional requests + * arrive, BackgroundWorkerStateChange will reject them.) + */ + ForgetUnstartedBackgroundWorkers(); + + /* Signal all backend children except walsenders */ + SignalSomeChildren(SIGTERM, + BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND); + /* and the autovac launcher too */ + if (AutoVacPID != 0) + signal_child(AutoVacPID, SIGTERM); + /* and the bgwriter too */ + if (BgWriterPID != 0) + signal_child(BgWriterPID, SIGTERM); + /* and the walwriter too */ + if (WalWriterPID != 0) + signal_child(WalWriterPID, SIGTERM); + /* If we're in recovery, also stop startup and walreceiver procs */ + if (StartupPID != 0) + signal_child(StartupPID, SIGTERM); + if (WalReceiverPID != 0) + signal_child(WalReceiverPID, SIGTERM); + /* checkpointer, archiver, stats, and syslogger may continue for now */ + + /* Now transition to PM_WAIT_BACKENDS state to wait for them to die */ + pmState = PM_WAIT_BACKENDS; + } + + /* + * If we are in a state-machine state that implies waiting for backends to + * exit, see if they're all gone, and change state if so. 
+ */ + if (pmState == PM_WAIT_BACKENDS) + { + /* + * PM_WAIT_BACKENDS state ends when we have no regular backends + * (including autovac workers), no bgworkers (including unconnected + * ones), and no walwriter, autovac launcher or bgwriter. If we are + * doing crash recovery or an immediate shutdown then we expect the + * checkpointer to exit as well, otherwise not. The stats and + * syslogger processes are disregarded since they are not connected to + * shared memory; we also disregard dead_end children here. Walsenders + * and archiver are also disregarded, they will be terminated later + * after writing the checkpoint record. + */ + if (CountChildren(BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND) == 0 && + StartupPID == 0 && + WalReceiverPID == 0 && + BgWriterPID == 0 && + (CheckpointerPID == 0 || + (!FatalError && Shutdown < ImmediateShutdown)) && + WalWriterPID == 0 && + AutoVacPID == 0) + { + if (Shutdown >= ImmediateShutdown || FatalError) + { + /* + * Start waiting for dead_end children to die. This state + * change causes ServerLoop to stop creating new ones. + */ + pmState = PM_WAIT_DEAD_END; + + /* + * We already SIGQUIT'd the archiver and stats processes, if + * any, when we started immediate shutdown or entered + * FatalError state. + */ + } + else + { + /* + * If we get here, we are proceeding with normal shutdown. All + * the regular children are gone, and it's time to tell the + * checkpointer to do a shutdown checkpoint. + */ + Assert(Shutdown > NoShutdown); + /* Start the checkpointer if not running */ + if (CheckpointerPID == 0) + CheckpointerPID = StartCheckpointer(); + /* And tell it to shut down */ + if (CheckpointerPID != 0) + { + signal_child(CheckpointerPID, SIGUSR2); + pmState = PM_SHUTDOWN; + } + else + { + /* + * If we failed to fork a checkpointer, just shut down. + * Any required cleanup will happen at next restart. We + * set FatalError so that an "abnormal shutdown" message + * gets logged when we exit. + */ + FatalError = true; + pmState = PM_WAIT_DEAD_END; + + /* Kill the walsenders, archiver and stats collector too */ + SignalChildren(SIGQUIT); + if (PgArchPID != 0) + signal_child(PgArchPID, SIGQUIT); + if (PgStatPID != 0) + signal_child(PgStatPID, SIGQUIT); + } + } + } + } + + if (pmState == PM_SHUTDOWN_2) + { + /* + * PM_SHUTDOWN_2 state ends when there's no other children than + * dead_end children left. There shouldn't be any regular backends + * left by now anyway; what we're really waiting for is walsenders and + * archiver. + */ + if (PgArchPID == 0 && CountChildren(BACKEND_TYPE_ALL) == 0) + { + pmState = PM_WAIT_DEAD_END; + } + } + + if (pmState == PM_WAIT_DEAD_END) + { + /* + * PM_WAIT_DEAD_END state ends when the BackendList is entirely empty + * (ie, no dead_end children remain), and the archiver and stats + * collector are gone too. + * + * The reason we wait for those two is to protect them against a new + * postmaster starting conflicting subprocesses; this isn't an + * ironclad protection, but it at least helps in the + * shutdown-and-immediately-restart scenario. Note that they have + * already been sent appropriate shutdown signals, either during a + * normal state transition leading up to PM_WAIT_DEAD_END, or during + * FatalError processing. 
+ */ + if (dlist_is_empty(&BackendList) && + PgArchPID == 0 && PgStatPID == 0) + { + /* These other guys should be dead already */ + Assert(StartupPID == 0); + Assert(WalReceiverPID == 0); + Assert(BgWriterPID == 0); + Assert(CheckpointerPID == 0); + Assert(WalWriterPID == 0); + Assert(AutoVacPID == 0); + /* syslogger is not considered here */ + pmState = PM_NO_CHILDREN; + } + } + + /* + * If we've been told to shut down, we exit as soon as there are no + * remaining children. If there was a crash, cleanup will occur at the + * next startup. (Before PostgreSQL 8.3, we tried to recover from the + * crash before exiting, but that seems unwise if we are quitting because + * we got SIGTERM from init --- there may well not be time for recovery + * before init decides to SIGKILL us.) + * + * Note that the syslogger continues to run. It will exit when it sees + * EOF on its input pipe, which happens when there are no more upstream + * processes. + */ + if (Shutdown > NoShutdown && pmState == PM_NO_CHILDREN) + { + if (FatalError) + { + ereport(LOG, (errmsg("abnormal database system shutdown"))); + ExitPostmaster(1); + } + else + { + /* + * Terminate exclusive backup mode to avoid recovery after a clean + * fast shutdown. Since an exclusive backup can only be taken + * during normal running (and not, for example, while running + * under Hot Standby) it only makes sense to do this if we reached + * normal running. If we're still in recovery, the backup file is + * one we're recovering *from*, and we must keep it around so that + * recovery restarts from the right place. + */ + if (ReachedNormalRunning) + CancelBackup(); + + /* + * Normal exit from the postmaster is here. We don't need to log + * anything here, since the UnlinkLockFiles proc_exit callback + * will do so, and that should be the last user-visible action. + */ + ExitPostmaster(0); + } + } + + /* + * If the startup process failed, or the user does not want an automatic + * restart after backend crashes, wait for all non-syslogger children to + * exit, and then exit postmaster. We don't try to reinitialize when the + * startup process fails, because more than likely it will just fail again + * and we will keep trying forever. + */ + if (pmState == PM_NO_CHILDREN) + { + if (StartupStatus == STARTUP_CRASHED) + { + ereport(LOG, + (errmsg("shutting down due to startup process failure"))); + ExitPostmaster(1); + } + if (!restart_after_crash) + { + ereport(LOG, + (errmsg("shutting down because restart_after_crash is off"))); + ExitPostmaster(1); + } + } + + /* + * If we need to recover from a crash, wait for all non-syslogger children + * to exit, then reset shmem and StartupDataBase. + */ + if (FatalError && pmState == PM_NO_CHILDREN) + { + ereport(LOG, + (errmsg("all server processes terminated; reinitializing"))); + + /* remove leftover temporary files after a crash */ + if (remove_temp_files_after_crash) + RemovePgTempFiles(); + + /* allow background workers to immediately restart */ + ResetBackgroundWorkerCrashTimes(); + + shmem_exit(1); + + /* re-read control file into local memory */ + LocalProcessControlFile(true); + + reset_shared(); + + StartupPID = StartupDataBase(); + Assert(StartupPID != 0); + StartupStatus = STARTUP_RUNNING; + pmState = PM_STARTUP; + /* crash recovery started, reset SIGKILL flag */ + AbortStartTime = 0; + } +} + + +/* + * Send a signal to a postmaster child process + * + * On systems that have setsid(), each child process sets itself up as a + * process group leader. 
For signals that are generally interpreted in the + * appropriate fashion, we signal the entire process group not just the + * direct child process. This allows us to, for example, SIGQUIT a blocked + * archive_recovery script, or SIGINT a script being run by a backend via + * system(). + * + * There is a race condition for recently-forked children: they might not + * have executed setsid() yet. So we signal the child directly as well as + * the group. We assume such a child will handle the signal before trying + * to spawn any grandchild processes. We also assume that signaling the + * child twice will not cause any problems. + */ +static void +signal_child(pid_t pid, int signal) +{ + if (kill(pid, signal) < 0) + elog(DEBUG3, "kill(%ld,%d) failed: %m", (long) pid, signal); +#ifdef HAVE_SETSID + switch (signal) + { + case SIGINT: + case SIGTERM: + case SIGQUIT: + case SIGSTOP: + case SIGKILL: + if (kill(-pid, signal) < 0) + elog(DEBUG3, "kill(%ld,%d) failed: %m", (long) (-pid), signal); + break; + default: + break; + } +#endif +} + +/* + * Send a signal to the targeted children (but NOT special children; + * dead_end children are never signaled, either). + */ +static bool +SignalSomeChildren(int signal, int target) +{ + dlist_iter iter; + bool signaled = false; + + dlist_foreach(iter, &BackendList) + { + Backend *bp = dlist_container(Backend, elem, iter.cur); + + if (bp->dead_end) + continue; + + /* + * Since target == BACKEND_TYPE_ALL is the most common case, we test + * it first and avoid touching shared memory for every child. + */ + if (target != BACKEND_TYPE_ALL) + { + /* + * Assign bkend_type for any recently announced WAL Sender + * processes. + */ + if (bp->bkend_type == BACKEND_TYPE_NORMAL && + IsPostmasterChildWalSender(bp->child_slot)) + bp->bkend_type = BACKEND_TYPE_WALSND; + + if (!(target & bp->bkend_type)) + continue; + } + + ereport(DEBUG4, + (errmsg_internal("sending signal %d to process %d", + signal, (int) bp->pid))); + signal_child(bp->pid, signal); + signaled = true; + } + return signaled; +} + +/* + * Send a termination signal to children. This considers all of our children + * processes, except syslogger and dead_end backends. + */ +static void +TerminateChildren(int signal) +{ + SignalChildren(signal); + if (StartupPID != 0) + { + signal_child(StartupPID, signal); + if (signal == SIGQUIT || signal == SIGKILL) + StartupStatus = STARTUP_SIGNALED; + } + if (BgWriterPID != 0) + signal_child(BgWriterPID, signal); + if (CheckpointerPID != 0) + signal_child(CheckpointerPID, signal); + if (WalWriterPID != 0) + signal_child(WalWriterPID, signal); + if (WalReceiverPID != 0) + signal_child(WalReceiverPID, signal); + if (AutoVacPID != 0) + signal_child(AutoVacPID, signal); + if (PgArchPID != 0) + signal_child(PgArchPID, signal); + if (PgStatPID != 0) + signal_child(PgStatPID, signal); +} + +/* + * BackendStartup -- start backend process + * + * returns: STATUS_ERROR if the fork failed, STATUS_OK otherwise. + * + * Note: if you change this code, also consider StartAutovacuumWorker. + */ +static int +BackendStartup(Port *port) +{ + Backend *bn; /* for backend cleanup */ + pid_t pid; + + /* + * Create backend data structure. Better before the fork() so we can + * handle failure cleanly. + */ + bn = (Backend *) malloc(sizeof(Backend)); + if (!bn) + { + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + return STATUS_ERROR; + } + + /* + * Compute the cancel key that will be assigned to this backend. 
The + * backend will have its own copy in the forked-off process' value of + * MyCancelKey, so that it can transmit the key to the frontend. + */ + if (!RandomCancelKey(&MyCancelKey)) + { + free(bn); + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not generate random cancel key"))); + return STATUS_ERROR; + } + + bn->cancel_key = MyCancelKey; + + /* Pass down canAcceptConnections state */ + port->canAcceptConnections = canAcceptConnections(BACKEND_TYPE_NORMAL); + bn->dead_end = (port->canAcceptConnections != CAC_OK && + port->canAcceptConnections != CAC_SUPERUSER); + + /* + * Unless it's a dead_end child, assign it a child slot number + */ + if (!bn->dead_end) + bn->child_slot = MyPMChildSlot = AssignPostmasterChildSlot(); + else + bn->child_slot = 0; + + /* Hasn't asked to be notified about any bgworkers yet */ + bn->bgworker_notify = false; + +#ifdef EXEC_BACKEND + pid = backend_forkexec(port); +#else /* !EXEC_BACKEND */ + pid = fork_process(); + if (pid == 0) /* child */ + { + free(bn); + + /* Detangle from postmaster */ + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(false); + + /* Perform additional initialization and collect startup packet */ + BackendInitialize(port); + + /* And run the backend */ + BackendRun(port); + } +#endif /* EXEC_BACKEND */ + + if (pid < 0) + { + /* in parent, fork failed */ + int save_errno = errno; + + if (!bn->dead_end) + (void) ReleasePostmasterChildSlot(bn->child_slot); + free(bn); + errno = save_errno; + ereport(LOG, + (errmsg("could not fork new process for connection: %m"))); + report_fork_failure_to_client(port, save_errno); + return STATUS_ERROR; + } + + /* in parent, successful fork */ + ereport(DEBUG2, + (errmsg_internal("forked new backend, pid=%d socket=%d", + (int) pid, (int) port->sock))); + + /* + * Everything's been successful, it's safe to add this backend to our list + * of backends. + */ + bn->pid = pid; + bn->bkend_type = BACKEND_TYPE_NORMAL; /* Can change later to WALSND */ + dlist_push_head(&BackendList, &bn->elem); + +#ifdef EXEC_BACKEND + if (!bn->dead_end) + ShmemBackendArrayAdd(bn); +#endif + + return STATUS_OK; +} + +/* + * Try to report backend fork() failure to client before we close the + * connection. Since we do not care to risk blocking the postmaster on + * this connection, we set the connection to non-blocking and try only once. + * + * This is grungy special-purpose code; we cannot use backend libpq since + * it's not up and running. + */ +static void +report_fork_failure_to_client(Port *port, int errnum) +{ + char buffer[1000]; + int rc; + + /* Format the error message packet (always V2 protocol) */ + snprintf(buffer, sizeof(buffer), "E%s%s\n", + _("could not fork new process for connection: "), + strerror(errnum)); + + /* Set port to non-blocking. Don't do send() if this fails */ + if (!pg_set_noblock(port->sock)) + return; + + /* We'll retry after EINTR, but ignore all other failures */ + do + { + rc = send(port->sock, buffer, strlen(buffer) + 1, 0); + } while (rc < 0 && errno == EINTR); +} + + +/* + * BackendInitialize -- initialize an interactive (postmaster-child) + * backend process, and collect the client's startup packet. + * + * returns: nothing. Will not return at all if there's any failure. + * + * Note: this code does not depend on having any access to shared memory. + * Indeed, our approach to SIGTERM/timeout handling *requires* that + * shared memory not have been touched yet; see comments within. 
+ * In the EXEC_BACKEND case, we are physically attached to shared memory + * but have not yet set up most of our local pointers to shmem structures. + */ +static void +BackendInitialize(Port *port) +{ + int status; + int ret; + char remote_host[NI_MAXHOST]; + char remote_port[NI_MAXSERV]; + StringInfoData ps_data; + + /* Save port etc. for ps status */ + MyProcPort = port; + + /* Tell fd.c about the long-lived FD associated with the port */ + ReserveExternalFD(); + + /* + * PreAuthDelay is a debugging aid for investigating problems in the + * authentication cycle: it can be set in postgresql.conf to allow time to + * attach to the newly-forked backend with a debugger. (See also + * PostAuthDelay, which we allow clients to pass through PGOPTIONS, but it + * is not honored until after authentication.) + */ + if (PreAuthDelay > 0) + pg_usleep(PreAuthDelay * 1000000L); + + /* This flag will remain set until InitPostgres finishes authentication */ + ClientAuthInProgress = true; /* limit visibility of log messages */ + + /* set these to empty in case they are needed before we set them up */ + port->remote_host = ""; + port->remote_port = ""; + + /* + * Initialize libpq and enable reporting of ereport errors to the client. + * Must do this now because authentication uses libpq to send messages. + */ + pq_init(); /* initialize libpq to talk to client */ + whereToSendOutput = DestRemote; /* now safe to ereport to client */ + + /* + * We arrange to do _exit(1) if we receive SIGTERM or timeout while trying + * to collect the startup packet; while SIGQUIT results in _exit(2). + * Otherwise the postmaster cannot shutdown the database FAST or IMMED + * cleanly if a buggy client fails to send the packet promptly. + * + * Exiting with _exit(1) is only possible because we have not yet touched + * shared memory; therefore no outside-the-process state needs to get + * cleaned up. + */ + pqsignal(SIGTERM, process_startup_packet_die); + /* SIGQUIT handler was already set up by InitPostmasterChild */ + InitializeTimeouts(); /* establishes SIGALRM handler */ + PG_SETMASK(&StartupBlockSig); + + /* + * Get the remote host name and port for logging and status display. + */ + remote_host[0] = '\0'; + remote_port[0] = '\0'; + if ((ret = pg_getnameinfo_all(&port->raddr.addr, port->raddr.salen, + remote_host, sizeof(remote_host), + remote_port, sizeof(remote_port), + (log_hostname ? 0 : NI_NUMERICHOST) | NI_NUMERICSERV)) != 0) + ereport(WARNING, + (errmsg_internal("pg_getnameinfo_all() failed: %s", + gai_strerror(ret)))); + + /* + * Save remote_host and remote_port in port structure (after this, they + * will appear in log_line_prefix data for log messages). + */ + port->remote_host = strdup(remote_host); + port->remote_port = strdup(remote_port); + + /* And now we can issue the Log_connections message, if wanted */ + if (Log_connections) + { + if (remote_port[0]) + ereport(LOG, + (errmsg("connection received: host=%s port=%s", + remote_host, + remote_port))); + else + ereport(LOG, + (errmsg("connection received: host=%s", + remote_host))); + } + + /* + * If we did a reverse lookup to name, we might as well save the results + * rather than possibly repeating the lookup during authentication. + * + * Note that we don't want to specify NI_NAMEREQD above, because then we'd + * get nothing useful for a client without an rDNS entry. Therefore, we + * must check whether we got a numeric IPv4 or IPv6 address, and not save + * it into remote_hostname if so. 
(This test is conservative and might + * sometimes classify a hostname as numeric, but an error in that + * direction is safe; it only results in a possible extra lookup.) + */ + if (log_hostname && + ret == 0 && + strspn(remote_host, "0123456789.") < strlen(remote_host) && + strspn(remote_host, "0123456789ABCDEFabcdef:") < strlen(remote_host)) + port->remote_hostname = strdup(remote_host); + + /* + * Ready to begin client interaction. We will give up and _exit(1) after + * a time delay, so that a broken client can't hog a connection + * indefinitely. PreAuthDelay and any DNS interactions above don't count + * against the time limit. + * + * Note: AuthenticationTimeout is applied here while waiting for the + * startup packet, and then again in InitPostgres for the duration of any + * authentication operations. So a hostile client could tie up the + * process for nearly twice AuthenticationTimeout before we kick him off. + * + * Note: because PostgresMain will call InitializeTimeouts again, the + * registration of STARTUP_PACKET_TIMEOUT will be lost. This is okay + * since we never use it again after this function. + */ + RegisterTimeout(STARTUP_PACKET_TIMEOUT, StartupPacketTimeoutHandler); + enable_timeout_after(STARTUP_PACKET_TIMEOUT, AuthenticationTimeout * 1000); + + /* + * Receive the startup packet (which might turn out to be a cancel request + * packet). + */ + status = ProcessStartupPacket(port, false, false); + + /* + * Disable the timeout, and prevent SIGTERM again. + */ + disable_timeout(STARTUP_PACKET_TIMEOUT, false); + PG_SETMASK(&BlockSig); + + /* + * As a safety check that nothing in startup has yet performed + * shared-memory modifications that would need to be undone if we had + * exited through SIGTERM or timeout above, check that no on_shmem_exit + * handlers have been registered yet. (This isn't terribly bulletproof, + * since someone might misuse an on_proc_exit handler for shmem cleanup, + * but it's a cheap and helpful check. We cannot disallow on_proc_exit + * handlers unfortunately, since pq_init() already registered one.) + */ + check_on_shmem_exit_lists_are_empty(); + + /* + * Stop here if it was bad or a cancel packet. ProcessStartupPacket + * already did any appropriate error reporting. + */ + if (status != STATUS_OK) + proc_exit(0); + + /* + * Now that we have the user and database name, we can set the process + * title for ps. It's good to do this as early as possible in startup. + */ + initStringInfo(&ps_data); + if (am_walsender) + appendStringInfo(&ps_data, "%s ", GetBackendTypeDesc(B_WAL_SENDER)); + appendStringInfo(&ps_data, "%s ", port->user_name); + if (!am_walsender) + appendStringInfo(&ps_data, "%s ", port->database_name); + appendStringInfo(&ps_data, "%s", port->remote_host); + if (port->remote_port[0] != '\0') + appendStringInfo(&ps_data, "(%s)", port->remote_port); + + init_ps_display(ps_data.data); + pfree(ps_data.data); + + set_ps_display("initializing"); +} + + +/* + * BackendRun -- set up the backend's argument list and invoke PostgresMain() + * + * returns: + * Doesn't return at all. + */ +static void +BackendRun(Port *port) +{ + char *av[2]; + const int ac = 1; + + av[0] = "postgres"; + av[1] = NULL; + + /* + * Make sure we aren't in PostmasterContext anymore. (We can't delete it + * just yet, though, because InitPostgres will need the HBA data.) 
+ */ + MemoryContextSwitchTo(TopMemoryContext); + + PostgresMain(ac, av, port->database_name, port->user_name); +} + + +#ifdef EXEC_BACKEND + +/* + * postmaster_forkexec -- fork and exec a postmaster subprocess + * + * The caller must have set up the argv array already, except for argv[2] + * which will be filled with the name of the temp variable file. + * + * Returns the child process PID, or -1 on fork failure (a suitable error + * message has been logged on failure). + * + * All uses of this routine will dispatch to SubPostmasterMain in the + * child process. + */ +pid_t +postmaster_forkexec(int argc, char *argv[]) +{ + Port port; + + /* This entry point passes dummy values for the Port variables */ + memset(&port, 0, sizeof(port)); + return internal_forkexec(argc, argv, &port); +} + +/* + * backend_forkexec -- fork/exec off a backend process + * + * Some operating systems (WIN32) don't have fork() so we have to simulate + * it by storing parameters that need to be passed to the child and + * then create a new child process. + * + * returns the pid of the fork/exec'd process, or -1 on failure + */ +static pid_t +backend_forkexec(Port *port) +{ + char *av[4]; + int ac = 0; + + av[ac++] = "postgres"; + av[ac++] = "--forkbackend"; + av[ac++] = NULL; /* filled in by internal_forkexec */ + + av[ac] = NULL; + Assert(ac < lengthof(av)); + + return internal_forkexec(ac, av, port); +} + +#ifndef WIN32 + +/* + * internal_forkexec non-win32 implementation + * + * - writes out backend variables to the parameter file + * - fork():s, and then exec():s the child process + */ +static pid_t +internal_forkexec(int argc, char *argv[], Port *port) +{ + static unsigned long tmpBackendFileNum = 0; + pid_t pid; + char tmpfilename[MAXPGPATH]; + BackendParameters param; + FILE *fp; + + if (!save_backend_variables(¶m, port)) + return -1; /* log made by save_backend_variables */ + + /* Calculate name for temp file */ + snprintf(tmpfilename, MAXPGPATH, "%s/%s.backend_var.%d.%lu", + PG_TEMP_FILES_DIR, PG_TEMP_FILE_PREFIX, + MyProcPid, ++tmpBackendFileNum); + + /* Open file */ + fp = AllocateFile(tmpfilename, PG_BINARY_W); + if (!fp) + { + /* + * As in OpenTemporaryFileInTablespace, try to make the temp-file + * directory, ignoring errors. 
+ */ + (void) MakePGDirectory(PG_TEMP_FILES_DIR); + + fp = AllocateFile(tmpfilename, PG_BINARY_W); + if (!fp) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", + tmpfilename))); + return -1; + } + } + + if (fwrite(¶m, sizeof(param), 1, fp) != 1) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmpfilename))); + FreeFile(fp); + return -1; + } + + /* Release file */ + if (FreeFile(fp)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmpfilename))); + return -1; + } + + /* Make sure caller set up argv properly */ + Assert(argc >= 3); + Assert(argv[argc] == NULL); + Assert(strncmp(argv[1], "--fork", 6) == 0); + Assert(argv[2] == NULL); + + /* Insert temp file name after --fork argument */ + argv[2] = tmpfilename; + + /* Fire off execv in child */ + if ((pid = fork_process()) == 0) + { + if (execv(postgres_exec_path, argv) < 0) + { + ereport(LOG, + (errmsg("could not execute server process \"%s\": %m", + postgres_exec_path))); + /* We're already in the child process here, can't return */ + exit(1); + } + } + + return pid; /* Parent returns pid, or -1 on fork failure */ +} +#else /* WIN32 */ + +/* + * internal_forkexec win32 implementation + * + * - starts backend using CreateProcess(), in suspended state + * - writes out backend variables to the parameter file + * - during this, duplicates handles and sockets required for + * inheritance into the new process + * - resumes execution of the new process once the backend parameter + * file is complete. + */ +static pid_t +internal_forkexec(int argc, char *argv[], Port *port) +{ + int retry_count = 0; + STARTUPINFO si; + PROCESS_INFORMATION pi; + int i; + int j; + char cmdLine[MAXPGPATH * 2]; + HANDLE paramHandle; + BackendParameters *param; + SECURITY_ATTRIBUTES sa; + char paramHandleStr[32]; + win32_deadchild_waitinfo *childinfo; + + /* Make sure caller set up argv properly */ + Assert(argc >= 3); + Assert(argv[argc] == NULL); + Assert(strncmp(argv[1], "--fork", 6) == 0); + Assert(argv[2] == NULL); + + /* Resume here if we need to retry */ +retry: + + /* Set up shared memory for parameter passing */ + ZeroMemory(&sa, sizeof(sa)); + sa.nLength = sizeof(sa); + sa.bInheritHandle = TRUE; + paramHandle = CreateFileMapping(INVALID_HANDLE_VALUE, + &sa, + PAGE_READWRITE, + 0, + sizeof(BackendParameters), + NULL); + if (paramHandle == INVALID_HANDLE_VALUE) + { + ereport(LOG, + (errmsg("could not create backend parameter file mapping: error code %lu", + GetLastError()))); + return -1; + } + + param = MapViewOfFile(paramHandle, FILE_MAP_WRITE, 0, 0, sizeof(BackendParameters)); + if (!param) + { + ereport(LOG, + (errmsg("could not map backend parameter memory: error code %lu", + GetLastError()))); + CloseHandle(paramHandle); + return -1; + } + + /* Insert temp file name after --fork argument */ +#ifdef _WIN64 + sprintf(paramHandleStr, "%llu", (LONG_PTR) paramHandle); +#else + sprintf(paramHandleStr, "%lu", (DWORD) paramHandle); +#endif + argv[2] = paramHandleStr; + + /* Format the cmd line */ + cmdLine[sizeof(cmdLine) - 1] = '\0'; + cmdLine[sizeof(cmdLine) - 2] = '\0'; + snprintf(cmdLine, sizeof(cmdLine) - 1, "\"%s\"", postgres_exec_path); + i = 0; + while (argv[++i] != NULL) + { + j = strlen(cmdLine); + snprintf(cmdLine + j, sizeof(cmdLine) - 1 - j, " \"%s\"", argv[i]); + } + if (cmdLine[sizeof(cmdLine) - 2] != '\0') + { + ereport(LOG, + (errmsg("subprocess command line too long"))); + UnmapViewOfFile(param); + 
CloseHandle(paramHandle); + return -1; + } + + memset(&pi, 0, sizeof(pi)); + memset(&si, 0, sizeof(si)); + si.cb = sizeof(si); + + /* + * Create the subprocess in a suspended state. This will be resumed later, + * once we have written out the parameter file. + */ + if (!CreateProcess(NULL, cmdLine, NULL, NULL, TRUE, CREATE_SUSPENDED, + NULL, NULL, &si, &pi)) + { + ereport(LOG, + (errmsg("CreateProcess() call failed: %m (error code %lu)", + GetLastError()))); + UnmapViewOfFile(param); + CloseHandle(paramHandle); + return -1; + } + + if (!save_backend_variables(param, port, pi.hProcess, pi.dwProcessId)) + { + /* + * log made by save_backend_variables, but we have to clean up the + * mess with the half-started process + */ + if (!TerminateProcess(pi.hProcess, 255)) + ereport(LOG, + (errmsg_internal("could not terminate unstarted process: error code %lu", + GetLastError()))); + CloseHandle(pi.hProcess); + CloseHandle(pi.hThread); + UnmapViewOfFile(param); + CloseHandle(paramHandle); + return -1; /* log made by save_backend_variables */ + } + + /* Drop the parameter shared memory that is now inherited to the backend */ + if (!UnmapViewOfFile(param)) + ereport(LOG, + (errmsg("could not unmap view of backend parameter file: error code %lu", + GetLastError()))); + if (!CloseHandle(paramHandle)) + ereport(LOG, + (errmsg("could not close handle to backend parameter file: error code %lu", + GetLastError()))); + + /* + * Reserve the memory region used by our main shared memory segment before + * we resume the child process. Normally this should succeed, but if ASLR + * is active then it might sometimes fail due to the stack or heap having + * gotten mapped into that range. In that case, just terminate the + * process and retry. + */ + if (!pgwin32_ReserveSharedMemoryRegion(pi.hProcess)) + { + /* pgwin32_ReserveSharedMemoryRegion already made a log entry */ + if (!TerminateProcess(pi.hProcess, 255)) + ereport(LOG, + (errmsg_internal("could not terminate process that failed to reserve memory: error code %lu", + GetLastError()))); + CloseHandle(pi.hProcess); + CloseHandle(pi.hThread); + if (++retry_count < 100) + goto retry; + ereport(LOG, + (errmsg("giving up after too many tries to reserve shared memory"), + errhint("This might be caused by ASLR or antivirus software."))); + return -1; + } + + /* + * Now that the backend variables are written out, we start the child + * thread so it can start initializing while we set up the rest of the + * parent state. + */ + if (ResumeThread(pi.hThread) == -1) + { + if (!TerminateProcess(pi.hProcess, 255)) + { + ereport(LOG, + (errmsg_internal("could not terminate unstartable process: error code %lu", + GetLastError()))); + CloseHandle(pi.hProcess); + CloseHandle(pi.hThread); + return -1; + } + CloseHandle(pi.hProcess); + CloseHandle(pi.hThread); + ereport(LOG, + (errmsg_internal("could not resume thread of unstarted process: error code %lu", + GetLastError()))); + return -1; + } + + /* + * Queue a waiter to signal when this child dies. The wait will be handled + * automatically by an operating system thread pool. + * + * Note: use malloc instead of palloc, since it needs to be thread-safe. + * Struct will be free():d from the callback function that runs on a + * different thread. 
+ */ + childinfo = malloc(sizeof(win32_deadchild_waitinfo)); + if (!childinfo) + ereport(FATAL, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + childinfo->procHandle = pi.hProcess; + childinfo->procId = pi.dwProcessId; + + if (!RegisterWaitForSingleObject(&childinfo->waitHandle, + pi.hProcess, + pgwin32_deadchild_callback, + childinfo, + INFINITE, + WT_EXECUTEONLYONCE | WT_EXECUTEINWAITTHREAD)) + ereport(FATAL, + (errmsg_internal("could not register process for wait: error code %lu", + GetLastError()))); + + /* Don't close pi.hProcess here - the wait thread needs access to it */ + + CloseHandle(pi.hThread); + + return pi.dwProcessId; +} +#endif /* WIN32 */ + + +/* + * SubPostmasterMain -- Get the fork/exec'd process into a state equivalent + * to what it would be if we'd simply forked on Unix, and then + * dispatch to the appropriate place. + * + * The first two command line arguments are expected to be "--forkFOO" + * (where FOO indicates which postmaster child we are to become), and + * the name of a variables file that we can read to load data that would + * have been inherited by fork() on Unix. Remaining arguments go to the + * subprocess FooMain() routine. + */ +void +SubPostmasterMain(int argc, char *argv[]) +{ + Port port; + + /* In EXEC_BACKEND case we will not have inherited these settings */ + IsPostmasterEnvironment = true; + whereToSendOutput = DestNone; + + /* Setup essential subsystems (to ensure elog() behaves sanely) */ + InitializeGUCOptions(); + + /* Check we got appropriate args */ + if (argc < 3) + elog(FATAL, "invalid subpostmaster invocation"); + + /* Read in the variables file */ + memset(&port, 0, sizeof(Port)); + read_backend_variables(argv[2], &port); + + /* Close the postmaster's sockets (as soon as we know them) */ + ClosePostmasterPorts(strcmp(argv[1], "--forklog") == 0); + + /* + * Start our win32 signal implementation. This has to be done after we + * read the backend variables, because we need to pick up the signal pipe + * from the parent process. + */ +#ifdef WIN32 + pgwin32_signal_initialize(); +#endif + + /* Setup as postmaster child */ + InitPostmasterChild(); + + /* + * If appropriate, physically re-attach to shared memory segment. We want + * to do this before going any further to ensure that we can attach at the + * same address the postmaster used. On the other hand, if we choose not + * to re-attach, we may have other cleanup to do. + * + * If testing EXEC_BACKEND on Linux, you should run this as root before + * starting the postmaster: + * + * echo 0 >/proc/sys/kernel/randomize_va_space + * + * This prevents using randomized stack and code addresses that cause the + * child process's memory map to be different from the parent's, making it + * sometimes impossible to attach to shared memory at the desired address. + * Return the setting to its old value (usually '1' or '2') when finished. 
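To make the EXEC_BACKEND handshake concrete, here is a minimal stand-alone sketch of the same round trip: the parent serializes a parameter struct to a temp file, fork()s and exec()s its own binary with a "--forkfoo" switch plus the file name, and the re-exec'd child reads the struct back before dispatching. The names ChildParams, spawn_child() and child_main() are invented for illustration; the real code uses BackendParameters, internal_forkexec() and SubPostmasterMain(), and carries far more state.

/* Illustrative sketch only: serialize parameters, exec ourselves, read them back. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

typedef struct ChildParams
{
	int		listen_fd;			/* stand-in for an inherited socket */
	char	data_dir[1024];		/* stand-in for DataDir */
} ChildParams;

/* Parent side: write the struct to a temp file, then fork and exec ourselves. */
static pid_t
spawn_child(const char *self, const ChildParams *cp)
{
	char	fname[64];
	FILE   *fp;
	pid_t	pid;

	snprintf(fname, sizeof(fname), "child_params.%d", (int) getpid());
	fp = fopen(fname, "wb");
	if (fp == NULL)
		return -1;
	if (fwrite(cp, sizeof(*cp), 1, fp) != 1)
	{
		fclose(fp);
		return -1;
	}
	if (fclose(fp) != 0)
		return -1;

	pid = fork();
	if (pid == 0)
	{
		execl(self, self, "--forkfoo", fname, (char *) NULL);
		_exit(1);				/* exec failed */
	}
	return pid;					/* child pid, or -1 if fork failed */
}

/* Child side: read the struct back, remove the file, then do the real work. */
static int
child_main(const char *fname)
{
	ChildParams cp;
	FILE   *fp = fopen(fname, "rb");

	if (fp == NULL)
		return 1;
	if (fread(&cp, sizeof(cp), 1, fp) != 1)
	{
		fclose(fp);
		return 1;
	}
	fclose(fp);
	unlink(fname);
	printf("child got data_dir=%s fd=%d\n", cp.data_dir, cp.listen_fd);
	return 0;
}

int
main(int argc, char *argv[])
{
	ChildParams cp = {.listen_fd = 5};

	if (argc >= 3 && strcmp(argv[1], "--forkfoo") == 0)
		return child_main(argv[2]);		/* we are the re-exec'd child */

	strcpy(cp.data_dir, "/tmp/pgdata");
	if (spawn_child(argv[0], &cp) > 0)
		wait(NULL);
	return 0;
}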
+ */ + if (strcmp(argv[1], "--forkbackend") == 0 || + strcmp(argv[1], "--forkavlauncher") == 0 || + strcmp(argv[1], "--forkavworker") == 0 || + strcmp(argv[1], "--forkboot") == 0 || + strncmp(argv[1], "--forkbgworker=", 15) == 0) + PGSharedMemoryReAttach(); + else + PGSharedMemoryNoReAttach(); + + /* autovacuum needs this set before calling InitProcess */ + if (strcmp(argv[1], "--forkavlauncher") == 0) + AutovacuumLauncherIAm(); + if (strcmp(argv[1], "--forkavworker") == 0) + AutovacuumWorkerIAm(); + + /* Read in remaining GUC variables */ + read_nondefault_variables(); + + /* + * Check that the data directory looks valid, which will also check the + * privileges on the data directory and update our umask and file/group + * variables for creating files later. Note: this should really be done + * before we create any files or directories. + */ + checkDataDir(); + + /* + * (re-)read control file, as it contains config. The postmaster will + * already have read this, but this process doesn't know about that. + */ + LocalProcessControlFile(false); + + /* + * Reload any libraries that were preloaded by the postmaster. Since we + * exec'd this process, those libraries didn't come along with us; but we + * should load them into all child processes to be consistent with the + * non-EXEC_BACKEND behavior. + */ + process_shared_preload_libraries(); + + /* Run backend or appropriate child */ + if (strcmp(argv[1], "--forkbackend") == 0) + { + Assert(argc == 3); /* shouldn't be any more args */ + + /* + * Need to reinitialize the SSL library in the backend, since the + * context structures contain function pointers and cannot be passed + * through the parameter file. + * + * If for some reason reload fails (maybe the user installed broken + * key files), soldier on without SSL; that's better than all + * connections becoming impossible. + * + * XXX should we do this in all child processes? For the moment it's + * enough to do it in backend children. + */ +#ifdef USE_SSL + if (EnableSSL) + { + if (secure_initialize(false) == 0) + LoadedSSL = true; + else + ereport(LOG, + (errmsg("SSL configuration could not be loaded in child process"))); + } +#endif + + /* + * Perform additional initialization and collect startup packet. + * + * We want to do this before InitProcess() for a couple of reasons: 1. + * so that we aren't eating up a PGPROC slot while waiting on the + * client. 2. so that if InitProcess() fails due to being out of + * PGPROC slots, we have already initialized libpq and are able to + * report the error to the client. 
+ */ + BackendInitialize(&port); + + /* Restore basic shared memory pointers */ + InitShmemAccess(UsedShmemSegAddr); + + /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */ + InitProcess(); + + /* Attach process to shared data structures */ + CreateSharedMemoryAndSemaphores(); + + /* And run the backend */ + BackendRun(&port); /* does not return */ + } + if (strcmp(argv[1], "--forkboot") == 0) + { + /* Restore basic shared memory pointers */ + InitShmemAccess(UsedShmemSegAddr); + + /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */ + InitAuxiliaryProcess(); + + /* Attach process to shared data structures */ + CreateSharedMemoryAndSemaphores(); + + AuxiliaryProcessMain(argc - 2, argv + 2); /* does not return */ + } + if (strcmp(argv[1], "--forkavlauncher") == 0) + { + /* Restore basic shared memory pointers */ + InitShmemAccess(UsedShmemSegAddr); + + /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */ + InitProcess(); + + /* Attach process to shared data structures */ + CreateSharedMemoryAndSemaphores(); + + AutoVacLauncherMain(argc - 2, argv + 2); /* does not return */ + } + if (strcmp(argv[1], "--forkavworker") == 0) + { + /* Restore basic shared memory pointers */ + InitShmemAccess(UsedShmemSegAddr); + + /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */ + InitProcess(); + + /* Attach process to shared data structures */ + CreateSharedMemoryAndSemaphores(); + + AutoVacWorkerMain(argc - 2, argv + 2); /* does not return */ + } + if (strncmp(argv[1], "--forkbgworker=", 15) == 0) + { + int shmem_slot; + + /* do this as early as possible; in particular, before InitProcess() */ + IsBackgroundWorker = true; + + /* Restore basic shared memory pointers */ + InitShmemAccess(UsedShmemSegAddr); + + /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */ + InitProcess(); + + /* Attach process to shared data structures */ + CreateSharedMemoryAndSemaphores(); + + /* Fetch MyBgworkerEntry from shared memory */ + shmem_slot = atoi(argv[1] + 15); + MyBgworkerEntry = BackgroundWorkerEntry(shmem_slot); + + StartBackgroundWorker(); + } + if (strcmp(argv[1], "--forkcol") == 0) + { + /* Do not want to attach to shared memory */ + + PgstatCollectorMain(argc, argv); /* does not return */ + } + if (strcmp(argv[1], "--forklog") == 0) + { + /* Do not want to attach to shared memory */ + + SysLoggerMain(argc, argv); /* does not return */ + } + + abort(); /* shouldn't get here */ +} +#endif /* EXEC_BACKEND */ + + +/* + * ExitPostmaster -- cleanup + * + * Do NOT call exit() directly --- always go through here! + */ +static void +ExitPostmaster(int status) +{ +#ifdef HAVE_PTHREAD_IS_THREADED_NP + + /* + * There is no known cause for a postmaster to become multithreaded after + * startup. Recheck to account for the possibility of unknown causes. + * This message uses LOG level, because an unclean shutdown at this point + * would usually not look much different from a clean shutdown. + */ + if (pthread_is_threaded_np() != 0) + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg_internal("postmaster became multithreaded"), + errdetail("Please report this to <%s>.", PACKAGE_BUGREPORT))); +#endif + + /* should cleanup shared memory and kill all backends */ + + /* + * Not sure of the semantics here. When the Postmaster dies, should the + * backends all be killed? probably not. 
+ * + * MUST -- vadim 05-10-1999 + */ + + proc_exit(status); +} + +/* + * sigusr1_handler - handle signal conditions from child processes + */ +static void +sigusr1_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + /* + * We rely on the signal mechanism to have blocked all signals ... except + * on Windows, which lacks sigaction(), so we have to do it manually. + */ +#ifdef WIN32 + PG_SETMASK(&BlockSig); +#endif + + /* + * RECOVERY_STARTED and BEGIN_HOT_STANDBY signals are ignored in + * unexpected states. If the startup process quickly starts up, completes + * recovery, exits, we might process the death of the startup process + * first. We don't want to go back to recovery in that case. + */ + if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_STARTED) && + pmState == PM_STARTUP && Shutdown == NoShutdown) + { + /* WAL redo has started. We're out of reinitialization. */ + FatalError = false; + AbortStartTime = 0; + + /* + * Crank up the background tasks. It doesn't matter if this fails, + * we'll just try again later. + */ + Assert(CheckpointerPID == 0); + CheckpointerPID = StartCheckpointer(); + Assert(BgWriterPID == 0); + BgWriterPID = StartBackgroundWriter(); + + /* + * Start the archiver if we're responsible for (re-)archiving received + * files. + */ + Assert(PgArchPID == 0); + if (XLogArchivingAlways()) + PgArchPID = StartArchiver(); + + /* + * If we aren't planning to enter hot standby mode later, treat + * RECOVERY_STARTED as meaning we're out of startup, and report status + * accordingly. + */ + if (!EnableHotStandby) + { + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STANDBY); +#ifdef USE_SYSTEMD + sd_notify(0, "READY=1"); +#endif + } + + pmState = PM_RECOVERY; + } + + if (CheckPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY) && + pmState == PM_RECOVERY && Shutdown == NoShutdown) + { + /* + * Likewise, start other special children as needed. + */ + Assert(PgStatPID == 0); + PgStatPID = pgstat_start(); + + ereport(LOG, + (errmsg("database system is ready to accept read-only connections"))); + + /* Report status */ + AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_READY); +#ifdef USE_SYSTEMD + sd_notify(0, "READY=1"); +#endif + + pmState = PM_HOT_STANDBY; + connsAllowed = ALLOW_ALL_CONNS; + + /* Some workers may be scheduled to start now */ + StartWorkerNeeded = true; + } + + /* Process background worker state changes. */ + if (CheckPostmasterSignal(PMSIGNAL_BACKGROUND_WORKER_CHANGE)) + { + /* Accept new worker requests only if not stopping. */ + BackgroundWorkerStateChange(pmState < PM_STOP_BACKENDS); + StartWorkerNeeded = true; + } + + if (StartWorkerNeeded || HaveCrashedWorker) + maybe_start_bgworkers(); + + /* Tell syslogger to rotate logfile if requested */ + if (SysLoggerPID != 0) + { + if (CheckLogrotateSignal()) + { + signal_child(SysLoggerPID, SIGUSR1); + RemoveLogrotateSignalFiles(); + } + else if (CheckPostmasterSignal(PMSIGNAL_ROTATE_LOGFILE)) + { + signal_child(SysLoggerPID, SIGUSR1); + } + } + + if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER) && + Shutdown <= SmartShutdown && pmState < PM_STOP_BACKENDS) + { + /* + * Start one iteration of the autovacuum daemon, even if autovacuuming + * is nominally not enabled. This is so we can have an active defense + * against transaction ID wraparound. We set a flag for the main loop + * to do it rather than trying to do it here --- this is because the + * autovac process itself may send the signal, and we want to handle + * that by launching another iteration as soon as the current one + * completes. 
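Every CheckPostmasterSignal() test in this handler follows the same multiplexed-signal pattern: the sender sets a per-reason flag in shared memory and delivers SIGUSR1, and the handler tests and clears whichever flags it is prepared to act on. A minimal sketch of that protocol follows; the demo_* names are invented, and unlike the real PMSignalState machinery (storage/ipc/pmsignal.c) the flag array here is not actually in shared memory.

/* Illustrative sketch of a multiplexed "postmaster signal": one SIGUSR1,
 * many reasons, each reason a flag that the sender sets and the receiving
 * handler tests and clears. */
#include <signal.h>
#include <stdbool.h>
#include <sys/types.h>

typedef enum
{
	DEMO_PMSIGNAL_START_WORKER,		/* invented reasons */
	DEMO_PMSIGNAL_ROTATE_LOGFILE,
	DEMO_NUM_PMSIGNALS
} DemoPMSignalReason;

/* In PostgreSQL these flags live in shared memory; a static array is
 * enough to show the protocol. */
static volatile sig_atomic_t demo_pmsignal_flags[DEMO_NUM_PMSIGNALS];

/* Sender side: set the flag first, then signal the postmaster. */
static void
demo_send_pmsignal(pid_t postmaster_pid, DemoPMSignalReason reason)
{
	demo_pmsignal_flags[reason] = 1;
	kill(postmaster_pid, SIGUSR1);
}

/* Receiver side, called from the SIGUSR1 handler: test-and-clear. */
static bool
demo_check_pmsignal(DemoPMSignalReason reason)
{
	if (demo_pmsignal_flags[reason])
	{
		demo_pmsignal_flags[reason] = 0;
		return true;
	}
	return false;
}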
+ */ + start_autovac_launcher = true; + } + + if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER) && + Shutdown <= SmartShutdown && pmState < PM_STOP_BACKENDS) + { + /* The autovacuum launcher wants us to start a worker process. */ + StartAutovacuumWorker(); + } + + if (CheckPostmasterSignal(PMSIGNAL_START_WALRECEIVER)) + { + /* Startup Process wants us to start the walreceiver process. */ + /* Start immediately if possible, else remember request for later. */ + WalReceiverRequested = true; + MaybeStartWalReceiver(); + } + + /* + * Try to advance postmaster's state machine, if a child requests it. + * + * Be careful about the order of this action relative to sigusr1_handler's + * other actions. Generally, this should be after other actions, in case + * they have effects PostmasterStateMachine would need to know about. + * However, we should do it before the CheckPromoteSignal step, which + * cannot have any (immediate) effect on the state machine, but does + * depend on what state we're in now. + */ + if (CheckPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE)) + { + PostmasterStateMachine(); + } + + if (StartupPID != 0 && + (pmState == PM_STARTUP || pmState == PM_RECOVERY || + pmState == PM_HOT_STANDBY) && + CheckPromoteSignal()) + { + /* + * Tell startup process to finish recovery. + * + * Leave the promote signal file in place and let the Startup process + * do the unlink. + */ + signal_child(StartupPID, SIGUSR2); + } + +#ifdef WIN32 + PG_SETMASK(&UnBlockSig); +#endif + + errno = save_errno; +} + +/* + * SIGTERM while processing startup packet. + * + * Running proc_exit() from a signal handler would be quite unsafe. + * However, since we have not yet touched shared memory, we can just + * pull the plug and exit without running any atexit handlers. + * + * One might be tempted to try to send a message, or log one, indicating + * why we are disconnecting. However, that would be quite unsafe in itself. + * Also, it seems undesirable to provide clues about the database's state + * to a client that has not yet completed authentication, or even sent us + * a startup packet. + */ +static void +process_startup_packet_die(SIGNAL_ARGS) +{ + _exit(1); +} + +/* + * Dummy signal handler + * + * We use this for signals that we don't actually use in the postmaster, + * but we do use in backends. If we were to SIG_IGN such signals in the + * postmaster, then a newly started backend might drop a signal that arrives + * before it's able to reconfigure its signal processing. (See notes in + * tcop/postgres.c.) + */ +static void +dummy_handler(SIGNAL_ARGS) +{ +} + +/* + * Timeout while processing startup packet. + * As for process_startup_packet_die(), we exit via _exit(1). + */ +static void +StartupPacketTimeoutHandler(void) +{ + _exit(1); +} + + +/* + * Generate a random cancel key. + */ +static bool +RandomCancelKey(int32 *cancel_key) +{ + return pg_strong_random(cancel_key, sizeof(int32)); +} + +/* + * Count up number of child processes of specified types (dead_end children + * are always excluded). + */ +static int +CountChildren(int target) +{ + dlist_iter iter; + int cnt = 0; + + dlist_foreach(iter, &BackendList) + { + Backend *bp = dlist_container(Backend, elem, iter.cur); + + if (bp->dead_end) + continue; + + /* + * Since target == BACKEND_TYPE_ALL is the most common case, we test + * it first and avoid touching shared memory for every child. + */ + if (target != BACKEND_TYPE_ALL) + { + /* + * Assign bkend_type for any recently announced WAL Sender + * processes. 
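The (target & bp->bkend_type) test just below works because the backend types are single-bit flags that can be OR'ed into a mask such as BACKEND_TYPE_ALL. The DEMO_* values here are illustrative only (the real definitions live in the postmaster headers), but they show why one caller can count "normal backends plus walsenders" while another counts everything:

/* Illustrative single-bit backend-type flags; a mask selects which kinds of
 * children a caller wants counted or signalled. */
#define DEMO_BACKEND_TYPE_NORMAL	0x0001
#define DEMO_BACKEND_TYPE_AUTOVAC	0x0002
#define DEMO_BACKEND_TYPE_WALSND	0x0004
#define DEMO_BACKEND_TYPE_BGWORKER	0x0008
#define DEMO_BACKEND_TYPE_ALL		0x000F

/* A child with bkend_type = DEMO_BACKEND_TYPE_WALSND matches the masks
 * DEMO_BACKEND_TYPE_ALL and (DEMO_BACKEND_TYPE_NORMAL | DEMO_BACKEND_TYPE_WALSND),
 * but fails the (target & bkend_type) test when the caller asks only for
 * DEMO_BACKEND_TYPE_NORMAL. */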
+ */ + if (bp->bkend_type == BACKEND_TYPE_NORMAL && + IsPostmasterChildWalSender(bp->child_slot)) + bp->bkend_type = BACKEND_TYPE_WALSND; + + if (!(target & bp->bkend_type)) + continue; + } + + cnt++; + } + return cnt; +} + + +/* + * StartChildProcess -- start an auxiliary process for the postmaster + * + * "type" determines what kind of child will be started. All child types + * initially go to AuxiliaryProcessMain, which will handle common setup. + * + * Return value of StartChildProcess is subprocess' PID, or 0 if failed + * to start subprocess. + */ +static pid_t +StartChildProcess(AuxProcType type) +{ + pid_t pid; + char *av[10]; + int ac = 0; + char typebuf[32]; + + /* + * Set up command-line arguments for subprocess + */ + av[ac++] = "postgres"; + +#ifdef EXEC_BACKEND + av[ac++] = "--forkboot"; + av[ac++] = NULL; /* filled in by postmaster_forkexec */ +#endif + + snprintf(typebuf, sizeof(typebuf), "-x%d", type); + av[ac++] = typebuf; + + av[ac] = NULL; + Assert(ac < lengthof(av)); + +#ifdef EXEC_BACKEND + pid = postmaster_forkexec(ac, av); +#else /* !EXEC_BACKEND */ + pid = fork_process(); + + if (pid == 0) /* child */ + { + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(false); + + /* Release postmaster's working memory context */ + MemoryContextSwitchTo(TopMemoryContext); + MemoryContextDelete(PostmasterContext); + PostmasterContext = NULL; + + AuxiliaryProcessMain(ac, av); /* does not return */ + } +#endif /* EXEC_BACKEND */ + + if (pid < 0) + { + /* in parent, fork failed */ + int save_errno = errno; + + errno = save_errno; + switch (type) + { + case StartupProcess: + ereport(LOG, + (errmsg("could not fork startup process: %m"))); + break; + case ArchiverProcess: + ereport(LOG, + (errmsg("could not fork archiver process: %m"))); + break; + case BgWriterProcess: + ereport(LOG, + (errmsg("could not fork background writer process: %m"))); + break; + case CheckpointerProcess: + ereport(LOG, + (errmsg("could not fork checkpointer process: %m"))); + break; + case WalWriterProcess: + ereport(LOG, + (errmsg("could not fork WAL writer process: %m"))); + break; + case WalReceiverProcess: + ereport(LOG, + (errmsg("could not fork WAL receiver process: %m"))); + break; + default: + ereport(LOG, + (errmsg("could not fork process: %m"))); + break; + } + + /* + * fork failure is fatal during startup, but there's no need to choke + * immediately if starting other child types fails. + */ + if (type == StartupProcess) + ExitPostmaster(1); + return 0; + } + + /* + * in parent, successful fork + */ + return pid; +} + +/* + * StartAutovacuumWorker + * Start an autovac worker process. + * + * This function is here because it enters the resulting PID into the + * postmaster's private backends list. + * + * NB -- this code very roughly matches BackendStartup. + */ +static void +StartAutovacuumWorker(void) +{ + Backend *bn; + + /* + * If not in condition to run a process, don't try, but handle it like a + * fork failure. This does not normally happen, since the signal is only + * supposed to be sent by autovacuum launcher when it's OK to do it, but + * we have to check to avoid race-condition problems during DB state + * changes. + */ + if (canAcceptConnections(BACKEND_TYPE_AUTOVAC) == CAC_OK) + { + /* + * Compute the cancel key that will be assigned to this session. We + * probably don't need cancel keys for autovac workers, but we'd + * better have something random in the field to prevent unfriendly + * people from sending cancels to them. 
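The key generated here is what a client must echo back in a protocol CancelRequest message, which is why even an autovacuum worker gets a random one. As a reference sketch (field layout per the v3 frontend/backend protocol, not code taken from this file), the cancel packet carries only the target PID and that key:

/* Sketch of the frontend/backend CancelRequest packet that the cancel key
 * protects: 16 bytes on the wire, all fields sent in network byte order. */
#include <stdint.h>

typedef struct DemoCancelRequestPacket
{
	uint32_t	len;			/* always 16 */
	uint32_t	request_code;	/* 80877102 = (1234 << 16) | 5678 */
	uint32_t	backend_pid;	/* PID announced in BackendKeyData at startup */
	uint32_t	cancel_key;		/* the secret from RandomCancelKey() */
} DemoCancelRequestPacket;

/* The server cancels a query only if both backend_pid and cancel_key match a
 * live session, so guessing the 32-bit key is required to cancel someone
 * else's query. */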
+ */ + if (!RandomCancelKey(&MyCancelKey)) + { + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not generate random cancel key"))); + return; + } + + bn = (Backend *) malloc(sizeof(Backend)); + if (bn) + { + bn->cancel_key = MyCancelKey; + + /* Autovac workers are not dead_end and need a child slot */ + bn->dead_end = false; + bn->child_slot = MyPMChildSlot = AssignPostmasterChildSlot(); + bn->bgworker_notify = false; + + bn->pid = StartAutoVacWorker(); + if (bn->pid > 0) + { + bn->bkend_type = BACKEND_TYPE_AUTOVAC; + dlist_push_head(&BackendList, &bn->elem); +#ifdef EXEC_BACKEND + ShmemBackendArrayAdd(bn); +#endif + /* all OK */ + return; + } + + /* + * fork failed, fall through to report -- actual error message was + * logged by StartAutoVacWorker + */ + (void) ReleasePostmasterChildSlot(bn->child_slot); + free(bn); + } + else + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + /* + * Report the failure to the launcher, if it's running. (If it's not, we + * might not even be connected to shared memory, so don't try to call + * AutoVacWorkerFailed.) Note that we also need to signal it so that it + * responds to the condition, but we don't do that here, instead waiting + * for ServerLoop to do it. This way we avoid a ping-pong signaling in + * quick succession between the autovac launcher and postmaster in case + * things get ugly. + */ + if (AutoVacPID != 0) + { + AutoVacWorkerFailed(); + avlauncher_needs_signal = true; + } +} + +/* + * MaybeStartWalReceiver + * Start the WAL receiver process, if not running and our state allows. + * + * Note: if WalReceiverPID is already nonzero, it might seem that we should + * clear WalReceiverRequested. However, there's a race condition if the + * walreceiver terminates and the startup process immediately requests a new + * one: it's quite possible to get the signal for the request before reaping + * the dead walreceiver process. Better to risk launching an extra + * walreceiver than to miss launching one we need. (The walreceiver code + * has logic to recognize that it should go away if not needed.) + */ +static void +MaybeStartWalReceiver(void) +{ + if (WalReceiverPID == 0 && + (pmState == PM_STARTUP || pmState == PM_RECOVERY || + pmState == PM_HOT_STANDBY) && + Shutdown <= SmartShutdown) + { + WalReceiverPID = StartWalReceiver(); + if (WalReceiverPID != 0) + WalReceiverRequested = false; + /* else leave the flag set, so we'll try again later */ + } +} + + +/* + * Create the opts file + */ +static bool +CreateOptsFile(int argc, char *argv[], char *fullprogname) +{ + FILE *fp; + int i; + +#define OPTS_FILE "postmaster.opts" + + if ((fp = fopen(OPTS_FILE, "w")) == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", OPTS_FILE))); + return false; + } + + fprintf(fp, "%s", fullprogname); + for (i = 1; i < argc; i++) + fprintf(fp, " \"%s\"", argv[i]); + fputs("\n", fp); + + if (fclose(fp)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", OPTS_FILE))); + return false; + } + + return true; +} + + +/* + * MaxLivePostmasterChildren + * + * This reports the number of entries needed in per-child-process arrays + * (the PMChildFlags array, and if EXEC_BACKEND the ShmemBackendArray). + * These arrays include regular backends, autovac workers, walsenders + * and background workers, but not special children nor dead_end children. 
+ * This allows the arrays to have a fixed maximum size, to wit the same + * too-many-children limit enforced by canAcceptConnections(). The exact value + * isn't too critical as long as it's more than MaxBackends. + */ +int +MaxLivePostmasterChildren(void) +{ + return 2 * (MaxConnections + autovacuum_max_workers + 1 + + max_wal_senders + max_worker_processes); +} + +/* + * Connect background worker to a database. + */ +void +BackgroundWorkerInitializeConnection(const char *dbname, const char *username, uint32 flags) +{ + BackgroundWorker *worker = MyBgworkerEntry; + + /* XXX is this the right errcode? */ + if (!(worker->bgw_flags & BGWORKER_BACKEND_DATABASE_CONNECTION)) + ereport(FATAL, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database connection requirement not indicated during registration"))); + + InitPostgres(dbname, InvalidOid, username, InvalidOid, NULL, (flags & BGWORKER_BYPASS_ALLOWCONN) != 0); + + /* it had better not gotten out of "init" mode yet */ + if (!IsInitProcessingMode()) + ereport(ERROR, + (errmsg("invalid processing mode in background worker"))); + SetProcessingMode(NormalProcessing); +} + +/* + * Connect background worker to a database using OIDs. + */ +void +BackgroundWorkerInitializeConnectionByOid(Oid dboid, Oid useroid, uint32 flags) +{ + BackgroundWorker *worker = MyBgworkerEntry; + + /* XXX is this the right errcode? */ + if (!(worker->bgw_flags & BGWORKER_BACKEND_DATABASE_CONNECTION)) + ereport(FATAL, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database connection requirement not indicated during registration"))); + + InitPostgres(NULL, dboid, NULL, useroid, NULL, (flags & BGWORKER_BYPASS_ALLOWCONN) != 0); + + /* it had better not gotten out of "init" mode yet */ + if (!IsInitProcessingMode()) + ereport(ERROR, + (errmsg("invalid processing mode in background worker"))); + SetProcessingMode(NormalProcessing); +} + +/* + * Block/unblock signals in a background worker + */ +void +BackgroundWorkerBlockSignals(void) +{ + PG_SETMASK(&BlockSig); +} + +void +BackgroundWorkerUnblockSignals(void) +{ + PG_SETMASK(&UnBlockSig); +} + +#ifdef EXEC_BACKEND +static pid_t +bgworker_forkexec(int shmem_slot) +{ + char *av[10]; + int ac = 0; + char forkav[MAXPGPATH]; + + snprintf(forkav, MAXPGPATH, "--forkbgworker=%d", shmem_slot); + + av[ac++] = "postgres"; + av[ac++] = forkav; + av[ac++] = NULL; /* filled in by postmaster_forkexec */ + av[ac] = NULL; + + Assert(ac < lengthof(av)); + + return postmaster_forkexec(ac, av); +} +#endif + +/* + * Start a new bgworker. + * Starting time conditions must have been checked already. + * + * Returns true on success, false on failure. + * In either case, update the RegisteredBgWorker's state appropriately. + * + * This code is heavily based on autovacuum.c, q.v. + */ +static bool +do_start_bgworker(RegisteredBgWorker *rw) +{ + pid_t worker_pid; + + Assert(rw->rw_pid == 0); + + /* + * Allocate and assign the Backend element. Note we must do this before + * forking, so that we can handle failures (out of memory or child-process + * slots) cleanly. + * + * Treat failure as though the worker had crashed. That way, the + * postmaster will wait a bit before attempting to start it again; if we + * tried again right away, most likely we'd find ourselves hitting the + * same resource-exhaustion condition. 
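assign_backendlist_entry() can fail exactly when the per-child arrays sized by MaxLivePostmasterChildren() are full, so a worked example of that formula may help. With stock defaults (max_connections = 100, autovacuum_max_workers = 3, max_wal_senders = 10, max_worker_processes = 8; assumed here rather than taken from this commit) the arrays get 2 * (100 + 3 + 1 + 10 + 8) = 244 slots, twice the resulting MaxBackends of 122 and therefore comfortably above the too-many-children limit.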
+ */ + if (!assign_backendlist_entry(rw)) + { + rw->rw_crashed_at = GetCurrentTimestamp(); + return false; + } + + ereport(DEBUG1, + (errmsg_internal("starting background worker process \"%s\"", + rw->rw_worker.bgw_name))); + +#ifdef EXEC_BACKEND + switch ((worker_pid = bgworker_forkexec(rw->rw_shmem_slot))) +#else + switch ((worker_pid = fork_process())) +#endif + { + case -1: + /* in postmaster, fork failed ... */ + ereport(LOG, + (errmsg("could not fork worker process: %m"))); + /* undo what assign_backendlist_entry did */ + ReleasePostmasterChildSlot(rw->rw_child_slot); + rw->rw_child_slot = 0; + free(rw->rw_backend); + rw->rw_backend = NULL; + /* mark entry as crashed, so we'll try again later */ + rw->rw_crashed_at = GetCurrentTimestamp(); + break; + +#ifndef EXEC_BACKEND + case 0: + /* in postmaster child ... */ + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(false); + + /* + * Before blowing away PostmasterContext, save this bgworker's + * data where it can find it. + */ + MyBgworkerEntry = (BackgroundWorker *) + MemoryContextAlloc(TopMemoryContext, sizeof(BackgroundWorker)); + memcpy(MyBgworkerEntry, &rw->rw_worker, sizeof(BackgroundWorker)); + + /* Release postmaster's working memory context */ + MemoryContextSwitchTo(TopMemoryContext); + MemoryContextDelete(PostmasterContext); + PostmasterContext = NULL; + + StartBackgroundWorker(); + + exit(1); /* should not get here */ + break; +#endif + default: + /* in postmaster, fork successful ... */ + rw->rw_pid = worker_pid; + rw->rw_backend->pid = rw->rw_pid; + ReportBackgroundWorkerPID(rw); + /* add new worker to lists of backends */ + dlist_push_head(&BackendList, &rw->rw_backend->elem); +#ifdef EXEC_BACKEND + ShmemBackendArrayAdd(rw->rw_backend); +#endif + return true; + } + + return false; +} + +/* + * Does the current postmaster state require starting a worker with the + * specified start_time? + */ +static bool +bgworker_should_start_now(BgWorkerStartTime start_time) +{ + switch (pmState) + { + case PM_NO_CHILDREN: + case PM_WAIT_DEAD_END: + case PM_SHUTDOWN_2: + case PM_SHUTDOWN: + case PM_WAIT_BACKENDS: + case PM_STOP_BACKENDS: + break; + + case PM_RUN: + if (start_time == BgWorkerStart_RecoveryFinished) + return true; + /* fall through */ + + case PM_HOT_STANDBY: + if (start_time == BgWorkerStart_ConsistentState) + return true; + /* fall through */ + + case PM_RECOVERY: + case PM_STARTUP: + case PM_INIT: + if (start_time == BgWorkerStart_PostmasterStart) + return true; + /* fall through */ + + } + + return false; +} + +/* + * Allocate the Backend struct for a connected background worker, but don't + * add it to the list of backends just yet. + * + * On failure, return false without changing any worker state. + * + * Some info from the Backend is copied into the passed rw. + */ +static bool +assign_backendlist_entry(RegisteredBgWorker *rw) +{ + Backend *bn; + + /* + * Check that database state allows another connection. Currently the + * only possible failure is CAC_TOOMANY, so we just log an error message + * based on that rather than checking the error code precisely. + */ + if (canAcceptConnections(BACKEND_TYPE_BGWORKER) != CAC_OK) + { + ereport(LOG, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("no slot available for new worker process"))); + return false; + } + + /* + * Compute the cancel key that will be assigned to this session. 
We + * probably don't need cancel keys for background workers, but we'd better + * have something random in the field to prevent unfriendly people from + * sending cancels to them. + */ + if (!RandomCancelKey(&MyCancelKey)) + { + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not generate random cancel key"))); + return false; + } + + bn = malloc(sizeof(Backend)); + if (bn == NULL) + { + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + return false; + } + + bn->cancel_key = MyCancelKey; + bn->child_slot = MyPMChildSlot = AssignPostmasterChildSlot(); + bn->bkend_type = BACKEND_TYPE_BGWORKER; + bn->dead_end = false; + bn->bgworker_notify = false; + + rw->rw_backend = bn; + rw->rw_child_slot = bn->child_slot; + + return true; +} + +/* + * If the time is right, start background worker(s). + * + * As a side effect, the bgworker control variables are set or reset + * depending on whether more workers may need to be started. + * + * We limit the number of workers started per call, to avoid consuming the + * postmaster's attention for too long when many such requests are pending. + * As long as StartWorkerNeeded is true, ServerLoop will not block and will + * call this function again after dealing with any other issues. + */ +static void +maybe_start_bgworkers(void) +{ +#define MAX_BGWORKERS_TO_LAUNCH 100 + int num_launched = 0; + TimestampTz now = 0; + slist_mutable_iter iter; + + /* + * During crash recovery, we have no need to be called until the state + * transition out of recovery. + */ + if (FatalError) + { + StartWorkerNeeded = false; + HaveCrashedWorker = false; + return; + } + + /* Don't need to be called again unless we find a reason for it below */ + StartWorkerNeeded = false; + HaveCrashedWorker = false; + + slist_foreach_modify(iter, &BackgroundWorkerList) + { + RegisteredBgWorker *rw; + + rw = slist_container(RegisteredBgWorker, rw_lnode, iter.cur); + + /* ignore if already running */ + if (rw->rw_pid != 0) + continue; + + /* if marked for death, clean up and remove from list */ + if (rw->rw_terminate) + { + ForgetBackgroundWorker(&iter); + continue; + } + + /* + * If this worker has crashed previously, maybe it needs to be + * restarted (unless on registration it specified it doesn't want to + * be restarted at all). Check how long ago did a crash last happen. + * If the last crash is too recent, don't start it right away; let it + * be restarted once enough time has passed. + */ + if (rw->rw_crashed_at != 0) + { + if (rw->rw_worker.bgw_restart_time == BGW_NEVER_RESTART) + { + int notify_pid; + + notify_pid = rw->rw_worker.bgw_notify_pid; + + ForgetBackgroundWorker(&iter); + + /* Report worker is gone now. */ + if (notify_pid != 0) + kill(notify_pid, SIGUSR1); + + continue; + } + + /* read system time only when needed */ + if (now == 0) + now = GetCurrentTimestamp(); + + if (!TimestampDifferenceExceeds(rw->rw_crashed_at, now, + rw->rw_worker.bgw_restart_time * 1000)) + { + /* Set flag to remember that we have workers to start later */ + HaveCrashedWorker = true; + continue; + } + } + + if (bgworker_should_start_now(rw->rw_worker.bgw_start_time)) + { + /* reset crash time before trying to start worker */ + rw->rw_crashed_at = 0; + + /* + * Try to start the worker. + * + * On failure, give up processing workers for now, but set + * StartWorkerNeeded so we'll come back here on the next iteration + * of ServerLoop to try again. (We don't want to wait, because + * there might be additional ready-to-run workers.) 
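The RegisteredBgWorker entries that maybe_start_bgworkers() walks normally come from RegisterBackgroundWorker() calls made by a shared_preload_libraries module while the postmaster starts up. A sketch of such a registration is below; the module and entry-point names (demo_worker, demo_worker_main) are invented and the worker's main function is not shown, but the struct fields and flags follow the bgworker API used elsewhere in this file.

/* Sketch: registering a background worker from an extension's _PG_init(),
 * loaded via shared_preload_libraries. */
#include "postgres.h"
#include "fmgr.h"
#include "postmaster/bgworker.h"

PG_MODULE_MAGIC;

void		_PG_init(void);

void
_PG_init(void)
{
	BackgroundWorker worker;

	memset(&worker, 0, sizeof(worker));
	snprintf(worker.bgw_name, BGW_MAXLEN, "demo worker");
	snprintf(worker.bgw_type, BGW_MAXLEN, "demo worker");
	worker.bgw_flags = BGWORKER_SHMEM_ACCESS |
		BGWORKER_BACKEND_DATABASE_CONNECTION;
	worker.bgw_start_time = BgWorkerStart_RecoveryFinished;
	worker.bgw_restart_time = 60;	/* seconds to wait after a crash */
	snprintf(worker.bgw_library_name, BGW_MAXLEN, "demo_worker");
	snprintf(worker.bgw_function_name, BGW_MAXLEN, "demo_worker_main");
	worker.bgw_notify_pid = 0;		/* no backend wants notification */

	RegisterBackgroundWorker(&worker);
}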
We could set + * HaveCrashedWorker as well, since this worker is now marked + * crashed, but there's no need because the next run of this + * function will do that. + */ + if (!do_start_bgworker(rw)) + { + StartWorkerNeeded = true; + return; + } + + /* + * If we've launched as many workers as allowed, quit, but have + * ServerLoop call us again to look for additional ready-to-run + * workers. There might not be any, but we'll find out the next + * time we run. + */ + if (++num_launched >= MAX_BGWORKERS_TO_LAUNCH) + { + StartWorkerNeeded = true; + return; + } + } + } +} + +/* + * When a backend asks to be notified about worker state changes, we + * set a flag in its backend entry. The background worker machinery needs + * to know when such backends exit. + */ +bool +PostmasterMarkPIDForWorkerNotify(int pid) +{ + dlist_iter iter; + Backend *bp; + + dlist_foreach(iter, &BackendList) + { + bp = dlist_container(Backend, elem, iter.cur); + if (bp->pid == pid) + { + bp->bgworker_notify = true; + return true; + } + } + return false; +} + +#ifdef EXEC_BACKEND + +/* + * The following need to be available to the save/restore_backend_variables + * functions. They are marked NON_EXEC_STATIC in their home modules. + */ +extern slock_t *ShmemLock; +extern slock_t *ProcStructLock; +extern PGPROC *AuxiliaryProcs; +extern PMSignalData *PMSignalState; +extern pgsocket pgStatSock; +extern pg_time_t first_syslogger_file_time; + +#ifndef WIN32 +#define write_inheritable_socket(dest, src, childpid) ((*(dest) = (src)), true) +#define read_inheritable_socket(dest, src) (*(dest) = *(src)) +#else +static bool write_duplicated_handle(HANDLE *dest, HANDLE src, HANDLE child); +static bool write_inheritable_socket(InheritableSocket *dest, SOCKET src, + pid_t childPid); +static void read_inheritable_socket(SOCKET *dest, InheritableSocket *src); +#endif + + +/* Save critical backend variables into the BackendParameters struct */ +#ifndef WIN32 +static bool +save_backend_variables(BackendParameters *param, Port *port) +#else +static bool +save_backend_variables(BackendParameters *param, Port *port, + HANDLE childProcess, pid_t childPid) +#endif +{ + memcpy(¶m->port, port, sizeof(Port)); + if (!write_inheritable_socket(¶m->portsocket, port->sock, childPid)) + return false; + + strlcpy(param->DataDir, DataDir, MAXPGPATH); + + memcpy(¶m->ListenSocket, &ListenSocket, sizeof(ListenSocket)); + + param->MyCancelKey = MyCancelKey; + param->MyPMChildSlot = MyPMChildSlot; + +#ifdef WIN32 + param->ShmemProtectiveRegion = ShmemProtectiveRegion; +#endif + param->UsedShmemSegID = UsedShmemSegID; + param->UsedShmemSegAddr = UsedShmemSegAddr; + + param->ShmemLock = ShmemLock; + param->ShmemVariableCache = ShmemVariableCache; + param->ShmemBackendArray = ShmemBackendArray; + +#ifndef HAVE_SPINLOCKS + param->SpinlockSemaArray = SpinlockSemaArray; +#endif + param->NamedLWLockTrancheRequests = NamedLWLockTrancheRequests; + param->NamedLWLockTrancheArray = NamedLWLockTrancheArray; + param->MainLWLockArray = MainLWLockArray; + param->ProcStructLock = ProcStructLock; + param->ProcGlobal = ProcGlobal; + param->AuxiliaryProcs = AuxiliaryProcs; + param->PreparedXactProcs = PreparedXactProcs; + param->PMSignalState = PMSignalState; + if (!write_inheritable_socket(¶m->pgStatSock, pgStatSock, childPid)) + return false; + + param->PostmasterPid = PostmasterPid; + param->PgStartTime = PgStartTime; + param->PgReloadTime = PgReloadTime; + param->first_syslogger_file_time = first_syslogger_file_time; + + param->redirection_done = redirection_done; + 
param->IsBinaryUpgrade = IsBinaryUpgrade; + param->query_id_enabled = query_id_enabled; + param->max_safe_fds = max_safe_fds; + + param->MaxBackends = MaxBackends; + +#ifdef WIN32 + param->PostmasterHandle = PostmasterHandle; + if (!write_duplicated_handle(¶m->initial_signal_pipe, + pgwin32_create_signal_listener(childPid), + childProcess)) + return false; +#else + memcpy(¶m->postmaster_alive_fds, &postmaster_alive_fds, + sizeof(postmaster_alive_fds)); +#endif + + memcpy(¶m->syslogPipe, &syslogPipe, sizeof(syslogPipe)); + + strlcpy(param->my_exec_path, my_exec_path, MAXPGPATH); + + strlcpy(param->pkglib_path, pkglib_path, MAXPGPATH); + + return true; +} + + +#ifdef WIN32 +/* + * Duplicate a handle for usage in a child process, and write the child + * process instance of the handle to the parameter file. + */ +static bool +write_duplicated_handle(HANDLE *dest, HANDLE src, HANDLE childProcess) +{ + HANDLE hChild = INVALID_HANDLE_VALUE; + + if (!DuplicateHandle(GetCurrentProcess(), + src, + childProcess, + &hChild, + 0, + TRUE, + DUPLICATE_CLOSE_SOURCE | DUPLICATE_SAME_ACCESS)) + { + ereport(LOG, + (errmsg_internal("could not duplicate handle to be written to backend parameter file: error code %lu", + GetLastError()))); + return false; + } + + *dest = hChild; + return true; +} + +/* + * Duplicate a socket for usage in a child process, and write the resulting + * structure to the parameter file. + * This is required because a number of LSPs (Layered Service Providers) very + * common on Windows (antivirus, firewalls, download managers etc) break + * straight socket inheritance. + */ +static bool +write_inheritable_socket(InheritableSocket *dest, SOCKET src, pid_t childpid) +{ + dest->origsocket = src; + if (src != 0 && src != PGINVALID_SOCKET) + { + /* Actual socket */ + if (WSADuplicateSocket(src, childpid, &dest->wsainfo) != 0) + { + ereport(LOG, + (errmsg("could not duplicate socket %d for use in backend: error code %d", + (int) src, WSAGetLastError()))); + return false; + } + } + return true; +} + +/* + * Read a duplicate socket structure back, and get the socket descriptor. + */ +static void +read_inheritable_socket(SOCKET *dest, InheritableSocket *src) +{ + SOCKET s; + + if (src->origsocket == PGINVALID_SOCKET || src->origsocket == 0) + { + /* Not a real socket! */ + *dest = src->origsocket; + } + else + { + /* Actual socket, so create from structure */ + s = WSASocket(FROM_PROTOCOL_INFO, + FROM_PROTOCOL_INFO, + FROM_PROTOCOL_INFO, + &src->wsainfo, + 0, + 0); + if (s == INVALID_SOCKET) + { + write_stderr("could not create inherited socket: error code %d\n", + WSAGetLastError()); + exit(1); + } + *dest = s; + + /* + * To make sure we don't get two references to the same socket, close + * the original one. (This would happen when inheritance actually + * works.. 
+ */ + closesocket(src->origsocket); + } +} +#endif + +static void +read_backend_variables(char *id, Port *port) +{ + BackendParameters param; + +#ifndef WIN32 + /* Non-win32 implementation reads from file */ + FILE *fp; + + /* Open file */ + fp = AllocateFile(id, PG_BINARY_R); + if (!fp) + { + write_stderr("could not open backend variables file \"%s\": %s\n", + id, strerror(errno)); + exit(1); + } + + if (fread(¶m, sizeof(param), 1, fp) != 1) + { + write_stderr("could not read from backend variables file \"%s\": %s\n", + id, strerror(errno)); + exit(1); + } + + /* Release file */ + FreeFile(fp); + if (unlink(id) != 0) + { + write_stderr("could not remove file \"%s\": %s\n", + id, strerror(errno)); + exit(1); + } +#else + /* Win32 version uses mapped file */ + HANDLE paramHandle; + BackendParameters *paramp; + +#ifdef _WIN64 + paramHandle = (HANDLE) _atoi64(id); +#else + paramHandle = (HANDLE) atol(id); +#endif + paramp = MapViewOfFile(paramHandle, FILE_MAP_READ, 0, 0, 0); + if (!paramp) + { + write_stderr("could not map view of backend variables: error code %lu\n", + GetLastError()); + exit(1); + } + + memcpy(¶m, paramp, sizeof(BackendParameters)); + + if (!UnmapViewOfFile(paramp)) + { + write_stderr("could not unmap view of backend variables: error code %lu\n", + GetLastError()); + exit(1); + } + + if (!CloseHandle(paramHandle)) + { + write_stderr("could not close handle to backend parameter variables: error code %lu\n", + GetLastError()); + exit(1); + } +#endif + + restore_backend_variables(¶m, port); +} + +/* Restore critical backend variables from the BackendParameters struct */ +static void +restore_backend_variables(BackendParameters *param, Port *port) +{ + memcpy(port, ¶m->port, sizeof(Port)); + read_inheritable_socket(&port->sock, ¶m->portsocket); + + SetDataDir(param->DataDir); + + memcpy(&ListenSocket, ¶m->ListenSocket, sizeof(ListenSocket)); + + MyCancelKey = param->MyCancelKey; + MyPMChildSlot = param->MyPMChildSlot; + +#ifdef WIN32 + ShmemProtectiveRegion = param->ShmemProtectiveRegion; +#endif + UsedShmemSegID = param->UsedShmemSegID; + UsedShmemSegAddr = param->UsedShmemSegAddr; + + ShmemLock = param->ShmemLock; + ShmemVariableCache = param->ShmemVariableCache; + ShmemBackendArray = param->ShmemBackendArray; + +#ifndef HAVE_SPINLOCKS + SpinlockSemaArray = param->SpinlockSemaArray; +#endif + NamedLWLockTrancheRequests = param->NamedLWLockTrancheRequests; + NamedLWLockTrancheArray = param->NamedLWLockTrancheArray; + MainLWLockArray = param->MainLWLockArray; + ProcStructLock = param->ProcStructLock; + ProcGlobal = param->ProcGlobal; + AuxiliaryProcs = param->AuxiliaryProcs; + PreparedXactProcs = param->PreparedXactProcs; + PMSignalState = param->PMSignalState; + read_inheritable_socket(&pgStatSock, ¶m->pgStatSock); + + PostmasterPid = param->PostmasterPid; + PgStartTime = param->PgStartTime; + PgReloadTime = param->PgReloadTime; + first_syslogger_file_time = param->first_syslogger_file_time; + + redirection_done = param->redirection_done; + IsBinaryUpgrade = param->IsBinaryUpgrade; + query_id_enabled = param->query_id_enabled; + max_safe_fds = param->max_safe_fds; + + MaxBackends = param->MaxBackends; + +#ifdef WIN32 + PostmasterHandle = param->PostmasterHandle; + pgwin32_initial_signal_pipe = param->initial_signal_pipe; +#else + memcpy(&postmaster_alive_fds, ¶m->postmaster_alive_fds, + sizeof(postmaster_alive_fds)); +#endif + + memcpy(&syslogPipe, ¶m->syslogPipe, sizeof(syslogPipe)); + + strlcpy(my_exec_path, param->my_exec_path, MAXPGPATH); + + strlcpy(pkglib_path, 
param->pkglib_path, MAXPGPATH); + + /* + * We need to restore fd.c's counts of externally-opened FDs; to avoid + * confusion, be sure to do this after restoring max_safe_fds. (Note: + * BackendInitialize will handle this for port->sock.) + */ +#ifndef WIN32 + if (postmaster_alive_fds[0] >= 0) + ReserveExternalFD(); + if (postmaster_alive_fds[1] >= 0) + ReserveExternalFD(); +#endif + if (pgStatSock != PGINVALID_SOCKET) + ReserveExternalFD(); +} + + +Size +ShmemBackendArraySize(void) +{ + return mul_size(MaxLivePostmasterChildren(), sizeof(Backend)); +} + +void +ShmemBackendArrayAllocation(void) +{ + Size size = ShmemBackendArraySize(); + + ShmemBackendArray = (Backend *) ShmemAlloc(size); + /* Mark all slots as empty */ + memset(ShmemBackendArray, 0, size); +} + +static void +ShmemBackendArrayAdd(Backend *bn) +{ + /* The array slot corresponding to my PMChildSlot should be free */ + int i = bn->child_slot - 1; + + Assert(ShmemBackendArray[i].pid == 0); + ShmemBackendArray[i] = *bn; +} + +static void +ShmemBackendArrayRemove(Backend *bn) +{ + int i = bn->child_slot - 1; + + Assert(ShmemBackendArray[i].pid == bn->pid); + /* Mark the slot as empty */ + ShmemBackendArray[i].pid = 0; +} +#endif /* EXEC_BACKEND */ + + +#ifdef WIN32 + +/* + * Subset implementation of waitpid() for Windows. We assume pid is -1 + * (that is, check all child processes) and options is WNOHANG (don't wait). + */ +static pid_t +waitpid(pid_t pid, int *exitstatus, int options) +{ + DWORD dwd; + ULONG_PTR key; + OVERLAPPED *ovl; + + /* + * Check if there are any dead children. If there are, return the pid of + * the first one that died. + */ + if (GetQueuedCompletionStatus(win32ChildQueue, &dwd, &key, &ovl, 0)) + { + *exitstatus = (int) key; + return dwd; + } + + return -1; +} + +/* + * Note! Code below executes on a thread pool! All operations must + * be thread safe! Note that elog() and friends must *not* be used. + */ +static void WINAPI +pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired) +{ + win32_deadchild_waitinfo *childinfo = (win32_deadchild_waitinfo *) lpParameter; + DWORD exitcode; + + if (TimerOrWaitFired) + return; /* timeout. Should never happen, since we use + * INFINITE as timeout value. */ + + /* + * Remove handle from wait - required even though it's set to wait only + * once + */ + UnregisterWaitEx(childinfo->waitHandle, NULL); + + if (!GetExitCodeProcess(childinfo->procHandle, &exitcode)) + { + /* + * Should never happen. Inform user and set a fixed exitcode. + */ + write_stderr("could not read exit code for process\n"); + exitcode = 255; + } + + if (!PostQueuedCompletionStatus(win32ChildQueue, childinfo->procId, (ULONG_PTR) exitcode, NULL)) + write_stderr("could not post child completion status\n"); + + /* + * Handle is per-process, so we close it here instead of in the + * originating thread + */ + CloseHandle(childinfo->procHandle); + + /* + * Free struct that was allocated before the call to + * RegisterWaitForSingleObject() + */ + free(childinfo); + + /* Queue SIGCHLD signal */ + pg_queue_signal(SIGCHLD); +} +#endif /* WIN32 */ + +/* + * Initialize one and only handle for monitoring postmaster death. + * + * Called once in the postmaster, so that child processes can subsequently + * monitor if their parent is dead. + */ +static void +InitPostmasterDeathWatchHandle(void) +{ +#ifndef WIN32 + + /* + * Create a pipe. Postmaster holds the write end of the pipe open + * (POSTMASTER_FD_OWN), and children hold the read end. 
Children can pass + * the read file descriptor to select() to wake up in case postmaster + * dies, or check for postmaster death with a (read() == 0). Children must + * close the write end as soon as possible after forking, because EOF + * won't be signaled in the read end until all processes have closed the + * write fd. That is taken care of in ClosePostmasterPorts(). + */ + Assert(MyProcPid == PostmasterPid); + if (pipe(postmaster_alive_fds) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg_internal("could not create pipe to monitor postmaster death: %m"))); + + /* Notify fd.c that we've eaten two FDs for the pipe. */ + ReserveExternalFD(); + ReserveExternalFD(); + + /* + * Set O_NONBLOCK to allow testing for the fd's presence with a read() + * call. + */ + if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1) + ereport(FATAL, + (errcode_for_socket_access(), + errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m"))); +#else + + /* + * On Windows, we use a process handle for the same purpose. + */ + if (DuplicateHandle(GetCurrentProcess(), + GetCurrentProcess(), + GetCurrentProcess(), + &PostmasterHandle, + 0, + TRUE, + DUPLICATE_SAME_ACCESS) == 0) + ereport(FATAL, + (errmsg_internal("could not duplicate postmaster handle: error code %lu", + GetLastError()))); +#endif /* WIN32 */ +} diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c new file mode 100644 index 0000000..69077bd --- /dev/null +++ b/src/backend/postmaster/startup.c @@ -0,0 +1,283 @@ +/*------------------------------------------------------------------------- + * + * startup.c + * + * The Startup process initialises the server and performs any recovery + * actions that have been specified. Notice that there is no "main loop" + * since the Startup process ends as soon as initialisation is complete. + * (in standby mode, one can think of the replay loop as a main loop, + * though.) + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/postmaster/startup.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlog.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/interrupt.h" +#include "postmaster/startup.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/pmsignal.h" +#include "storage/procsignal.h" +#include "storage/standby.h" +#include "utils/guc.h" +#include "utils/timeout.h" + + +#ifndef USE_POSTMASTER_DEATH_SIGNAL +/* + * On systems that need to make a system call to find out if the postmaster has + * gone away, we'll do so only every Nth call to HandleStartupProcInterrupts(). + * This only affects how long it takes us to detect the condition while we're + * busy replaying WAL. Latch waits and similar which should react immediately + * through the usual techniques. + */ +#define POSTMASTER_POLL_RATE_LIMIT 1024 +#endif + +/* + * Flags set by interrupt handlers for later service in the redo loop. + */ +static volatile sig_atomic_t got_SIGHUP = false; +static volatile sig_atomic_t shutdown_requested = false; +static volatile sig_atomic_t promote_signaled = false; + +/* + * Flag set when executing a restore command, to tell SIGTERM signal handler + * that it's safe to just proc_exit. 
+ */ +static volatile sig_atomic_t in_restore_command = false; + +/* Signal handlers */ +static void StartupProcTriggerHandler(SIGNAL_ARGS); +static void StartupProcSigHupHandler(SIGNAL_ARGS); + +/* Callbacks */ +static void StartupProcExit(int code, Datum arg); + + +/* -------------------------------- + * signal handler routines + * -------------------------------- + */ + +/* SIGUSR2: set flag to finish recovery */ +static void +StartupProcTriggerHandler(SIGNAL_ARGS) +{ + int save_errno = errno; + + promote_signaled = true; + WakeupRecovery(); + + errno = save_errno; +} + +/* SIGHUP: set flag to re-read config file at next convenient time */ +static void +StartupProcSigHupHandler(SIGNAL_ARGS) +{ + int save_errno = errno; + + got_SIGHUP = true; + WakeupRecovery(); + + errno = save_errno; +} + +/* SIGTERM: set flag to abort redo and exit */ +static void +StartupProcShutdownHandler(SIGNAL_ARGS) +{ + int save_errno = errno; + + if (in_restore_command) + proc_exit(1); + else + shutdown_requested = true; + WakeupRecovery(); + + errno = save_errno; +} + +/* + * Re-read the config file. + * + * If one of the critical walreceiver options has changed, flag xlog.c + * to restart it. + */ +static void +StartupRereadConfig(void) +{ + char *conninfo = pstrdup(PrimaryConnInfo); + char *slotname = pstrdup(PrimarySlotName); + bool tempSlot = wal_receiver_create_temp_slot; + bool conninfoChanged; + bool slotnameChanged; + bool tempSlotChanged = false; + + ProcessConfigFile(PGC_SIGHUP); + + conninfoChanged = strcmp(conninfo, PrimaryConnInfo) != 0; + slotnameChanged = strcmp(slotname, PrimarySlotName) != 0; + + /* + * wal_receiver_create_temp_slot is used only when we have no slot + * configured. We do not need to track this change if it has no effect. + */ + if (!slotnameChanged && strcmp(PrimarySlotName, "") == 0) + tempSlotChanged = tempSlot != wal_receiver_create_temp_slot; + pfree(conninfo); + pfree(slotname); + + if (conninfoChanged || slotnameChanged || tempSlotChanged) + StartupRequestWalReceiverRestart(); +} + +/* Handle various signals that might be sent to the startup process */ +void +HandleStartupProcInterrupts(void) +{ +#ifdef POSTMASTER_POLL_RATE_LIMIT + static uint32 postmaster_poll_count = 0; +#endif + + /* + * Process any requests or signals received recently. + */ + if (got_SIGHUP) + { + got_SIGHUP = false; + StartupRereadConfig(); + } + + /* + * Check if we were requested to exit without finishing recovery. + */ + if (shutdown_requested) + proc_exit(1); + + /* + * Emergency bailout if postmaster has died. This is to avoid the + * necessity for manual cleanup of all postmaster children. Do this less + * frequently on systems for which we don't have signals to make that + * cheap. 
+ */ + if (IsUnderPostmaster && +#ifdef POSTMASTER_POLL_RATE_LIMIT + postmaster_poll_count++ % POSTMASTER_POLL_RATE_LIMIT == 0 && +#endif + !PostmasterIsAlive()) + exit(1); + + /* Process barrier events */ + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); +} + + +/* -------------------------------- + * signal handler routines + * -------------------------------- + */ +static void +StartupProcExit(int code, Datum arg) +{ + /* Shutdown the recovery environment */ + if (standbyState != STANDBY_DISABLED) + ShutdownRecoveryTransactionEnvironment(); +} + + +/* ---------------------------------- + * Startup Process main entry point + * ---------------------------------- + */ +void +StartupProcessMain(void) +{ + /* Arrange to clean up at startup process exit */ + on_shmem_exit(StartupProcExit, 0); + + /* + * Properly accept or ignore signals the postmaster might send us. + */ + pqsignal(SIGHUP, StartupProcSigHupHandler); /* reload config file */ + pqsignal(SIGINT, SIG_IGN); /* ignore query cancel */ + pqsignal(SIGTERM, StartupProcShutdownHandler); /* request shutdown */ + /* SIGQUIT handler was already set up by InitPostmasterChild */ + InitializeTimeouts(); /* establishes SIGALRM handler */ + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, StartupProcTriggerHandler); + + /* + * Reset some signals that are accepted by postmaster but not here + */ + pqsignal(SIGCHLD, SIG_DFL); + + /* + * Register timeouts needed for standby mode + */ + RegisterTimeout(STANDBY_DEADLOCK_TIMEOUT, StandbyDeadLockHandler); + RegisterTimeout(STANDBY_TIMEOUT, StandbyTimeoutHandler); + RegisterTimeout(STANDBY_LOCK_TIMEOUT, StandbyLockTimeoutHandler); + + /* + * Unblock signals (they were blocked when the postmaster forked us) + */ + PG_SETMASK(&UnBlockSig); + + /* + * Do what we came for. + */ + StartupXLOG(); + + /* + * Exit normally. Exit code 0 tells postmaster that we completed recovery + * successfully. + */ + proc_exit(0); +} + +void +PreRestoreCommand(void) +{ + /* + * Set in_restore_command to tell the signal handler that we should exit + * right away on SIGTERM. We know that we're at a safe point to do that. + * Check if we had already received the signal, so that we don't miss a + * shutdown request received just before this. + */ + in_restore_command = true; + if (shutdown_requested) + proc_exit(1); +} + +void +PostRestoreCommand(void) +{ + in_restore_command = false; +} + +bool +IsPromoteSignaled(void) +{ + return promote_signaled; +} + +void +ResetPromoteSignaled(void) +{ + promote_signaled = false; +} diff --git a/src/backend/postmaster/syslogger.c b/src/backend/postmaster/syslogger.c new file mode 100644 index 0000000..cad43bd --- /dev/null +++ b/src/backend/postmaster/syslogger.c @@ -0,0 +1,1566 @@ +/*------------------------------------------------------------------------- + * + * syslogger.c + * + * The system logger (syslogger) appeared in Postgres 8.0. It catches all + * stderr output from the postmaster, backends, and other subprocesses + * by redirecting to a pipe, and writes it to a set of logfiles. + * It's possible to have size and age limits for the logfile configured + * in postgresql.conf. If these limits are reached or passed, the + * current logfile is closed and a new one is created (rotated). + * The logfiles are stored in a subdirectory (configurable in + * postgresql.conf), using a user-selectable naming scheme. 
+ * + * Author: Andreas Pflug <pgadmin@pse-consulting.de> + * + * Copyright (c) 2004-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/postmaster/syslogger.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <fcntl.h> +#include <limits.h> +#include <signal.h> +#include <time.h> +#include <unistd.h> +#include <sys/stat.h> +#include <sys/time.h> + +#include "common/file_perm.h" +#include "lib/stringinfo.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "nodes/pg_list.h" +#include "pgstat.h" +#include "pgtime.h" +#include "postmaster/fork_process.h" +#include "postmaster/interrupt.h" +#include "postmaster/postmaster.h" +#include "postmaster/syslogger.h" +#include "storage/dsm.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/pg_shmem.h" +#include "tcop/tcopprot.h" +#include "utils/guc.h" +#include "utils/ps_status.h" +#include "utils/timestamp.h" + +/* + * We read() into a temp buffer twice as big as a chunk, so that any fragment + * left after processing can be moved down to the front and we'll still have + * room to read a full chunk. + */ +#define READ_BUF_SIZE (2 * PIPE_CHUNK_SIZE) + +/* Log rotation signal file path, relative to $PGDATA */ +#define LOGROTATE_SIGNAL_FILE "logrotate" + + +/* + * GUC parameters. Logging_collector cannot be changed after postmaster + * start, but the rest can change at SIGHUP. + */ +bool Logging_collector = false; +int Log_RotationAge = HOURS_PER_DAY * MINS_PER_HOUR; +int Log_RotationSize = 10 * 1024; +char *Log_directory = NULL; +char *Log_filename = NULL; +bool Log_truncate_on_rotation = false; +int Log_file_mode = S_IRUSR | S_IWUSR; + +extern bool redirection_done; + +/* + * Private state + */ +static pg_time_t next_rotation_time; +static bool pipe_eof_seen = false; +static bool rotation_disabled = false; +static FILE *syslogFile = NULL; +static FILE *csvlogFile = NULL; +NON_EXEC_STATIC pg_time_t first_syslogger_file_time = 0; +static char *last_file_name = NULL; +static char *last_csv_file_name = NULL; + +/* + * Buffers for saving partial messages from different backends. + * + * Keep NBUFFER_LISTS lists of these, with the entry for a given source pid + * being in the list numbered (pid % NBUFFER_LISTS), so as to cut down on + * the number of entries we have to examine for any one incoming message. + * There must never be more than one entry for the same source pid. + * + * An inactive buffer is not removed from its list, just held for re-use. + * An inactive buffer has pid == 0 and undefined contents of data. + */ +typedef struct +{ + int32 pid; /* PID of source process */ + StringInfoData data; /* accumulated data, as a StringInfo */ +} save_buffer; + +#define NBUFFER_LISTS 256 +static List *buffer_lists[NBUFFER_LISTS]; + +/* These must be exported for EXEC_BACKEND case ... annoying */ +#ifndef WIN32 +int syslogPipe[2] = {-1, -1}; +#else +HANDLE syslogPipe[2] = {0, 0}; +#endif + +#ifdef WIN32 +static HANDLE threadHandle = 0; +static CRITICAL_SECTION sysloggerSection; +#endif + +/* + * Flags set by interrupt handlers for later service in the main loop. 
+ */ +static volatile sig_atomic_t rotation_requested = false; + + +/* Local subroutines */ +#ifdef EXEC_BACKEND +static pid_t syslogger_forkexec(void); +static void syslogger_parseArgs(int argc, char *argv[]); +#endif +NON_EXEC_STATIC void SysLoggerMain(int argc, char *argv[]) pg_attribute_noreturn(); +static void process_pipe_input(char *logbuffer, int *bytes_in_logbuffer); +static void flush_pipe_input(char *logbuffer, int *bytes_in_logbuffer); +static FILE *logfile_open(const char *filename, const char *mode, + bool allow_errors); + +#ifdef WIN32 +static unsigned int __stdcall pipeThread(void *arg); +#endif +static void logfile_rotate(bool time_based_rotation, int size_rotation_for); +static char *logfile_getname(pg_time_t timestamp, const char *suffix); +static void set_next_rotation_time(void); +static void sigUsr1Handler(SIGNAL_ARGS); +static void update_metainfo_datafile(void); + + +/* + * Main entry point for syslogger process + * argc/argv parameters are valid only in EXEC_BACKEND case. + */ +NON_EXEC_STATIC void +SysLoggerMain(int argc, char *argv[]) +{ +#ifndef WIN32 + char logbuffer[READ_BUF_SIZE]; + int bytes_in_logbuffer = 0; +#endif + char *currentLogDir; + char *currentLogFilename; + int currentLogRotationAge; + pg_time_t now; + WaitEventSet *wes; + + now = MyStartTime; + +#ifdef EXEC_BACKEND + syslogger_parseArgs(argc, argv); +#endif /* EXEC_BACKEND */ + + MyBackendType = B_LOGGER; + init_ps_display(NULL); + + /* + * If we restarted, our stderr is already redirected into our own input + * pipe. This is of course pretty useless, not to mention that it + * interferes with detecting pipe EOF. Point stderr to /dev/null. This + * assumes that all interesting messages generated in the syslogger will + * come through elog.c and will be sent to write_syslogger_file. + */ + if (redirection_done) + { + int fd = open(DEVNULL, O_WRONLY, 0); + + /* + * The closes might look redundant, but they are not: we want to be + * darn sure the pipe gets closed even if the open failed. We can + * survive running with stderr pointing nowhere, but we can't afford + * to have extra pipe input descriptors hanging around. + * + * As we're just trying to reset these to go to DEVNULL, there's not + * much point in checking for failure from the close/dup2 calls here, + * if they fail then presumably the file descriptors are closed and + * any writes will go into the bitbucket anyway. + */ + close(fileno(stdout)); + close(fileno(stderr)); + if (fd != -1) + { + (void) dup2(fd, fileno(stdout)); + (void) dup2(fd, fileno(stderr)); + close(fd); + } + } + + /* + * Syslogger's own stderr can't be the syslogPipe, so set it back to text + * mode if we didn't just close it. (It was set to binary in + * SubPostmasterMain). + */ +#ifdef WIN32 + else + _setmode(_fileno(stderr), _O_TEXT); +#endif + + /* + * Also close our copy of the write end of the pipe. This is needed to + * ensure we can detect pipe EOF correctly. (But note that in the restart + * case, the postmaster already did this.) + */ +#ifndef WIN32 + if (syslogPipe[1] >= 0) + close(syslogPipe[1]); + syslogPipe[1] = -1; +#else + if (syslogPipe[1]) + CloseHandle(syslogPipe[1]); + syslogPipe[1] = 0; +#endif + + /* + * Properly accept or ignore signals the postmaster might send us + * + * Note: we ignore all termination signals, and instead exit only when all + * upstream processes are gone, to ensure we don't miss any dying gasps of + * broken backends... 
+ */ + + pqsignal(SIGHUP, SignalHandlerForConfigReload); /* set flag to read config + * file */ + pqsignal(SIGINT, SIG_IGN); + pqsignal(SIGTERM, SIG_IGN); + pqsignal(SIGQUIT, SIG_IGN); + pqsignal(SIGALRM, SIG_IGN); + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, sigUsr1Handler); /* request log rotation */ + pqsignal(SIGUSR2, SIG_IGN); + + /* + * Reset some signals that are accepted by postmaster but not here + */ + pqsignal(SIGCHLD, SIG_DFL); + + PG_SETMASK(&UnBlockSig); + +#ifdef WIN32 + /* Fire up separate data transfer thread */ + InitializeCriticalSection(&sysloggerSection); + EnterCriticalSection(&sysloggerSection); + + threadHandle = (HANDLE) _beginthreadex(NULL, 0, pipeThread, NULL, 0, NULL); + if (threadHandle == 0) + elog(FATAL, "could not create syslogger data transfer thread: %m"); +#endif /* WIN32 */ + + /* + * Remember active logfiles' name(s). We recompute 'em from the reference + * time because passing down just the pg_time_t is a lot cheaper than + * passing a whole file path in the EXEC_BACKEND case. + */ + last_file_name = logfile_getname(first_syslogger_file_time, NULL); + if (csvlogFile != NULL) + last_csv_file_name = logfile_getname(first_syslogger_file_time, ".csv"); + + /* remember active logfile parameters */ + currentLogDir = pstrdup(Log_directory); + currentLogFilename = pstrdup(Log_filename); + currentLogRotationAge = Log_RotationAge; + /* set next planned rotation time */ + set_next_rotation_time(); + update_metainfo_datafile(); + + /* + * Reset whereToSendOutput, as the postmaster will do (but hasn't yet, at + * the point where we forked). This prevents duplicate output of messages + * from syslogger itself. + */ + whereToSendOutput = DestNone; + + /* + * Set up a reusable WaitEventSet object we'll use to wait for our latch, + * and (except on Windows) our socket. + * + * Unlike all other postmaster child processes, we'll ignore postmaster + * death because we want to collect final log output from all backends and + * then exit last. We'll do that by running until we see EOF on the + * syslog pipe, which implies that all other backends have exited + * (including the postmaster). + */ + wes = CreateWaitEventSet(CurrentMemoryContext, 2); + AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); +#ifndef WIN32 + AddWaitEventToSet(wes, WL_SOCKET_READABLE, syslogPipe[0], NULL, NULL); +#endif + + /* main worker loop */ + for (;;) + { + bool time_based_rotation = false; + int size_rotation_for = 0; + long cur_timeout; + WaitEvent event; + +#ifndef WIN32 + int rc; +#endif + + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + /* + * Process any requests or signals received recently. + */ + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + + /* + * Check if the log directory or filename pattern changed in + * postgresql.conf. If so, force rotation to make sure we're + * writing the logfiles in the right place. + */ + if (strcmp(Log_directory, currentLogDir) != 0) + { + pfree(currentLogDir); + currentLogDir = pstrdup(Log_directory); + rotation_requested = true; + + /* + * Also, create new directory if not present; ignore errors + */ + (void) MakePGDirectory(Log_directory); + } + if (strcmp(Log_filename, currentLogFilename) != 0) + { + pfree(currentLogFilename); + currentLogFilename = pstrdup(Log_filename); + rotation_requested = true; + } + + /* + * Force a rotation if CSVLOG output was just turned on or off and + * we need to open or close csvlogFile accordingly. 
+ */ + if (((Log_destination & LOG_DESTINATION_CSVLOG) != 0) != + (csvlogFile != NULL)) + rotation_requested = true; + + /* + * If rotation time parameter changed, reset next rotation time, + * but don't immediately force a rotation. + */ + if (currentLogRotationAge != Log_RotationAge) + { + currentLogRotationAge = Log_RotationAge; + set_next_rotation_time(); + } + + /* + * If we had a rotation-disabling failure, re-enable rotation + * attempts after SIGHUP, and force one immediately. + */ + if (rotation_disabled) + { + rotation_disabled = false; + rotation_requested = true; + } + + /* + * Force rewriting last log filename when reloading configuration. + * Even if rotation_requested is false, log_destination may have + * been changed and we don't want to wait the next file rotation. + */ + update_metainfo_datafile(); + } + + if (Log_RotationAge > 0 && !rotation_disabled) + { + /* Do a logfile rotation if it's time */ + now = (pg_time_t) time(NULL); + if (now >= next_rotation_time) + rotation_requested = time_based_rotation = true; + } + + if (!rotation_requested && Log_RotationSize > 0 && !rotation_disabled) + { + /* Do a rotation if file is too big */ + if (ftell(syslogFile) >= Log_RotationSize * 1024L) + { + rotation_requested = true; + size_rotation_for |= LOG_DESTINATION_STDERR; + } + if (csvlogFile != NULL && + ftell(csvlogFile) >= Log_RotationSize * 1024L) + { + rotation_requested = true; + size_rotation_for |= LOG_DESTINATION_CSVLOG; + } + } + + if (rotation_requested) + { + /* + * Force rotation when both values are zero. It means the request + * was sent by pg_rotate_logfile() or "pg_ctl logrotate". + */ + if (!time_based_rotation && size_rotation_for == 0) + size_rotation_for = LOG_DESTINATION_STDERR | LOG_DESTINATION_CSVLOG; + logfile_rotate(time_based_rotation, size_rotation_for); + } + + /* + * Calculate time till next time-based rotation, so that we don't + * sleep longer than that. We assume the value of "now" obtained + * above is still close enough. Note we can't make this calculation + * until after calling logfile_rotate(), since it will advance + * next_rotation_time. + * + * Also note that we need to beware of overflow in calculation of the + * timeout: with large settings of Log_RotationAge, next_rotation_time + * could be more than INT_MAX msec in the future. In that case we'll + * wait no more than INT_MAX msec, and try again. + */ + if (Log_RotationAge > 0 && !rotation_disabled) + { + pg_time_t delay; + + delay = next_rotation_time - now; + if (delay > 0) + { + if (delay > INT_MAX / 1000) + delay = INT_MAX / 1000; + cur_timeout = delay * 1000L; /* msec */ + } + else + cur_timeout = 0; + } + else + cur_timeout = -1L; + + /* + * Sleep until there's something to do + */ +#ifndef WIN32 + rc = WaitEventSetWait(wes, cur_timeout, &event, 1, + WAIT_EVENT_SYSLOGGER_MAIN); + + if (rc == 1 && event.events == WL_SOCKET_READABLE) + { + int bytesRead; + + bytesRead = read(syslogPipe[0], + logbuffer + bytes_in_logbuffer, + sizeof(logbuffer) - bytes_in_logbuffer); + if (bytesRead < 0) + { + if (errno != EINTR) + ereport(LOG, + (errcode_for_socket_access(), + errmsg("could not read from logger pipe: %m"))); + } + else if (bytesRead > 0) + { + bytes_in_logbuffer += bytesRead; + process_pipe_input(logbuffer, &bytes_in_logbuffer); + continue; + } + else + { + /* + * Zero bytes read when select() is saying read-ready means + * EOF on the pipe: that is, there are no longer any processes + * with the pipe write end open. 
Therefore, the postmaster + * and all backends are shut down, and we are done. + */ + pipe_eof_seen = true; + + /* if there's any data left then force it out now */ + flush_pipe_input(logbuffer, &bytes_in_logbuffer); + } + } +#else /* WIN32 */ + + /* + * On Windows we leave it to a separate thread to transfer data and + * detect pipe EOF. The main thread just wakes up to handle SIGHUP + * and rotation conditions. + * + * Server code isn't generally thread-safe, so we ensure that only one + * of the threads is active at a time by entering the critical section + * whenever we're not sleeping. + */ + LeaveCriticalSection(&sysloggerSection); + + (void) WaitEventSetWait(wes, cur_timeout, &event, 1, + WAIT_EVENT_SYSLOGGER_MAIN); + + EnterCriticalSection(&sysloggerSection); +#endif /* WIN32 */ + + if (pipe_eof_seen) + { + /* + * seeing this message on the real stderr is annoying - so we make + * it DEBUG1 to suppress in normal use. + */ + ereport(DEBUG1, + (errmsg_internal("logger shutting down"))); + + /* + * Normal exit from the syslogger is here. Note that we + * deliberately do not close syslogFile before exiting; this is to + * allow for the possibility of elog messages being generated + * inside proc_exit. Regular exit() will take care of flushing + * and closing stdio channels. + */ + proc_exit(0); + } + } +} + +/* + * Postmaster subroutine to start a syslogger subprocess. + */ +int +SysLogger_Start(void) +{ + pid_t sysloggerPid; + char *filename; + + if (!Logging_collector) + return 0; + + /* + * If first time through, create the pipe which will receive stderr + * output. + * + * If the syslogger crashes and needs to be restarted, we continue to use + * the same pipe (indeed must do so, since extant backends will be writing + * into that pipe). + * + * This means the postmaster must continue to hold the read end of the + * pipe open, so we can pass it down to the reincarnated syslogger. This + * is a bit klugy but we have little choice. + * + * Also note that we don't bother counting the pipe FDs by calling + * Reserve/ReleaseExternalFD. There's no real need to account for them + * accurately in the postmaster or syslogger process, and both ends of the + * pipe will wind up closed in all other postmaster children. + */ +#ifndef WIN32 + if (syslogPipe[0] < 0) + { + if (pipe(syslogPipe) < 0) + ereport(FATAL, + (errcode_for_socket_access(), + errmsg("could not create pipe for syslog: %m"))); + } +#else + if (!syslogPipe[0]) + { + SECURITY_ATTRIBUTES sa; + + memset(&sa, 0, sizeof(SECURITY_ATTRIBUTES)); + sa.nLength = sizeof(SECURITY_ATTRIBUTES); + sa.bInheritHandle = TRUE; + + if (!CreatePipe(&syslogPipe[0], &syslogPipe[1], &sa, 32768)) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not create pipe for syslog: %m"))); + } +#endif + + /* + * Create log directory if not present; ignore errors + */ + (void) MakePGDirectory(Log_directory); + + /* + * The initial logfile is created right in the postmaster, to verify that + * the Log_directory is writable. We save the reference time so that the + * syslogger child process can recompute this file name. + * + * It might look a bit strange to re-do this during a syslogger restart, + * but we must do so since the postmaster closed syslogFile after the + * previous fork (and remembering that old file wouldn't be right anyway). + * Note we always append here, we won't overwrite any existing file. This + * is consistent with the normal rules, because by definition this is not + * a time-based rotation. 
+ */ + first_syslogger_file_time = time(NULL); + + filename = logfile_getname(first_syslogger_file_time, NULL); + + syslogFile = logfile_open(filename, "a", false); + + pfree(filename); + + /* + * Likewise for the initial CSV log file, if that's enabled. (Note that + * we open syslogFile even when only CSV output is nominally enabled, + * since some code paths will write to syslogFile anyway.) + */ + if (Log_destination & LOG_DESTINATION_CSVLOG) + { + filename = logfile_getname(first_syslogger_file_time, ".csv"); + + csvlogFile = logfile_open(filename, "a", false); + + pfree(filename); + } + +#ifdef EXEC_BACKEND + switch ((sysloggerPid = syslogger_forkexec())) +#else + switch ((sysloggerPid = fork_process())) +#endif + { + case -1: + ereport(LOG, + (errmsg("could not fork system logger: %m"))); + return 0; + +#ifndef EXEC_BACKEND + case 0: + /* in postmaster child ... */ + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(true); + + /* Drop our connection to postmaster's shared memory, as well */ + dsm_detach_all(); + PGSharedMemoryDetach(); + + /* do the work */ + SysLoggerMain(0, NULL); + break; +#endif + + default: + /* success, in postmaster */ + + /* now we redirect stderr, if not done already */ + if (!redirection_done) + { +#ifdef WIN32 + int fd; +#endif + + /* + * Leave a breadcrumb trail when redirecting, in case the user + * forgets that redirection is active and looks only at the + * original stderr target file. + */ + ereport(LOG, + (errmsg("redirecting log output to logging collector process"), + errhint("Future log output will appear in directory \"%s\".", + Log_directory))); + +#ifndef WIN32 + fflush(stdout); + if (dup2(syslogPipe[1], fileno(stdout)) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not redirect stdout: %m"))); + fflush(stderr); + if (dup2(syslogPipe[1], fileno(stderr)) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not redirect stderr: %m"))); + /* Now we are done with the write end of the pipe. */ + close(syslogPipe[1]); + syslogPipe[1] = -1; +#else + + /* + * open the pipe in binary mode and make sure stderr is binary + * after it's been dup'ed into, to avoid disturbing the pipe + * chunking protocol. + */ + fflush(stderr); + fd = _open_osfhandle((intptr_t) syslogPipe[1], + _O_APPEND | _O_BINARY); + if (dup2(fd, _fileno(stderr)) < 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not redirect stderr: %m"))); + close(fd); + _setmode(_fileno(stderr), _O_BINARY); + + /* + * Now we are done with the write end of the pipe. + * CloseHandle() must not be called because the preceding + * close() closes the underlying handle. 
+ */ + syslogPipe[1] = 0; +#endif + redirection_done = true; + } + + /* postmaster will never write the file(s); close 'em */ + fclose(syslogFile); + syslogFile = NULL; + if (csvlogFile != NULL) + { + fclose(csvlogFile); + csvlogFile = NULL; + } + return (int) sysloggerPid; + } + + /* we should never reach here */ + return 0; +} + + +#ifdef EXEC_BACKEND + +/* + * syslogger_forkexec() - + * + * Format up the arglist for, then fork and exec, a syslogger process + */ +static pid_t +syslogger_forkexec(void) +{ + char *av[10]; + int ac = 0; + char filenobuf[32]; + char csvfilenobuf[32]; + + av[ac++] = "postgres"; + av[ac++] = "--forklog"; + av[ac++] = NULL; /* filled in by postmaster_forkexec */ + + /* static variables (those not passed by write_backend_variables) */ +#ifndef WIN32 + if (syslogFile != NULL) + snprintf(filenobuf, sizeof(filenobuf), "%d", + fileno(syslogFile)); + else + strcpy(filenobuf, "-1"); +#else /* WIN32 */ + if (syslogFile != NULL) + snprintf(filenobuf, sizeof(filenobuf), "%ld", + (long) _get_osfhandle(_fileno(syslogFile))); + else + strcpy(filenobuf, "0"); +#endif /* WIN32 */ + av[ac++] = filenobuf; + +#ifndef WIN32 + if (csvlogFile != NULL) + snprintf(csvfilenobuf, sizeof(csvfilenobuf), "%d", + fileno(csvlogFile)); + else + strcpy(csvfilenobuf, "-1"); +#else /* WIN32 */ + if (csvlogFile != NULL) + snprintf(csvfilenobuf, sizeof(csvfilenobuf), "%ld", + (long) _get_osfhandle(_fileno(csvlogFile))); + else + strcpy(csvfilenobuf, "0"); +#endif /* WIN32 */ + av[ac++] = csvfilenobuf; + + av[ac] = NULL; + Assert(ac < lengthof(av)); + + return postmaster_forkexec(ac, av); +} + +/* + * syslogger_parseArgs() - + * + * Extract data from the arglist for exec'ed syslogger process + */ +static void +syslogger_parseArgs(int argc, char *argv[]) +{ + int fd; + + Assert(argc == 5); + argv += 3; + + /* + * Re-open the error output files that were opened by SysLogger_Start(). + * + * We expect this will always succeed, which is too optimistic, but if it + * fails there's not a lot we can do to report the problem anyway. As + * coded, we'll just crash on a null pointer dereference after failure... + */ +#ifndef WIN32 + fd = atoi(*argv++); + if (fd != -1) + { + syslogFile = fdopen(fd, "a"); + setvbuf(syslogFile, NULL, PG_IOLBF, 0); + } + fd = atoi(*argv++); + if (fd != -1) + { + csvlogFile = fdopen(fd, "a"); + setvbuf(csvlogFile, NULL, PG_IOLBF, 0); + } +#else /* WIN32 */ + fd = atoi(*argv++); + if (fd != 0) + { + fd = _open_osfhandle(fd, _O_APPEND | _O_TEXT); + if (fd > 0) + { + syslogFile = fdopen(fd, "a"); + setvbuf(syslogFile, NULL, PG_IOLBF, 0); + } + } + fd = atoi(*argv++); + if (fd != 0) + { + fd = _open_osfhandle(fd, _O_APPEND | _O_TEXT); + if (fd > 0) + { + csvlogFile = fdopen(fd, "a"); + setvbuf(csvlogFile, NULL, PG_IOLBF, 0); + } + } +#endif /* WIN32 */ +} +#endif /* EXEC_BACKEND */ + + +/* -------------------------------- + * pipe protocol handling + * -------------------------------- + */ + +/* + * Process data received through the syslogger pipe. + * + * This routine interprets the log pipe protocol which sends log messages as + * (hopefully atomic) chunks - such chunks are detected and reassembled here. + * + * The protocol has a header that starts with two nul bytes, then has a 16 bit + * length, the pid of the sending process, and a flag to indicate if it is + * the last chunk in a message. Incomplete chunks are saved until we read some + * more, and non-final chunks are accumulated until we get the final chunk. + * + * All of this is to avoid 2 problems: + * . 
partial messages being written to logfiles (messes rotation), and + * . messages from different backends being interleaved (messages garbled). + * + * Any non-protocol messages are written out directly. These should only come + * from non-PostgreSQL sources, however (e.g. third party libraries writing to + * stderr). + * + * logbuffer is the data input buffer, and *bytes_in_logbuffer is the number + * of bytes present. On exit, any not-yet-eaten data is left-justified in + * logbuffer, and *bytes_in_logbuffer is updated. + */ +static void +process_pipe_input(char *logbuffer, int *bytes_in_logbuffer) +{ + char *cursor = logbuffer; + int count = *bytes_in_logbuffer; + int dest = LOG_DESTINATION_STDERR; + + /* While we have enough for a header, process data... */ + while (count >= (int) (offsetof(PipeProtoHeader, data) + 1)) + { + PipeProtoHeader p; + int chunklen; + + /* Do we have a valid header? */ + memcpy(&p, cursor, offsetof(PipeProtoHeader, data)); + if (p.nuls[0] == '\0' && p.nuls[1] == '\0' && + p.len > 0 && p.len <= PIPE_MAX_PAYLOAD && + p.pid != 0 && + (p.is_last == 't' || p.is_last == 'f' || + p.is_last == 'T' || p.is_last == 'F')) + { + List *buffer_list; + ListCell *cell; + save_buffer *existing_slot = NULL, + *free_slot = NULL; + StringInfo str; + + chunklen = PIPE_HEADER_SIZE + p.len; + + /* Fall out of loop if we don't have the whole chunk yet */ + if (count < chunklen) + break; + + dest = (p.is_last == 'T' || p.is_last == 'F') ? + LOG_DESTINATION_CSVLOG : LOG_DESTINATION_STDERR; + + /* Locate any existing buffer for this source pid */ + buffer_list = buffer_lists[p.pid % NBUFFER_LISTS]; + foreach(cell, buffer_list) + { + save_buffer *buf = (save_buffer *) lfirst(cell); + + if (buf->pid == p.pid) + { + existing_slot = buf; + break; + } + if (buf->pid == 0 && free_slot == NULL) + free_slot = buf; + } + + if (p.is_last == 'f' || p.is_last == 'F') + { + /* + * Save a complete non-final chunk in a per-pid buffer + */ + if (existing_slot != NULL) + { + /* Add chunk to data from preceding chunks */ + str = &(existing_slot->data); + appendBinaryStringInfo(str, + cursor + PIPE_HEADER_SIZE, + p.len); + } + else + { + /* First chunk of message, save in a new buffer */ + if (free_slot == NULL) + { + /* + * Need a free slot, but there isn't one in the list, + * so create a new one and extend the list with it. + */ + free_slot = palloc(sizeof(save_buffer)); + buffer_list = lappend(buffer_list, free_slot); + buffer_lists[p.pid % NBUFFER_LISTS] = buffer_list; + } + free_slot->pid = p.pid; + str = &(free_slot->data); + initStringInfo(str); + appendBinaryStringInfo(str, + cursor + PIPE_HEADER_SIZE, + p.len); + } + } + else + { + /* + * Final chunk --- add it to anything saved for that pid, and + * either way write the whole thing out. + */ + if (existing_slot != NULL) + { + str = &(existing_slot->data); + appendBinaryStringInfo(str, + cursor + PIPE_HEADER_SIZE, + p.len); + write_syslogger_file(str->data, str->len, dest); + /* Mark the buffer unused, and reclaim string storage */ + existing_slot->pid = 0; + pfree(str->data); + } + else + { + /* The whole message was one chunk, evidently. */ + write_syslogger_file(cursor + PIPE_HEADER_SIZE, p.len, + dest); + } + } + + /* Finished processing this chunk */ + cursor += chunklen; + count -= chunklen; + } + else + { + /* Process non-protocol data */ + + /* + * Look for the start of a protocol header. If found, dump data + * up to there and repeat the loop. Otherwise, dump it all and + * fall out of the loop. 
(Note: we want to dump it all if at all + * possible, so as to avoid dividing non-protocol messages across + * logfiles. We expect that in many scenarios, a non-protocol + * message will arrive all in one read(), and we want to respect + * the read() boundary if possible.) + */ + for (chunklen = 1; chunklen < count; chunklen++) + { + if (cursor[chunklen] == '\0') + break; + } + /* fall back on the stderr log as the destination */ + write_syslogger_file(cursor, chunklen, LOG_DESTINATION_STDERR); + cursor += chunklen; + count -= chunklen; + } + } + + /* We don't have a full chunk, so left-align what remains in the buffer */ + if (count > 0 && cursor != logbuffer) + memmove(logbuffer, cursor, count); + *bytes_in_logbuffer = count; +} + +/* + * Force out any buffered data + * + * This is currently used only at syslogger shutdown, but could perhaps be + * useful at other times, so it is careful to leave things in a clean state. + */ +static void +flush_pipe_input(char *logbuffer, int *bytes_in_logbuffer) +{ + int i; + + /* Dump any incomplete protocol messages */ + for (i = 0; i < NBUFFER_LISTS; i++) + { + List *list = buffer_lists[i]; + ListCell *cell; + + foreach(cell, list) + { + save_buffer *buf = (save_buffer *) lfirst(cell); + + if (buf->pid != 0) + { + StringInfo str = &(buf->data); + + write_syslogger_file(str->data, str->len, + LOG_DESTINATION_STDERR); + /* Mark the buffer unused, and reclaim string storage */ + buf->pid = 0; + pfree(str->data); + } + } + } + + /* + * Force out any remaining pipe data as-is; we don't bother trying to + * remove any protocol headers that may exist in it. + */ + if (*bytes_in_logbuffer > 0) + write_syslogger_file(logbuffer, *bytes_in_logbuffer, + LOG_DESTINATION_STDERR); + *bytes_in_logbuffer = 0; +} + + +/* -------------------------------- + * logfile routines + * -------------------------------- + */ + +/* + * Write text to the currently open logfile + * + * This is exported so that elog.c can call it when MyBackendType is B_LOGGER. + * This allows the syslogger process to record elog messages of its own, + * even though its stderr does not point at the syslog pipe. + */ +void +write_syslogger_file(const char *buffer, int count, int destination) +{ + int rc; + FILE *logfile; + + /* + * If we're told to write to csvlogFile, but it's not open, dump the data + * to syslogFile (which is always open) instead. This can happen if CSV + * output is enabled after postmaster start and we've been unable to open + * csvlogFile. There are also race conditions during a parameter change + * whereby backends might send us CSV output before we open csvlogFile or + * after we close it. Writing CSV-formatted output to the regular log + * file isn't great, but it beats dropping log output on the floor. + * + * Think not to improve this by trying to open csvlogFile on-the-fly. Any + * failure in that would lead to recursion. + */ + logfile = (destination == LOG_DESTINATION_CSVLOG && + csvlogFile != NULL) ? csvlogFile : syslogFile; + + rc = fwrite(buffer, 1, count, logfile); + + /* + * Try to report any failure. We mustn't use ereport because it would + * just recurse right back here, but write_stderr is OK: it will write + * either to the postmaster's original stderr, or to /dev/null, but never + * to our input pipe which would result in a different sort of looping. + */ + if (rc != count) + write_stderr("could not write to log file: %s\n", strerror(errno)); +} + +#ifdef WIN32 + +/* + * Worker thread to transfer data from the pipe to the current logfile. 
+ * + * We need this because on Windows, WaitForMultipleObjects does not work on + * unnamed pipes: it always reports "signaled", so the blocking ReadFile won't + * allow for SIGHUP; and select is for sockets only. + */ +static unsigned int __stdcall +pipeThread(void *arg) +{ + char logbuffer[READ_BUF_SIZE]; + int bytes_in_logbuffer = 0; + + for (;;) + { + DWORD bytesRead; + BOOL result; + + result = ReadFile(syslogPipe[0], + logbuffer + bytes_in_logbuffer, + sizeof(logbuffer) - bytes_in_logbuffer, + &bytesRead, 0); + + /* + * Enter critical section before doing anything that might touch + * global state shared by the main thread. Anything that uses + * palloc()/pfree() in particular are not safe outside the critical + * section. + */ + EnterCriticalSection(&sysloggerSection); + if (!result) + { + DWORD error = GetLastError(); + + if (error == ERROR_HANDLE_EOF || + error == ERROR_BROKEN_PIPE) + break; + _dosmaperr(error); + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not read from logger pipe: %m"))); + } + else if (bytesRead > 0) + { + bytes_in_logbuffer += bytesRead; + process_pipe_input(logbuffer, &bytes_in_logbuffer); + } + + /* + * If we've filled the current logfile, nudge the main thread to do a + * log rotation. + */ + if (Log_RotationSize > 0) + { + if (ftell(syslogFile) >= Log_RotationSize * 1024L || + (csvlogFile != NULL && ftell(csvlogFile) >= Log_RotationSize * 1024L)) + SetLatch(MyLatch); + } + LeaveCriticalSection(&sysloggerSection); + } + + /* We exit the above loop only upon detecting pipe EOF */ + pipe_eof_seen = true; + + /* if there's any data left then force it out now */ + flush_pipe_input(logbuffer, &bytes_in_logbuffer); + + /* set the latch to waken the main thread, which will quit */ + SetLatch(MyLatch); + + LeaveCriticalSection(&sysloggerSection); + _endthread(); + return 0; +} +#endif /* WIN32 */ + +/* + * Open a new logfile with proper permissions and buffering options. + * + * If allow_errors is true, we just log any open failure and return NULL + * (with errno still correct for the fopen failure). + * Otherwise, errors are treated as fatal. + */ +static FILE * +logfile_open(const char *filename, const char *mode, bool allow_errors) +{ + FILE *fh; + mode_t oumask; + + /* + * Note we do not let Log_file_mode disable IWUSR, since we certainly want + * to be able to write the files ourselves. + */ + oumask = umask((mode_t) ((~(Log_file_mode | S_IWUSR)) & (S_IRWXU | S_IRWXG | S_IRWXO))); + fh = fopen(filename, mode); + umask(oumask); + + if (fh) + { + setvbuf(fh, NULL, PG_IOLBF, 0); + +#ifdef WIN32 + /* use CRLF line endings on Windows */ + _setmode(_fileno(fh), _O_TEXT); +#endif + } + else + { + int save_errno = errno; + + ereport(allow_errors ? LOG : FATAL, + (errcode_for_file_access(), + errmsg("could not open log file \"%s\": %m", + filename))); + errno = save_errno; + } + + return fh; +} + +/* + * perform logfile rotation + */ +static void +logfile_rotate(bool time_based_rotation, int size_rotation_for) +{ + char *filename; + char *csvfilename = NULL; + pg_time_t fntime; + FILE *fh; + + rotation_requested = false; + + /* + * When doing a time-based rotation, invent the new logfile name based on + * the planned rotation time, not current time, to avoid "slippage" in the + * file name when we don't do the rotation immediately. 
+ */ + if (time_based_rotation) + fntime = next_rotation_time; + else + fntime = time(NULL); + filename = logfile_getname(fntime, NULL); + if (Log_destination & LOG_DESTINATION_CSVLOG) + csvfilename = logfile_getname(fntime, ".csv"); + + /* + * Decide whether to overwrite or append. We can overwrite if (a) + * Log_truncate_on_rotation is set, (b) the rotation was triggered by + * elapsed time and not something else, and (c) the computed file name is + * different from what we were previously logging into. + * + * Note: last_file_name should never be NULL here, but if it is, append. + */ + if (time_based_rotation || (size_rotation_for & LOG_DESTINATION_STDERR)) + { + if (Log_truncate_on_rotation && time_based_rotation && + last_file_name != NULL && + strcmp(filename, last_file_name) != 0) + fh = logfile_open(filename, "w", true); + else + fh = logfile_open(filename, "a", true); + + if (!fh) + { + /* + * ENFILE/EMFILE are not too surprising on a busy system; just + * keep using the old file till we manage to get a new one. + * Otherwise, assume something's wrong with Log_directory and stop + * trying to create files. + */ + if (errno != ENFILE && errno != EMFILE) + { + ereport(LOG, + (errmsg("disabling automatic rotation (use SIGHUP to re-enable)"))); + rotation_disabled = true; + } + + if (filename) + pfree(filename); + if (csvfilename) + pfree(csvfilename); + return; + } + + fclose(syslogFile); + syslogFile = fh; + + /* instead of pfree'ing filename, remember it for next time */ + if (last_file_name != NULL) + pfree(last_file_name); + last_file_name = filename; + filename = NULL; + } + + /* + * Same as above, but for csv file. Note that if LOG_DESTINATION_CSVLOG + * was just turned on, we might have to open csvlogFile here though it was + * not open before. In such a case we'll append not overwrite (since + * last_csv_file_name will be NULL); that is consistent with the normal + * rules since it's not a time-based rotation. + */ + if ((Log_destination & LOG_DESTINATION_CSVLOG) && + (csvlogFile == NULL || + time_based_rotation || (size_rotation_for & LOG_DESTINATION_CSVLOG))) + { + if (Log_truncate_on_rotation && time_based_rotation && + last_csv_file_name != NULL && + strcmp(csvfilename, last_csv_file_name) != 0) + fh = logfile_open(csvfilename, "w", true); + else + fh = logfile_open(csvfilename, "a", true); + + if (!fh) + { + /* + * ENFILE/EMFILE are not too surprising on a busy system; just + * keep using the old file till we manage to get a new one. + * Otherwise, assume something's wrong with Log_directory and stop + * trying to create files. 
+ */
+ if (errno != ENFILE && errno != EMFILE)
+ {
+ ereport(LOG,
+ (errmsg("disabling automatic rotation (use SIGHUP to re-enable)")));
+ rotation_disabled = true;
+ }
+
+ if (filename)
+ pfree(filename);
+ if (csvfilename)
+ pfree(csvfilename);
+ return;
+ }
+
+ if (csvlogFile != NULL)
+ fclose(csvlogFile);
+ csvlogFile = fh;
+
+ /* instead of pfree'ing filename, remember it for next time */
+ if (last_csv_file_name != NULL)
+ pfree(last_csv_file_name);
+ last_csv_file_name = csvfilename;
+ csvfilename = NULL;
+ }
+ else if (!(Log_destination & LOG_DESTINATION_CSVLOG) &&
+ csvlogFile != NULL)
+ {
+ /* CSVLOG was just turned off, so close the old file */
+ fclose(csvlogFile);
+ csvlogFile = NULL;
+ if (last_csv_file_name != NULL)
+ pfree(last_csv_file_name);
+ last_csv_file_name = NULL;
+ }
+
+ if (filename)
+ pfree(filename);
+ if (csvfilename)
+ pfree(csvfilename);
+
+ update_metainfo_datafile();
+
+ set_next_rotation_time();
+}
+
+
+/*
+ * construct logfile name using timestamp information
+ *
+ * If suffix isn't NULL, append it to the name, replacing any ".log"
+ * that may be in the pattern.
+ *
+ * Result is palloc'd.
+ */
+static char *
+logfile_getname(pg_time_t timestamp, const char *suffix)
+{
+ char *filename;
+ int len;
+
+ filename = palloc(MAXPGPATH);
+
+ snprintf(filename, MAXPGPATH, "%s/", Log_directory);
+
+ len = strlen(filename);
+
+ /* treat Log_filename as a strftime pattern */
+ pg_strftime(filename + len, MAXPGPATH - len, Log_filename,
+ pg_localtime(&timestamp, log_timezone));
+
+ if (suffix != NULL)
+ {
+ len = strlen(filename);
+ if (len > 4 && (strcmp(filename + (len - 4), ".log") == 0))
+ len -= 4;
+ strlcpy(filename + len, suffix, MAXPGPATH - len);
+ }
+
+ return filename;
+}
+
+/*
+ * Determine the next planned rotation time, and store in next_rotation_time.
+ */
+static void
+set_next_rotation_time(void)
+{
+ pg_time_t now;
+ struct pg_tm *tm;
+ int rotinterval;
+
+ /* nothing to do if time-based rotation is disabled */
+ if (Log_RotationAge <= 0)
+ return;
+
+ /*
+ * The requirements here are to choose the next time > now that is a
+ * "multiple" of the log rotation interval. "Multiple" can be interpreted
+ * fairly loosely. In this version we align to log_timezone rather than
+ * GMT.
+ */
+ rotinterval = Log_RotationAge * SECS_PER_MINUTE; /* convert to seconds */
+ now = (pg_time_t) time(NULL);
+ tm = pg_localtime(&now, log_timezone);
+ now += tm->tm_gmtoff;
+ now -= now % rotinterval;
+ now += rotinterval;
+ now -= tm->tm_gmtoff;
+ next_rotation_time = now;
+}
+
+/*
+ * Store the name of the file(s) where the log collector, when enabled, writes
+ * log messages. Useful for finding the name(s) of the current log file(s)
+ * when there is time-based logfile rotation. Filenames are stored in a
+ * temporary file which is renamed into the final destination for
+ * atomicity. The file is opened with the same permissions as what gets
+ * created in the data directory and has proper buffering options.
+ */ +static void +update_metainfo_datafile(void) +{ + FILE *fh; + mode_t oumask; + + if (!(Log_destination & LOG_DESTINATION_STDERR) && + !(Log_destination & LOG_DESTINATION_CSVLOG)) + { + if (unlink(LOG_METAINFO_DATAFILE) < 0 && errno != ENOENT) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + LOG_METAINFO_DATAFILE))); + return; + } + + /* use the same permissions as the data directory for the new file */ + oumask = umask(pg_mode_mask); + fh = fopen(LOG_METAINFO_DATAFILE_TMP, "w"); + umask(oumask); + + if (fh) + { + setvbuf(fh, NULL, PG_IOLBF, 0); + +#ifdef WIN32 + /* use CRLF line endings on Windows */ + _setmode(_fileno(fh), _O_TEXT); +#endif + } + else + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + LOG_METAINFO_DATAFILE_TMP))); + return; + } + + if (last_file_name && (Log_destination & LOG_DESTINATION_STDERR)) + { + if (fprintf(fh, "stderr %s\n", last_file_name) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + LOG_METAINFO_DATAFILE_TMP))); + fclose(fh); + return; + } + } + + if (last_csv_file_name && (Log_destination & LOG_DESTINATION_CSVLOG)) + { + if (fprintf(fh, "csvlog %s\n", last_csv_file_name) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write file \"%s\": %m", + LOG_METAINFO_DATAFILE_TMP))); + fclose(fh); + return; + } + } + fclose(fh); + + if (rename(LOG_METAINFO_DATAFILE_TMP, LOG_METAINFO_DATAFILE) != 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + LOG_METAINFO_DATAFILE_TMP, LOG_METAINFO_DATAFILE))); +} + +/* -------------------------------- + * signal handler routines + * -------------------------------- + */ + +/* + * Check to see if a log rotation request has arrived. Should be + * called by postmaster after receiving SIGUSR1. + */ +bool +CheckLogrotateSignal(void) +{ + struct stat stat_buf; + + if (stat(LOGROTATE_SIGNAL_FILE, &stat_buf) == 0) + return true; + + return false; +} + +/* + * Remove the file signaling a log rotation request. + */ +void +RemoveLogrotateSignalFiles(void) +{ + unlink(LOGROTATE_SIGNAL_FILE); +} + +/* SIGUSR1: set flag to rotate logfile */ +static void +sigUsr1Handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + rotation_requested = true; + SetLatch(MyLatch); + + errno = save_errno; +} diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c new file mode 100644 index 0000000..626fae8 --- /dev/null +++ b/src/backend/postmaster/walwriter.c @@ -0,0 +1,309 @@ +/*------------------------------------------------------------------------- + * + * walwriter.c + * + * The WAL writer background process is new as of Postgres 8.3. It attempts + * to keep regular backends from having to write out (and fsync) WAL pages. + * Also, it guarantees that transaction commit records that weren't synced + * to disk immediately upon commit (ie, were "asynchronously committed") + * will reach disk within a knowable time --- which, as it happens, is at + * most three times the wal_writer_delay cycle time. + * + * Note that as with the bgwriter for shared buffers, regular backends are + * still empowered to issue WAL writes and fsyncs when the walwriter doesn't + * keep up. This means that the WALWriter is not an essential process and + * can shutdown quickly when requested. 
+ * + * Because the walwriter's cycle is directly linked to the maximum delay + * before async-commit transactions are guaranteed committed, it's probably + * unwise to load additional functionality onto it. For instance, if you've + * got a yen to create xlog segments further in advance, that'd be better done + * in bgwriter than in walwriter. + * + * The walwriter is started by the postmaster as soon as the startup subprocess + * finishes. It remains alive until the postmaster commands it to terminate. + * Normal termination is by SIGTERM, which instructs the walwriter to exit(0). + * Emergency termination is by SIGQUIT; like any backend, the walwriter will + * simply abort and exit on SIGQUIT. + * + * If the walwriter exits unexpectedly, the postmaster treats that the same + * as a backend crash: shared memory may be corrupted, so remaining backends + * should be killed by SIGQUIT and then a recovery cycle started. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/postmaster/walwriter.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <unistd.h> + +#include "access/xlog.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/interrupt.h" +#include "postmaster/walwriter.h" +#include "storage/bufmgr.h" +#include "storage/condition_variable.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/smgr.h" +#include "utils/guc.h" +#include "utils/hsearch.h" +#include "utils/memutils.h" +#include "utils/resowner.h" + + +/* + * GUC parameters + */ +int WalWriterDelay = 200; +int WalWriterFlushAfter = 128; + +/* + * Number of do-nothing loops before lengthening the delay time, and the + * multiplier to apply to WalWriterDelay when we do decide to hibernate. + * (Perhaps these need to be configurable?) + */ +#define LOOPS_UNTIL_HIBERNATE 50 +#define HIBERNATE_FACTOR 25 + +/* Prototypes for private functions */ +static void HandleWalWriterInterrupts(void); + +/* + * Main entry point for walwriter process + * + * This is invoked from AuxiliaryProcessMain, which has already created the + * basic execution environment, but not enabled signals yet. + */ +void +WalWriterMain(void) +{ + sigjmp_buf local_sigjmp_buf; + MemoryContext walwriter_context; + int left_till_hibernate; + bool hibernating; + + /* + * Properly accept or ignore signals the postmaster might send us + * + * We have no particular use for SIGINT at the moment, but seems + * reasonable to treat like SIGTERM. + */ + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGINT, SignalHandlerForShutdownRequest); + pqsignal(SIGTERM, SignalHandlerForShutdownRequest); + /* SIGQUIT handler was already set up by InitPostmasterChild */ + pqsignal(SIGALRM, SIG_IGN); + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, SIG_IGN); /* not used */ + + /* + * Reset some signals that are accepted by postmaster but not here + */ + pqsignal(SIGCHLD, SIG_DFL); + + /* + * Create a memory context that we will do all our work in. We do this so + * that we can reset the context during error recovery and thereby avoid + * possible memory leaks. Formerly this code just ran in + * TopMemoryContext, but resetting that would be a really bad idea. 
+ */ + walwriter_context = AllocSetContextCreate(TopMemoryContext, + "Wal Writer", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(walwriter_context); + + /* + * If an exception is encountered, processing resumes here. + * + * You might wonder why this isn't coded as an infinite loop around a + * PG_TRY construct. The reason is that this is the bottom of the + * exception stack, and so with PG_TRY there would be no exception handler + * in force at all during the CATCH part. By leaving the outermost setjmp + * always active, we have at least some chance of recovering from an error + * during error recovery. (If we get into an infinite loop thereby, it + * will soon be stopped by overflow of elog.c's internal state stack.) + * + * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask + * (to wit, BlockSig) will be restored when longjmp'ing to here. Thus, + * signals other than SIGQUIT will be blocked until we complete error + * recovery. It might seem that this policy makes the HOLD_INTERRUPTS() + * call redundant, but it is not since InterruptPending might be set + * already. + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* Since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevent interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* Report the error to the server log */ + EmitErrorReport(); + + /* + * These operations are really just a minimal subset of + * AbortTransaction(). We don't have very many resources to worry + * about in walwriter, but we do have LWLocks, and perhaps buffers? + */ + LWLockReleaseAll(); + ConditionVariableCancelSleep(); + pgstat_report_wait_end(); + AbortBufferIO(); + UnlockBuffers(); + ReleaseAuxProcessResources(false); + AtEOXact_Buffers(false); + AtEOXact_SMgr(); + AtEOXact_Files(false); + AtEOXact_HashTables(false); + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. + */ + MemoryContextSwitchTo(walwriter_context); + FlushErrorState(); + + /* Flush any leaked data in the top-level context */ + MemoryContextResetAndDeleteChildren(walwriter_context); + + /* Now we can allow interrupts again */ + RESUME_INTERRUPTS(); + + /* + * Sleep at least 1 second after any error. A write error is likely + * to be repeated, and we don't want to be filling the error logs as + * fast as we can. + */ + pg_usleep(1000000L); + + /* + * Close all open files after any error. This is helpful on Windows, + * where holding deleted files open causes various strange errors. + * It's not clear we need it elsewhere, but shouldn't hurt. + */ + smgrcloseall(); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + /* + * Unblock signals (they were blocked when the postmaster forked us) + */ + PG_SETMASK(&UnBlockSig); + + /* + * Reset hibernation state after any error. + */ + left_till_hibernate = LOOPS_UNTIL_HIBERNATE; + hibernating = false; + SetWalWriterSleeping(false); + + /* + * Advertise our latch that backends can use to wake us up while we're + * sleeping. + */ + ProcGlobal->walwriterLatch = &MyProc->procLatch; + + /* + * Loop forever + */ + for (;;) + { + long cur_timeout; + + /* + * Advertise whether we might hibernate in this cycle. We do this + * before resetting the latch to ensure that any async commits will + * see the flag set if they might possibly need to wake us up, and + * that we won't miss any signal they send us. 
(If we discover work
+ * to do in the last cycle before we would hibernate, the global flag
+ * will be set unnecessarily, but little harm is done.) But avoid
+ * touching the global flag if it doesn't need to change.
+ */
+ if (hibernating != (left_till_hibernate <= 1))
+ {
+ hibernating = (left_till_hibernate <= 1);
+ SetWalWriterSleeping(hibernating);
+ }
+
+ /* Clear any already-pending wakeups */
+ ResetLatch(MyLatch);
+
+ /* Process any signals received recently */
+ HandleWalWriterInterrupts();
+
+ /*
+ * Do what we're here for; then, if XLogBackgroundFlush() found useful
+ * work to do, reset hibernation counter.
+ */
+ if (XLogBackgroundFlush())
+ left_till_hibernate = LOOPS_UNTIL_HIBERNATE;
+ else if (left_till_hibernate > 0)
+ left_till_hibernate--;
+
+ /* Send WAL statistics to the stats collector */
+ pgstat_send_wal(false);
+
+ /*
+ * Sleep until we are signaled or WalWriterDelay has elapsed. If we
+ * haven't done anything useful for quite some time, lengthen the
+ * sleep time so as to reduce the server's idle power consumption.
+ */
+ if (left_till_hibernate > 0)
+ cur_timeout = WalWriterDelay; /* in ms */
+ else
+ cur_timeout = WalWriterDelay * HIBERNATE_FACTOR;
+
+ (void) WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+ cur_timeout,
+ WAIT_EVENT_WAL_WRITER_MAIN);
+ }
+}
+
+/*
+ * Interrupt handler for main loops of WAL writer process.
+ */
+static void
+HandleWalWriterInterrupts(void)
+{
+ if (ProcSignalBarrierPending)
+ ProcessProcSignalBarrier();
+
+ if (ConfigReloadPending)
+ {
+ ConfigReloadPending = false;
+ ProcessConfigFile(PGC_SIGHUP);
+ }
+
+ if (ShutdownRequestPending)
+ {
+ /*
+ * Force the remaining WAL statistics to be sent to the stats collector
+ * at process exit.
+ *
+ * Since pgstat_send_wal() is invoked with 'force' set to false in the
+ * main loop, to avoid overloading the stats collector, there may be
+ * unsent stats counters for the WAL writer.
+ */
+ pgstat_send_wal(true);
+
+ proc_exit(0);
+ }
+}
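For reference, the chunked pipe protocol that process_pipe_input() reassembles in syslogger.c above is produced on the writing side by the backends' elog machinery (write_pipe_chunks() in elog.c), which is not part of this diff. The sketch below illustrates that framing under stated assumptions: the struct is a simplified stand-in for PipeProtoHeader (the real layout and the PIPE_CHUNK_SIZE / PIPE_MAX_PAYLOAD constants live in src/include/postmaster/syslogger.h), and the names ExamplePipeHeader, EXAMPLE_CHUNK_SIZE, and write_example_chunks are invented for illustration; this is not the actual backend code.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>

/* Simplified stand-in for PipeProtoHeader; the real definition is in syslogger.h */
typedef struct
{
    char        nuls[2];        /* always \0\0, marks a protocol chunk */
    uint16_t    len;            /* number of payload bytes following the header */
    int32_t     pid;            /* pid of the writing process */
    char        is_last;        /* 'f'/'t' for stderr, 'F'/'T' for csvlog */
    char        data[];         /* payload starts here */
} ExamplePipeHeader;

#define EXAMPLE_CHUNK_SIZE 512  /* stand-in for PIPE_CHUNK_SIZE */

/*
 * Split a message into protocol chunks and write them to the pipe fd.
 * Every chunk except the last is marked non-final, so the reader can
 * buffer chunks per pid and emit the message only once it is complete.
 */
static void
write_example_chunks(int fd, const char *data, size_t len, pid_t pid, bool is_csv)
{
    char        chunk[EXAMPLE_CHUNK_SIZE];
    ExamplePipeHeader hdr;
    const size_t hdr_size = offsetof(ExamplePipeHeader, data);
    const size_t max_payload = EXAMPLE_CHUNK_SIZE - hdr_size;

    hdr.nuls[0] = hdr.nuls[1] = '\0';
    hdr.pid = (int32_t) pid;

    while (len > 0)
    {
        size_t      n = (len > max_payload) ? max_payload : len;
        bool        last = (n == len);

        hdr.len = (uint16_t) n;
        hdr.is_last = is_csv ? (last ? 'T' : 'F') : (last ? 't' : 'f');

        memcpy(chunk, &hdr, hdr_size);
        memcpy(chunk + hdr_size, data, n);

        /*
         * Each chunk goes out with a single write(); keeping chunks no
         * larger than the POSIX minimum PIPE_BUF (512 bytes) is what lets
         * many writers share the pipe without interleaving inside a chunk.
         */
        if (write(fd, chunk, hdr_size + n) < 0)
            break;              /* error handling omitted in this sketch */

        data += n;
        len -= n;
    }
}

A message longer than one chunk thus reaches the collector as a run of 'f' (or 'F') chunks ending in a 't' (or 'T') chunk, which the per-pid save_buffer lists in process_pipe_input() reassemble before the whole message is written to the logfile.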