author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:17:33 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:17:33 +0000
commit     5e45211a64149b3c659b90ff2de6fa982a5a93ed (patch)
tree       739caf8c461053357daa9f162bef34516c7bf452 /src/backend/utils/activity
parent     Initial commit. (diff)
download   postgresql-15-5e45211a64149b3c659b90ff2de6fa982a5a93ed.tar.xz
           postgresql-15-5e45211a64149b3c659b90ff2de6fa982a5a93ed.zip
Adding upstream version 15.5. (tag: upstream/15.5)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/utils/activity')
-rw-r--r--  src/backend/utils/activity/Makefile                 34
-rw-r--r--  src/backend/utils/activity/backend_progress.c      112
-rw-r--r--  src/backend/utils/activity/backend_status.c       1151
-rw-r--r--  src/backend/utils/activity/pgstat.c               1678
-rw-r--r--  src/backend/utils/activity/pgstat_archiver.c       111
-rw-r--r--  src/backend/utils/activity/pgstat_bgwriter.c       110
-rw-r--r--  src/backend/utils/activity/pgstat_checkpointer.c   121
-rw-r--r--  src/backend/utils/activity/pgstat_database.c       437
-rw-r--r--  src/backend/utils/activity/pgstat_function.c       243
-rw-r--r--  src/backend/utils/activity/pgstat_relation.c       938
-rw-r--r--  src/backend/utils/activity/pgstat_replslot.c       224
-rw-r--r--  src/backend/utils/activity/pgstat_shmem.c         1003
-rw-r--r--  src/backend/utils/activity/pgstat_slru.c           248
-rw-r--r--  src/backend/utils/activity/pgstat_subscription.c   110
-rw-r--r--  src/backend/utils/activity/pgstat_wal.c            182
-rw-r--r--  src/backend/utils/activity/pgstat_xact.c           391
-rw-r--r--  src/backend/utils/activity/wait_event.c            749
17 files changed, 7842 insertions(+), 0 deletions(-)
diff --git a/src/backend/utils/activity/Makefile b/src/backend/utils/activity/Makefile new file mode 100644 index 0000000..a2e8507 --- /dev/null +++ b/src/backend/utils/activity/Makefile @@ -0,0 +1,34 @@ +#------------------------------------------------------------------------- +# +# Makefile for backend/utils/activity +# +# Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/backend/utils/activity/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/utils/activity +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + backend_progress.o \ + backend_status.o \ + pgstat.o \ + pgstat_archiver.o \ + pgstat_bgwriter.o \ + pgstat_checkpointer.o \ + pgstat_database.o \ + pgstat_function.o \ + pgstat_relation.o \ + pgstat_replslot.o \ + pgstat_shmem.o \ + pgstat_slru.o \ + pgstat_subscription.o \ + pgstat_wal.o \ + pgstat_xact.o \ + wait_event.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/utils/activity/backend_progress.c b/src/backend/utils/activity/backend_progress.c new file mode 100644 index 0000000..f291997 --- /dev/null +++ b/src/backend/utils/activity/backend_progress.c @@ -0,0 +1,112 @@ +/* ---------- + * backend_progress.c + * + * Command progress reporting infrastructure. + * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * src/backend/utils/activity/backend_progress.c + * ---------- + */ +#include "postgres.h" + +#include "port/atomics.h" /* for memory barriers */ +#include "utils/backend_progress.h" +#include "utils/backend_status.h" + + +/*----------- + * pgstat_progress_start_command() - + * + * Set st_progress_command (and st_progress_command_target) in own backend + * entry. Also, zero-initialize st_progress_param array. + *----------- + */ +void +pgstat_progress_start_command(ProgressCommandType cmdtype, Oid relid) +{ + volatile PgBackendStatus *beentry = MyBEEntry; + + if (!beentry || !pgstat_track_activities) + return; + + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); + beentry->st_progress_command = cmdtype; + beentry->st_progress_command_target = relid; + MemSet(&beentry->st_progress_param, 0, sizeof(beentry->st_progress_param)); + PGSTAT_END_WRITE_ACTIVITY(beentry); +} + +/*----------- + * pgstat_progress_update_param() - + * + * Update index'th member in st_progress_param[] of own backend entry. + *----------- + */ +void +pgstat_progress_update_param(int index, int64 val) +{ + volatile PgBackendStatus *beentry = MyBEEntry; + + Assert(index >= 0 && index < PGSTAT_NUM_PROGRESS_PARAM); + + if (!beentry || !pgstat_track_activities) + return; + + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); + beentry->st_progress_param[index] = val; + PGSTAT_END_WRITE_ACTIVITY(beentry); +} + +/*----------- + * pgstat_progress_update_multi_param() - + * + * Update multiple members in st_progress_param[] of own backend entry. + * This is atomic; readers won't see intermediate states. 
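+ *
+ * A usage sketch (the indexes and variables here are hypothetical,
+ * for illustration only; real callers pass PROGRESS_* constants from
+ * commands/progress.h):
+ *
+ *     const int   idx[] = {0, 1};
+ *     const int64 val[] = {blocks_done, tuples_done};
+ *
+ *     pgstat_progress_update_multi_param(2, idx, val);
+ *
+ * Both slots become visible to readers together, because the writes
+ * happen inside a single PGSTAT_BEGIN/END_WRITE_ACTIVITY section.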
+ *----------- + */ +void +pgstat_progress_update_multi_param(int nparam, const int *index, + const int64 *val) +{ + volatile PgBackendStatus *beentry = MyBEEntry; + int i; + + if (!beentry || !pgstat_track_activities || nparam == 0) + return; + + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); + + for (i = 0; i < nparam; ++i) + { + Assert(index[i] >= 0 && index[i] < PGSTAT_NUM_PROGRESS_PARAM); + + beentry->st_progress_param[index[i]] = val[i]; + } + + PGSTAT_END_WRITE_ACTIVITY(beentry); +} + +/*----------- + * pgstat_progress_end_command() - + * + * Reset st_progress_command (and st_progress_command_target) in own backend + * entry. This signals the end of the command. + *----------- + */ +void +pgstat_progress_end_command(void) +{ + volatile PgBackendStatus *beentry = MyBEEntry; + + if (!beentry || !pgstat_track_activities) + return; + + if (beentry->st_progress_command == PROGRESS_COMMAND_INVALID) + return; + + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); + beentry->st_progress_command = PROGRESS_COMMAND_INVALID; + beentry->st_progress_command_target = InvalidOid; + PGSTAT_END_WRITE_ACTIVITY(beentry); +} diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c new file mode 100644 index 0000000..3ecb15d --- /dev/null +++ b/src/backend/utils/activity/backend_status.c @@ -0,0 +1,1151 @@ +/* ---------- + * backend_status.c + * Backend status reporting infrastructure. + * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/utils/activity/backend_status.c + * ---------- + */ +#include "postgres.h" + +#include "access/xact.h" +#include "libpq/libpq.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "port/atomics.h" /* for memory barriers */ +#include "storage/ipc.h" +#include "storage/proc.h" /* for MyProc */ +#include "storage/sinvaladt.h" +#include "utils/ascii.h" +#include "utils/backend_status.h" +#include "utils/guc.h" /* for application_name */ +#include "utils/memutils.h" + + +/* ---------- + * Total number of backends including auxiliary + * + * We reserve a slot for each possible BackendId, plus one for each + * possible auxiliary process type. (This scheme assumes there is not + * more than one of any auxiliary process type at a time.) MaxBackends + * includes autovacuum workers and background workers as well. 
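+ *
+ * The resulting slot layout, mirrored by pgstat_beinit() below, is:
+ *
+ *     regular backend:    BackendStatusArray[MyBackendId - 1]
+ *     auxiliary process:  BackendStatusArray[MaxBackends + MyAuxProcType]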
+ * ---------- + */ +#define NumBackendStatSlots (MaxBackends + NUM_AUXPROCTYPES) + + +/* ---------- + * GUC parameters + * ---------- + */ +bool pgstat_track_activities = false; +int pgstat_track_activity_query_size = 1024; + + +/* exposed so that backend_progress.c can access it */ +PgBackendStatus *MyBEEntry = NULL; + + +static PgBackendStatus *BackendStatusArray = NULL; +static char *BackendAppnameBuffer = NULL; +static char *BackendClientHostnameBuffer = NULL; +static char *BackendActivityBuffer = NULL; +static Size BackendActivityBufferSize = 0; +#ifdef USE_SSL +static PgBackendSSLStatus *BackendSslStatusBuffer = NULL; +#endif +#ifdef ENABLE_GSS +static PgBackendGSSStatus *BackendGssStatusBuffer = NULL; +#endif + + +/* Status for backends including auxiliary */ +static LocalPgBackendStatus *localBackendStatusTable = NULL; + +/* Total number of backends including auxiliary */ +static int localNumBackends = 0; + +static MemoryContext backendStatusSnapContext; + + +static void pgstat_beshutdown_hook(int code, Datum arg); +static void pgstat_read_current_status(void); +static void pgstat_setup_backend_status_context(void); + + +/* + * Report shared-memory space needed by CreateSharedBackendStatus. + */ +Size +BackendStatusShmemSize(void) +{ + Size size; + + /* BackendStatusArray: */ + size = mul_size(sizeof(PgBackendStatus), NumBackendStatSlots); + /* BackendAppnameBuffer: */ + size = add_size(size, + mul_size(NAMEDATALEN, NumBackendStatSlots)); + /* BackendClientHostnameBuffer: */ + size = add_size(size, + mul_size(NAMEDATALEN, NumBackendStatSlots)); + /* BackendActivityBuffer: */ + size = add_size(size, + mul_size(pgstat_track_activity_query_size, NumBackendStatSlots)); +#ifdef USE_SSL + /* BackendSslStatusBuffer: */ + size = add_size(size, + mul_size(sizeof(PgBackendSSLStatus), NumBackendStatSlots)); +#endif +#ifdef ENABLE_GSS + /* BackendGssStatusBuffer: */ + size = add_size(size, + mul_size(sizeof(PgBackendGSSStatus), NumBackendStatSlots)); +#endif + return size; +} + +/* + * Initialize the shared status array and several string buffers + * during postmaster startup. + */ +void +CreateSharedBackendStatus(void) +{ + Size size; + bool found; + int i; + char *buffer; + + /* Create or attach to the shared array */ + size = mul_size(sizeof(PgBackendStatus), NumBackendStatSlots); + BackendStatusArray = (PgBackendStatus *) + ShmemInitStruct("Backend Status Array", size, &found); + + if (!found) + { + /* + * We're the first - initialize. + */ + MemSet(BackendStatusArray, 0, size); + } + + /* Create or attach to the shared appname buffer */ + size = mul_size(NAMEDATALEN, NumBackendStatSlots); + BackendAppnameBuffer = (char *) + ShmemInitStruct("Backend Application Name Buffer", size, &found); + + if (!found) + { + MemSet(BackendAppnameBuffer, 0, size); + + /* Initialize st_appname pointers. */ + buffer = BackendAppnameBuffer; + for (i = 0; i < NumBackendStatSlots; i++) + { + BackendStatusArray[i].st_appname = buffer; + buffer += NAMEDATALEN; + } + } + + /* Create or attach to the shared client hostname buffer */ + size = mul_size(NAMEDATALEN, NumBackendStatSlots); + BackendClientHostnameBuffer = (char *) + ShmemInitStruct("Backend Client Host Name Buffer", size, &found); + + if (!found) + { + MemSet(BackendClientHostnameBuffer, 0, size); + + /* Initialize st_clienthostname pointers. 
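+ * Each slot's pointer is aimed at a fixed NAMEDATALEN-sized chunk of
+ * the shared buffer (BackendClientHostnameBuffer + i * NAMEDATALEN);
+ * this is done once at creation time and never changed afterwards.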
*/ + buffer = BackendClientHostnameBuffer; + for (i = 0; i < NumBackendStatSlots; i++) + { + BackendStatusArray[i].st_clienthostname = buffer; + buffer += NAMEDATALEN; + } + } + + /* Create or attach to the shared activity buffer */ + BackendActivityBufferSize = mul_size(pgstat_track_activity_query_size, + NumBackendStatSlots); + BackendActivityBuffer = (char *) + ShmemInitStruct("Backend Activity Buffer", + BackendActivityBufferSize, + &found); + + if (!found) + { + MemSet(BackendActivityBuffer, 0, BackendActivityBufferSize); + + /* Initialize st_activity pointers. */ + buffer = BackendActivityBuffer; + for (i = 0; i < NumBackendStatSlots; i++) + { + BackendStatusArray[i].st_activity_raw = buffer; + buffer += pgstat_track_activity_query_size; + } + } + +#ifdef USE_SSL + /* Create or attach to the shared SSL status buffer */ + size = mul_size(sizeof(PgBackendSSLStatus), NumBackendStatSlots); + BackendSslStatusBuffer = (PgBackendSSLStatus *) + ShmemInitStruct("Backend SSL Status Buffer", size, &found); + + if (!found) + { + PgBackendSSLStatus *ptr; + + MemSet(BackendSslStatusBuffer, 0, size); + + /* Initialize st_sslstatus pointers. */ + ptr = BackendSslStatusBuffer; + for (i = 0; i < NumBackendStatSlots; i++) + { + BackendStatusArray[i].st_sslstatus = ptr; + ptr++; + } + } +#endif + +#ifdef ENABLE_GSS + /* Create or attach to the shared GSSAPI status buffer */ + size = mul_size(sizeof(PgBackendGSSStatus), NumBackendStatSlots); + BackendGssStatusBuffer = (PgBackendGSSStatus *) + ShmemInitStruct("Backend GSS Status Buffer", size, &found); + + if (!found) + { + PgBackendGSSStatus *ptr; + + MemSet(BackendGssStatusBuffer, 0, size); + + /* Initialize st_gssstatus pointers. */ + ptr = BackendGssStatusBuffer; + for (i = 0; i < NumBackendStatSlots; i++) + { + BackendStatusArray[i].st_gssstatus = ptr; + ptr++; + } + } +#endif +} + +/* + * Initialize pgstats backend activity state, and set up our on-proc-exit + * hook. Called from InitPostgres and AuxiliaryProcessMain. For auxiliary + * process, MyBackendId is invalid. Otherwise, MyBackendId must be set, but we + * must not have started any transaction yet (since the exit hook must run + * after the last transaction exit). + * + * NOTE: MyDatabaseId isn't set yet; so the shutdown hook has to be careful. + */ +void +pgstat_beinit(void) +{ + /* Initialize MyBEEntry */ + if (MyBackendId != InvalidBackendId) + { + Assert(MyBackendId >= 1 && MyBackendId <= MaxBackends); + MyBEEntry = &BackendStatusArray[MyBackendId - 1]; + } + else + { + /* Must be an auxiliary process */ + Assert(MyAuxProcType != NotAnAuxProcess); + + /* + * Assign the MyBEEntry for an auxiliary process. Since it doesn't + * have a BackendId, the slot is statically allocated based on the + * auxiliary process type (MyAuxProcType). Backends use slots indexed + * in the range from 1 to MaxBackends (inclusive), so we use + * MaxBackends + AuxBackendType + 1 as the index of the slot for an + * auxiliary process. + */ + MyBEEntry = &BackendStatusArray[MaxBackends + MyAuxProcType]; + } + + /* Set up a process-exit hook to clean up */ + on_shmem_exit(pgstat_beshutdown_hook, 0); +} + + +/* ---------- + * pgstat_bestart() - + * + * Initialize this backend's entry in the PgBackendStatus array. + * Called from InitPostgres. + * + * Apart from auxiliary processes, MyBackendId, MyDatabaseId, + * session userid, and application_name must be set for a + * backend (hence, this cannot be combined with pgbestat_beinit). 
+ * Note also that we must be inside a transaction if this isn't an aux + * process, as we may need to do encoding conversion on some strings. + * ---------- + */ +void +pgstat_bestart(void) +{ + volatile PgBackendStatus *vbeentry = MyBEEntry; + PgBackendStatus lbeentry; +#ifdef USE_SSL + PgBackendSSLStatus lsslstatus; +#endif +#ifdef ENABLE_GSS + PgBackendGSSStatus lgssstatus; +#endif + + /* pgstats state must be initialized from pgstat_beinit() */ + Assert(vbeentry != NULL); + + /* + * To minimize the time spent modifying the PgBackendStatus entry, and + * avoid risk of errors inside the critical section, we first copy the + * shared-memory struct to a local variable, then modify the data in the + * local variable, then copy the local variable back to shared memory. + * Only the last step has to be inside the critical section. + * + * Most of the data we copy from shared memory is just going to be + * overwritten, but the struct's not so large that it's worth the + * maintenance hassle to copy only the needful fields. + */ + memcpy(&lbeentry, + unvolatize(PgBackendStatus *, vbeentry), + sizeof(PgBackendStatus)); + + /* These structs can just start from zeroes each time, though */ +#ifdef USE_SSL + memset(&lsslstatus, 0, sizeof(lsslstatus)); +#endif +#ifdef ENABLE_GSS + memset(&lgssstatus, 0, sizeof(lgssstatus)); +#endif + + /* + * Now fill in all the fields of lbeentry, except for strings that are + * out-of-line data. Those have to be handled separately, below. + */ + lbeentry.st_procpid = MyProcPid; + lbeentry.st_backendType = MyBackendType; + lbeentry.st_proc_start_timestamp = MyStartTimestamp; + lbeentry.st_activity_start_timestamp = 0; + lbeentry.st_state_start_timestamp = 0; + lbeentry.st_xact_start_timestamp = 0; + lbeentry.st_databaseid = MyDatabaseId; + + /* We have userid for client-backends, wal-sender and bgworker processes */ + if (lbeentry.st_backendType == B_BACKEND + || lbeentry.st_backendType == B_WAL_SENDER + || lbeentry.st_backendType == B_BG_WORKER) + lbeentry.st_userid = GetSessionUserId(); + else + lbeentry.st_userid = InvalidOid; + + /* + * We may not have a MyProcPort (eg, if this is the autovacuum process). + * If so, use all-zeroes client address, which is dealt with specially in + * pg_stat_get_backend_client_addr and pg_stat_get_backend_client_port. 
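+ * (Both of those functions return NULL when they see such an
+ * all-zeroes address.)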
+ */ + if (MyProcPort) + memcpy(&lbeentry.st_clientaddr, &MyProcPort->raddr, + sizeof(lbeentry.st_clientaddr)); + else + MemSet(&lbeentry.st_clientaddr, 0, sizeof(lbeentry.st_clientaddr)); + +#ifdef USE_SSL + if (MyProcPort && MyProcPort->ssl_in_use) + { + lbeentry.st_ssl = true; + lsslstatus.ssl_bits = be_tls_get_cipher_bits(MyProcPort); + strlcpy(lsslstatus.ssl_version, be_tls_get_version(MyProcPort), NAMEDATALEN); + strlcpy(lsslstatus.ssl_cipher, be_tls_get_cipher(MyProcPort), NAMEDATALEN); + be_tls_get_peer_subject_name(MyProcPort, lsslstatus.ssl_client_dn, NAMEDATALEN); + be_tls_get_peer_serial(MyProcPort, lsslstatus.ssl_client_serial, NAMEDATALEN); + be_tls_get_peer_issuer_name(MyProcPort, lsslstatus.ssl_issuer_dn, NAMEDATALEN); + } + else + { + lbeentry.st_ssl = false; + } +#else + lbeentry.st_ssl = false; +#endif + +#ifdef ENABLE_GSS + if (MyProcPort && MyProcPort->gss != NULL) + { + const char *princ = be_gssapi_get_princ(MyProcPort); + + lbeentry.st_gss = true; + lgssstatus.gss_auth = be_gssapi_get_auth(MyProcPort); + lgssstatus.gss_enc = be_gssapi_get_enc(MyProcPort); + if (princ) + strlcpy(lgssstatus.gss_princ, princ, NAMEDATALEN); + } + else + { + lbeentry.st_gss = false; + } +#else + lbeentry.st_gss = false; +#endif + + lbeentry.st_state = STATE_UNDEFINED; + lbeentry.st_progress_command = PROGRESS_COMMAND_INVALID; + lbeentry.st_progress_command_target = InvalidOid; + lbeentry.st_query_id = UINT64CONST(0); + + /* + * we don't zero st_progress_param here to save cycles; nobody should + * examine it until st_progress_command has been set to something other + * than PROGRESS_COMMAND_INVALID + */ + + /* + * We're ready to enter the critical section that fills the shared-memory + * status entry. We follow the protocol of bumping st_changecount before + * and after; and make sure it's even afterwards. We use a volatile + * pointer here to ensure the compiler doesn't try to get cute. + */ + PGSTAT_BEGIN_WRITE_ACTIVITY(vbeentry); + + /* make sure we'll memcpy the same st_changecount back */ + lbeentry.st_changecount = vbeentry->st_changecount; + + memcpy(unvolatize(PgBackendStatus *, vbeentry), + &lbeentry, + sizeof(PgBackendStatus)); + + /* + * We can write the out-of-line strings and structs using the pointers + * that are in lbeentry; this saves some de-volatilizing messiness. + */ + lbeentry.st_appname[0] = '\0'; + if (MyProcPort && MyProcPort->remote_hostname) + strlcpy(lbeentry.st_clienthostname, MyProcPort->remote_hostname, + NAMEDATALEN); + else + lbeentry.st_clienthostname[0] = '\0'; + lbeentry.st_activity_raw[0] = '\0'; + /* Also make sure the last byte in each string area is always 0 */ + lbeentry.st_appname[NAMEDATALEN - 1] = '\0'; + lbeentry.st_clienthostname[NAMEDATALEN - 1] = '\0'; + lbeentry.st_activity_raw[pgstat_track_activity_query_size - 1] = '\0'; + +#ifdef USE_SSL + memcpy(lbeentry.st_sslstatus, &lsslstatus, sizeof(PgBackendSSLStatus)); +#endif +#ifdef ENABLE_GSS + memcpy(lbeentry.st_gssstatus, &lgssstatus, sizeof(PgBackendGSSStatus)); +#endif + + PGSTAT_END_WRITE_ACTIVITY(vbeentry); + + /* Update app name to current GUC setting */ + if (application_name) + pgstat_report_appname(application_name); +} + +/* + * Clear out our entry in the PgBackendStatus array. + */ +static void +pgstat_beshutdown_hook(int code, Datum arg) +{ + volatile PgBackendStatus *beentry = MyBEEntry; + + /* + * Clear my status entry, following the protocol of bumping st_changecount + * before and after. We use a volatile pointer here to ensure the + * compiler doesn't try to get cute. 
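+ *
+ * In outline, the PGSTAT_BEGIN/END_WRITE_ACTIVITY macros implement
+ * (see backend_status.h for the real definitions, which also wrap the
+ * update in a critical section):
+ *
+ *     beentry->st_changecount++;      now odd: entry is unstable
+ *     pg_write_barrier();
+ *     ... modify the entry ...
+ *     pg_write_barrier();
+ *     beentry->st_changecount++;      even again: entry is stable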
+ */ + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); + + beentry->st_procpid = 0; /* mark invalid */ + + PGSTAT_END_WRITE_ACTIVITY(beentry); + + /* so that functions can check if backend_status.c is up via MyBEEntry */ + MyBEEntry = NULL; +} + +/* + * Discard any data collected in the current transaction. Any subsequent + * request will cause new snapshots to be read. + * + * This is also invoked during transaction commit or abort to discard the + * no-longer-wanted snapshot. + */ +void +pgstat_clear_backend_activity_snapshot(void) +{ + /* Release memory, if any was allocated */ + if (backendStatusSnapContext) + { + MemoryContextDelete(backendStatusSnapContext); + backendStatusSnapContext = NULL; + } + + /* Reset variables */ + localBackendStatusTable = NULL; + localNumBackends = 0; +} + +static void +pgstat_setup_backend_status_context(void) +{ + if (!backendStatusSnapContext) + backendStatusSnapContext = AllocSetContextCreate(TopMemoryContext, + "Backend Status Snapshot", + ALLOCSET_SMALL_SIZES); +} + + +/* ---------- + * pgstat_report_activity() - + * + * Called from tcop/postgres.c to report what the backend is actually doing + * (but note cmd_str can be NULL for certain cases). + * + * All updates of the status entry follow the protocol of bumping + * st_changecount before and after. We use a volatile pointer here to + * ensure the compiler doesn't try to get cute. + * ---------- + */ +void +pgstat_report_activity(BackendState state, const char *cmd_str) +{ + volatile PgBackendStatus *beentry = MyBEEntry; + TimestampTz start_timestamp; + TimestampTz current_timestamp; + int len = 0; + + TRACE_POSTGRESQL_STATEMENT_STATUS(cmd_str); + + if (!beentry) + return; + + if (!pgstat_track_activities) + { + if (beentry->st_state != STATE_DISABLED) + { + volatile PGPROC *proc = MyProc; + + /* + * track_activities is disabled, but we last reported a + * non-disabled state. As our final update, change the state and + * clear fields we will not be updating anymore. + */ + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); + beentry->st_state = STATE_DISABLED; + beentry->st_state_start_timestamp = 0; + beentry->st_activity_raw[0] = '\0'; + beentry->st_activity_start_timestamp = 0; + /* st_xact_start_timestamp and wait_event_info are also disabled */ + beentry->st_xact_start_timestamp = 0; + beentry->st_query_id = UINT64CONST(0); + proc->wait_event_info = 0; + PGSTAT_END_WRITE_ACTIVITY(beentry); + } + return; + } + + /* + * To minimize the time spent modifying the entry, and avoid risk of + * errors inside the critical section, fetch all the needed data first. + */ + start_timestamp = GetCurrentStatementStartTimestamp(); + if (cmd_str != NULL) + { + /* + * Compute length of to-be-stored string unaware of multi-byte + * characters. For speed reasons that'll get corrected on read, rather + * than computed every write. + */ + len = Min(strlen(cmd_str), pgstat_track_activity_query_size - 1); + } + current_timestamp = GetCurrentTimestamp(); + + /* + * If the state has changed from "active" or "idle in transaction", + * calculate the duration. 
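+ * (The resulting microsecond counts feed pg_stat_database's
+ * active_time and idle_in_transaction_time columns, via
+ * pgstat_count_conn_active_time() and pgstat_count_conn_txn_idle_time().)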
+ */ + if ((beentry->st_state == STATE_RUNNING || + beentry->st_state == STATE_FASTPATH || + beentry->st_state == STATE_IDLEINTRANSACTION || + beentry->st_state == STATE_IDLEINTRANSACTION_ABORTED) && + state != beentry->st_state) + { + long secs; + int usecs; + + TimestampDifference(beentry->st_state_start_timestamp, + current_timestamp, + &secs, &usecs); + + if (beentry->st_state == STATE_RUNNING || + beentry->st_state == STATE_FASTPATH) + pgstat_count_conn_active_time((PgStat_Counter) secs * 1000000 + usecs); + else + pgstat_count_conn_txn_idle_time((PgStat_Counter) secs * 1000000 + usecs); + } + + /* + * Now update the status entry + */ + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); + + beentry->st_state = state; + beentry->st_state_start_timestamp = current_timestamp; + + /* + * If a new query is started, we reset the query identifier as it'll only + * be known after parse analysis, to avoid reporting last query's + * identifier. + */ + if (state == STATE_RUNNING) + beentry->st_query_id = UINT64CONST(0); + + if (cmd_str != NULL) + { + memcpy((char *) beentry->st_activity_raw, cmd_str, len); + beentry->st_activity_raw[len] = '\0'; + beentry->st_activity_start_timestamp = start_timestamp; + } + + PGSTAT_END_WRITE_ACTIVITY(beentry); +} + +/* -------- + * pgstat_report_query_id() - + * + * Called to update top-level query identifier. + * -------- + */ +void +pgstat_report_query_id(uint64 query_id, bool force) +{ + volatile PgBackendStatus *beentry = MyBEEntry; + + /* + * if track_activities is disabled, st_query_id should already have been + * reset + */ + if (!beentry || !pgstat_track_activities) + return; + + /* + * We only report the top-level query identifiers. The stored query_id is + * reset when a backend calls pgstat_report_activity(STATE_RUNNING), or + * with an explicit call to this function using the force flag. If the + * saved query identifier is not zero it means that it's not a top-level + * command, so ignore the one provided unless it's an explicit call to + * reset the identifier. + */ + if (beentry->st_query_id != 0 && !force) + return; + + /* + * Update my status entry, following the protocol of bumping + * st_changecount before and after. We use a volatile pointer here to + * ensure the compiler doesn't try to get cute. + */ + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); + beentry->st_query_id = query_id; + PGSTAT_END_WRITE_ACTIVITY(beentry); +} + + +/* ---------- + * pgstat_report_appname() - + * + * Called to update our application name. + * ---------- + */ +void +pgstat_report_appname(const char *appname) +{ + volatile PgBackendStatus *beentry = MyBEEntry; + int len; + + if (!beentry) + return; + + /* This should be unnecessary if GUC did its job, but be safe */ + len = pg_mbcliplen(appname, strlen(appname), NAMEDATALEN - 1); + + /* + * Update my status entry, following the protocol of bumping + * st_changecount before and after. We use a volatile pointer here to + * ensure the compiler doesn't try to get cute. + */ + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); + + memcpy((char *) beentry->st_appname, appname, len); + beentry->st_appname[len] = '\0'; + + PGSTAT_END_WRITE_ACTIVITY(beentry); +} + +/* + * Report current transaction start timestamp as the specified value. + * Zero means there is no active transaction. + */ +void +pgstat_report_xact_timestamp(TimestampTz tstamp) +{ + volatile PgBackendStatus *beentry = MyBEEntry; + + if (!pgstat_track_activities || !beentry) + return; + + /* + * Update my status entry, following the protocol of bumping + * st_changecount before and after. 
We use a volatile pointer here to + * ensure the compiler doesn't try to get cute. + */ + PGSTAT_BEGIN_WRITE_ACTIVITY(beentry); + + beentry->st_xact_start_timestamp = tstamp; + + PGSTAT_END_WRITE_ACTIVITY(beentry); +} + +/* ---------- + * pgstat_read_current_status() - + * + * Copy the current contents of the PgBackendStatus array to local memory, + * if not already done in this transaction. + * ---------- + */ +static void +pgstat_read_current_status(void) +{ + volatile PgBackendStatus *beentry; + LocalPgBackendStatus *localtable; + LocalPgBackendStatus *localentry; + char *localappname, + *localclienthostname, + *localactivity; +#ifdef USE_SSL + PgBackendSSLStatus *localsslstatus; +#endif +#ifdef ENABLE_GSS + PgBackendGSSStatus *localgssstatus; +#endif + int i; + + if (localBackendStatusTable) + return; /* already done */ + + pgstat_setup_backend_status_context(); + + /* + * Allocate storage for local copy of state data. We can presume that + * none of these requests overflow size_t, because we already calculated + * the same values using mul_size during shmem setup. However, with + * probably-silly values of pgstat_track_activity_query_size and + * max_connections, the localactivity buffer could exceed 1GB, so use + * "huge" allocation for that one. + */ + localtable = (LocalPgBackendStatus *) + MemoryContextAlloc(backendStatusSnapContext, + sizeof(LocalPgBackendStatus) * NumBackendStatSlots); + localappname = (char *) + MemoryContextAlloc(backendStatusSnapContext, + NAMEDATALEN * NumBackendStatSlots); + localclienthostname = (char *) + MemoryContextAlloc(backendStatusSnapContext, + NAMEDATALEN * NumBackendStatSlots); + localactivity = (char *) + MemoryContextAllocHuge(backendStatusSnapContext, + (Size) pgstat_track_activity_query_size * + (Size) NumBackendStatSlots); +#ifdef USE_SSL + localsslstatus = (PgBackendSSLStatus *) + MemoryContextAlloc(backendStatusSnapContext, + sizeof(PgBackendSSLStatus) * NumBackendStatSlots); +#endif +#ifdef ENABLE_GSS + localgssstatus = (PgBackendGSSStatus *) + MemoryContextAlloc(backendStatusSnapContext, + sizeof(PgBackendGSSStatus) * NumBackendStatSlots); +#endif + + localNumBackends = 0; + + beentry = BackendStatusArray; + localentry = localtable; + for (i = 1; i <= NumBackendStatSlots; i++) + { + /* + * Follow the protocol of retrying if st_changecount changes while we + * copy the entry, or if it's odd. (The check for odd is needed to + * cover the case where we are able to completely copy the entry while + * the source backend is between increment steps.) We use a volatile + * pointer here to ensure the compiler doesn't try to get cute. + */ + for (;;) + { + int before_changecount; + int after_changecount; + + pgstat_begin_read_activity(beentry, before_changecount); + + localentry->backendStatus.st_procpid = beentry->st_procpid; + /* Skip all the data-copying work if entry is not in use */ + if (localentry->backendStatus.st_procpid > 0) + { + memcpy(&localentry->backendStatus, unvolatize(PgBackendStatus *, beentry), sizeof(PgBackendStatus)); + + /* + * For each PgBackendStatus field that is a pointer, copy the + * pointed-to data, then adjust the local copy of the pointer + * field to point at the local copy of the data. + * + * strcpy is safe even if the string is modified concurrently, + * because there's always a \0 at the end of the buffer. 
+ */ + strcpy(localappname, (char *) beentry->st_appname); + localentry->backendStatus.st_appname = localappname; + strcpy(localclienthostname, (char *) beentry->st_clienthostname); + localentry->backendStatus.st_clienthostname = localclienthostname; + strcpy(localactivity, (char *) beentry->st_activity_raw); + localentry->backendStatus.st_activity_raw = localactivity; +#ifdef USE_SSL + if (beentry->st_ssl) + { + memcpy(localsslstatus, beentry->st_sslstatus, sizeof(PgBackendSSLStatus)); + localentry->backendStatus.st_sslstatus = localsslstatus; + } +#endif +#ifdef ENABLE_GSS + if (beentry->st_gss) + { + memcpy(localgssstatus, beentry->st_gssstatus, sizeof(PgBackendGSSStatus)); + localentry->backendStatus.st_gssstatus = localgssstatus; + } +#endif + } + + pgstat_end_read_activity(beentry, after_changecount); + + if (pgstat_read_activity_complete(before_changecount, + after_changecount)) + break; + + /* Make sure we can break out of loop if stuck... */ + CHECK_FOR_INTERRUPTS(); + } + + beentry++; + /* Only valid entries get included into the local array */ + if (localentry->backendStatus.st_procpid > 0) + { + BackendIdGetTransactionIds(i, + &localentry->backend_xid, + &localentry->backend_xmin); + + localentry++; + localappname += NAMEDATALEN; + localclienthostname += NAMEDATALEN; + localactivity += pgstat_track_activity_query_size; +#ifdef USE_SSL + localsslstatus++; +#endif +#ifdef ENABLE_GSS + localgssstatus++; +#endif + localNumBackends++; + } + } + + /* Set the pointer only after completion of a valid table */ + localBackendStatusTable = localtable; +} + + +/* ---------- + * pgstat_get_backend_current_activity() - + * + * Return a string representing the current activity of the backend with + * the specified PID. This looks directly at the BackendStatusArray, + * and so will provide current information regardless of the age of our + * transaction's snapshot of the status array. + * + * It is the caller's responsibility to invoke this only for backends whose + * state is expected to remain stable while the result is in use. The + * only current use is in deadlock reporting, where we can expect that + * the target backend is blocked on a lock. (There are corner cases + * where the target's wait could get aborted while we are looking at it, + * but the very worst consequence is to return a pointer to a string + * that's been changed, so we won't worry too much.) + * + * Note: return strings for special cases match pg_stat_get_backend_activity. + * ---------- + */ +const char * +pgstat_get_backend_current_activity(int pid, bool checkUser) +{ + PgBackendStatus *beentry; + int i; + + beentry = BackendStatusArray; + for (i = 1; i <= MaxBackends; i++) + { + /* + * Although we expect the target backend's entry to be stable, that + * doesn't imply that anyone else's is. To avoid identifying the + * wrong backend, while we check for a match to the desired PID we + * must follow the protocol of retrying if st_changecount changes + * while we examine the entry, or if it's odd. (This might be + * unnecessary, since fetching or storing an int is almost certainly + * atomic, but let's play it safe.) We use a volatile pointer here to + * ensure the compiler doesn't try to get cute. 
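+ *
+ * Roughly, the read-side macros used below amount to this sketch (see
+ * backend_status.h for the real definitions):
+ *
+ *     before = beentry->st_changecount;    followed by a read barrier
+ *     ... examine the entry ...
+ *     after = beentry->st_changecount;     preceded by a read barrier
+ *
+ * and the read is complete once before == after and before is even.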
+ */ + volatile PgBackendStatus *vbeentry = beentry; + bool found; + + for (;;) + { + int before_changecount; + int after_changecount; + + pgstat_begin_read_activity(vbeentry, before_changecount); + + found = (vbeentry->st_procpid == pid); + + pgstat_end_read_activity(vbeentry, after_changecount); + + if (pgstat_read_activity_complete(before_changecount, + after_changecount)) + break; + + /* Make sure we can break out of loop if stuck... */ + CHECK_FOR_INTERRUPTS(); + } + + if (found) + { + /* Now it is safe to use the non-volatile pointer */ + if (checkUser && !superuser() && beentry->st_userid != GetUserId()) + return "<insufficient privilege>"; + else if (*(beentry->st_activity_raw) == '\0') + return "<command string not enabled>"; + else + { + /* this'll leak a bit of memory, but that seems acceptable */ + return pgstat_clip_activity(beentry->st_activity_raw); + } + } + + beentry++; + } + + /* If we get here, caller is in error ... */ + return "<backend information not available>"; +} + +/* ---------- + * pgstat_get_crashed_backend_activity() - + * + * Return a string representing the current activity of the backend with + * the specified PID. Like the function above, but reads shared memory with + * the expectation that it may be corrupt. On success, copy the string + * into the "buffer" argument and return that pointer. On failure, + * return NULL. + * + * This function is only intended to be used by the postmaster to report the + * query that crashed a backend. In particular, no attempt is made to + * follow the correct concurrency protocol when accessing the + * BackendStatusArray. But that's OK, in the worst case we'll return a + * corrupted message. We also must take care not to trip on ereport(ERROR). + * ---------- + */ +const char * +pgstat_get_crashed_backend_activity(int pid, char *buffer, int buflen) +{ + volatile PgBackendStatus *beentry; + int i; + + beentry = BackendStatusArray; + + /* + * We probably shouldn't get here before shared memory has been set up, + * but be safe. + */ + if (beentry == NULL || BackendActivityBuffer == NULL) + return NULL; + + for (i = 1; i <= MaxBackends; i++) + { + if (beentry->st_procpid == pid) + { + /* Read pointer just once, so it can't change after validation */ + const char *activity = beentry->st_activity_raw; + const char *activity_last; + + /* + * We mustn't access activity string before we verify that it + * falls within the BackendActivityBuffer. To make sure that the + * entire string including its ending is contained within the + * buffer, subtract one activity length from the buffer size. + */ + activity_last = BackendActivityBuffer + BackendActivityBufferSize + - pgstat_track_activity_query_size; + + if (activity < BackendActivityBuffer || + activity > activity_last) + return NULL; + + /* If no string available, no point in a report */ + if (activity[0] == '\0') + return NULL; + + /* + * Copy only ASCII-safe characters so we don't run into encoding + * problems when reporting the message; and be sure not to run off + * the end of memory. As only ASCII characters are reported, it + * doesn't seem necessary to perform multibyte aware clipping. + */ + ascii_safe_strlcpy(buffer, activity, + Min(buflen, pgstat_track_activity_query_size)); + + return buffer; + } + + beentry++; + } + + /* PID not found */ + return NULL; +} + +/* ---------- + * pgstat_get_my_query_id() - + * + * Return current backend's query identifier. 
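+ * (This is the value exposed as pg_stat_activity.query_id; it stays
+ * zero unless compute_query_id is enabled or an extension computes
+ * query identifiers.)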
+ */ +uint64 +pgstat_get_my_query_id(void) +{ + if (!MyBEEntry) + return 0; + + /* + * There's no need for a lock around pgstat_begin_read_activity / + * pgstat_end_read_activity here as it's only called from + * pg_stat_get_activity which is already protected, or from the same + * backend which means that there won't be concurrent writes. + */ + return MyBEEntry->st_query_id; +} + + +/* ---------- + * pgstat_fetch_stat_beentry() - + * + * Support function for the SQL-callable pgstat* functions. Returns + * our local copy of the current-activity entry for one backend. + * + * NB: caller is responsible for a check if the user is permitted to see + * this info (especially the querystring). + * ---------- + */ +PgBackendStatus * +pgstat_fetch_stat_beentry(int beid) +{ + pgstat_read_current_status(); + + if (beid < 1 || beid > localNumBackends) + return NULL; + + return &localBackendStatusTable[beid - 1].backendStatus; +} + + +/* ---------- + * pgstat_fetch_stat_local_beentry() - + * + * Like pgstat_fetch_stat_beentry() but with locally computed additions (like + * xid and xmin values of the backend) + * + * NB: caller is responsible for a check if the user is permitted to see + * this info (especially the querystring). + * ---------- + */ +LocalPgBackendStatus * +pgstat_fetch_stat_local_beentry(int beid) +{ + pgstat_read_current_status(); + + if (beid < 1 || beid > localNumBackends) + return NULL; + + return &localBackendStatusTable[beid - 1]; +} + + +/* ---------- + * pgstat_fetch_stat_numbackends() - + * + * Support function for the SQL-callable pgstat* functions. Returns + * the maximum current backend id. + * ---------- + */ +int +pgstat_fetch_stat_numbackends(void) +{ + pgstat_read_current_status(); + + return localNumBackends; +} + +/* + * Convert a potentially unsafely truncated activity string (see + * PgBackendStatus.st_activity_raw's documentation) into a correctly truncated + * one. + * + * The returned string is allocated in the caller's memory context and may be + * freed. + */ +char * +pgstat_clip_activity(const char *raw_activity) +{ + char *activity; + int rawlen; + int cliplen; + + /* + * Some callers, like pgstat_get_backend_current_activity(), do not + * guarantee that the buffer isn't concurrently modified. We try to take + * care that the buffer is always terminated by a NUL byte regardless, but + * let's still be paranoid about the string's length. In those cases the + * underlying buffer is guaranteed to be pgstat_track_activity_query_size + * large. + */ + activity = pnstrdup(raw_activity, pgstat_track_activity_query_size - 1); + + /* now double-guaranteed to be NUL terminated */ + rawlen = strlen(activity); + + /* + * All supported server-encodings make it possible to determine the length + * of a multi-byte character from its first byte (this is not the case for + * client encodings, see GB18030). As st_activity is always stored using + * server encoding, this allows us to perform multi-byte aware truncation, + * even if the string earlier was truncated in the middle of a multi-byte + * character. + */ + cliplen = pg_mbcliplen(activity, rawlen, + pgstat_track_activity_query_size - 1); + + activity[cliplen] = '\0'; + + return activity; +} diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c new file mode 100644 index 0000000..84d65a7 --- /dev/null +++ b/src/backend/utils/activity/pgstat.c @@ -0,0 +1,1678 @@ +/* ---------- + * pgstat.c + * Infrastructure for the cumulative statistics system. 
+ * + * The cumulative statistics system accumulates statistics for different kinds + * of objects. Some kinds of statistics are collected for a fixed number of + * objects (most commonly 1), e.g., checkpointer statistics. Other kinds of + * statistics are collected for a varying number of objects + * (e.g. relations). See PgStat_KindInfo for a list of currently handled + * statistics. + * + * Statistics are loaded from the filesystem during startup (by the startup + * process), unless preceded by a crash, in which case all stats are + * discarded. They are written out by the checkpointer process just before + * shutting down, except when shutting down in immediate mode. + * + * Fixed-numbered stats are stored in plain (non-dynamic) shared memory. + * + * Statistics for variable-numbered objects are stored in dynamic shared + * memory and can be found via a dshash hashtable. The statistics counters are + * not part of the dshash entry (PgStatShared_HashEntry) directly, but are + * separately allocated (PgStatShared_HashEntry->body). The separate + * allocation allows different kinds of statistics to be stored in the same + * hashtable without wasting space in PgStatShared_HashEntry. + * + * Variable-numbered stats are addressed by PgStat_HashKey while running. It + * is not possible to have statistics for an object that cannot be addressed + * that way at runtime. A wider identifier can be used when serializing to + * disk (used for replication slot stats). + * + * To avoid contention on the shared hashtable, each backend has a + * backend-local hashtable (pgStatEntryRefHash) in front of the shared + * hashtable, containing references (PgStat_EntryRef) to shared hashtable + * entries. The shared hashtable only needs to be accessed when no prior + * reference is found in the local hashtable. Besides pointing to the + * shared hashtable entry (PgStatShared_HashEntry) PgStat_EntryRef also + * contains a pointer to the shared statistics data, as a process-local + * address, to reduce access costs. + * + * The names for structs stored in shared memory are prefixed with + * PgStatShared instead of PgStat. Each stats entry in shared memory is + * protected by a dedicated lwlock. + * + * Most stats updates are first accumulated locally in each process as pending + * entries, then later flushed to shared memory (just after commit, or by + * idle-timeout). This practically eliminates contention on individual stats + * entries. For most kinds of variable-numbered pending stats data is stored + * in PgStat_EntryRef->pending. All entries with pending data are in the + * pgStatPending list. Pending statistics updates are flushed out by + * pgstat_report_stat(). + * + * The behavior of different kinds of statistics is determined by the kind's + * entry in pgstat_kind_infos, see PgStat_KindInfo for details. + * + * The consistency of read accesses to statistics can be configured using the + * stats_fetch_consistency GUC (see config.sgml and monitoring.sgml for the + * settings). When using PGSTAT_FETCH_CONSISTENCY_CACHE or + * PGSTAT_FETCH_CONSISTENCY_SNAPSHOT statistics are stored in + * pgStatLocal.snapshot. + * + * To keep things manageable, stats handling is split across several + * files. 
Infrastructure pieces are in: + * - pgstat.c - this file, to tie it all together + * - pgstat_shmem.c - nearly everything dealing with shared memory, including + * the maintenance of hashtable entries + * - pgstat_xact.c - transactional integration, including the transactional + * creation and dropping of stats entries + * + * Each statistics kind is handled in a dedicated file: + * - pgstat_archiver.c + * - pgstat_bgwriter.c + * - pgstat_checkpointer.c + * - pgstat_database.c + * - pgstat_function.c + * - pgstat_relation.c + * - pgstat_replslot.c + * - pgstat_slru.c + * - pgstat_subscription.c + * - pgstat_wal.c + * + * Whenever possible infrastructure files should not contain code related to + * specific kinds of stats. + * + * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/activity/pgstat.c + * ---------- + */ +#include "postgres.h" + +#include <unistd.h> + +#include "access/transam.h" +#include "access/xact.h" +#include "lib/dshash.h" +#include "pgstat.h" +#include "port/atomics.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/pg_shmem.h" +#include "storage/shmem.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/pgstat_internal.h" +#include "utils/timestamp.h" + + +/* ---------- + * Timer definitions. + * + * In milliseconds. + * ---------- + */ + +/* minimum interval non-forced stats flushes.*/ +#define PGSTAT_MIN_INTERVAL 1000 +/* how long until to block flushing pending stats updates */ +#define PGSTAT_MAX_INTERVAL 60000 +/* when to call pgstat_report_stat() again, even when idle */ +#define PGSTAT_IDLE_INTERVAL 10000 + +/* ---------- + * Initial size hints for the hash tables used in statistics. + * ---------- + */ + +#define PGSTAT_SNAPSHOT_HASH_SIZE 512 + + +/* hash table for statistics snapshots entry */ +typedef struct PgStat_SnapshotEntry +{ + PgStat_HashKey key; + char status; /* for simplehash use */ + void *data; /* the stats data itself */ +} PgStat_SnapshotEntry; + + +/* ---------- + * Backend-local Hash Table Definitions + * ---------- + */ + +/* for stats snapshot entries */ +#define SH_PREFIX pgstat_snapshot +#define SH_ELEMENT_TYPE PgStat_SnapshotEntry +#define SH_KEY_TYPE PgStat_HashKey +#define SH_KEY key +#define SH_HASH_KEY(tb, key) \ + pgstat_hash_hash_key(&key, sizeof(PgStat_HashKey), NULL) +#define SH_EQUAL(tb, a, b) \ + pgstat_cmp_hash_key(&a, &b, sizeof(PgStat_HashKey), NULL) == 0 +#define SH_SCOPE static inline +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + + +/* ---------- + * Local function forward declarations + * ---------- + */ + +static void pgstat_write_statsfile(void); +static void pgstat_read_statsfile(void); + +static void pgstat_reset_after_failure(void); + +static bool pgstat_flush_pending_entries(bool nowait); + +static void pgstat_prep_snapshot(void); +static void pgstat_build_snapshot(void); +static void pgstat_build_snapshot_fixed(PgStat_Kind kind); + +static inline bool pgstat_is_kind_valid(int ikind); + + +/* ---------- + * GUC parameters + * ---------- + */ + +bool pgstat_track_counts = false; +int pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_CACHE; + + +/* ---------- + * state shared with pgstat_*.c + * ---------- + */ + +PgStat_LocalState pgStatLocal; + + +/* ---------- + * Local data + * + * NB: There should be only variables related to stats infrastructure here, + * not for specific kinds of stats. 
+ * ---------- + */ + +/* + * Memory contexts containing the pgStatEntryRefHash table, the + * pgStatSharedRef entries, and pending data respectively. Mostly to make it + * easier to track / attribute memory usage. + */ + +static MemoryContext pgStatPendingContext = NULL; + +/* + * Backend local list of PgStat_EntryRef with unflushed pending stats. + * + * Newly pending entries should only ever be added to the end of the list, + * otherwise pgstat_flush_pending_entries() might not see them immediately. + */ +static dlist_head pgStatPending = DLIST_STATIC_INIT(pgStatPending); + + +/* + * Force the next stats flush to happen regardless of + * PGSTAT_MIN_INTERVAL. Useful in test scripts. + */ +static bool pgStatForceNextFlush = false; + +/* + * Force-clear existing snapshot before next use when stats_fetch_consistency + * is changed. + */ +static bool force_stats_snapshot_clear = false; + + +/* + * For assertions that check pgstat is not used before initialization / after + * shutdown. + */ +#ifdef USE_ASSERT_CHECKING +static bool pgstat_is_initialized = false; +static bool pgstat_is_shutdown = false; +#endif + + +/* + * The different kinds of statistics. + * + * If reasonably possible, handling specific to one kind of stats should go + * through this abstraction, rather than making more of pgstat.c aware. + * + * See comments for struct PgStat_KindInfo for details about the individual + * fields. + * + * XXX: It'd be nicer to define this outside of this file. But there doesn't + * seem to be a great way of doing that, given the split across multiple + * files. + */ +static const PgStat_KindInfo pgstat_kind_infos[PGSTAT_NUM_KINDS] = { + + /* stats kinds for variable-numbered objects */ + + [PGSTAT_KIND_DATABASE] = { + .name = "database", + + .fixed_amount = false, + /* so pg_stat_database entries can be seen in all databases */ + .accessed_across_databases = true, + + .shared_size = sizeof(PgStatShared_Database), + .shared_data_off = offsetof(PgStatShared_Database, stats), + .shared_data_len = sizeof(((PgStatShared_Database *) 0)->stats), + .pending_size = sizeof(PgStat_StatDBEntry), + + .flush_pending_cb = pgstat_database_flush_cb, + .reset_timestamp_cb = pgstat_database_reset_timestamp_cb, + }, + + [PGSTAT_KIND_RELATION] = { + .name = "relation", + + .fixed_amount = false, + + .shared_size = sizeof(PgStatShared_Relation), + .shared_data_off = offsetof(PgStatShared_Relation, stats), + .shared_data_len = sizeof(((PgStatShared_Relation *) 0)->stats), + .pending_size = sizeof(PgStat_TableStatus), + + .flush_pending_cb = pgstat_relation_flush_cb, + .delete_pending_cb = pgstat_relation_delete_pending_cb, + }, + + [PGSTAT_KIND_FUNCTION] = { + .name = "function", + + .fixed_amount = false, + + .shared_size = sizeof(PgStatShared_Function), + .shared_data_off = offsetof(PgStatShared_Function, stats), + .shared_data_len = sizeof(((PgStatShared_Function *) 0)->stats), + .pending_size = sizeof(PgStat_BackendFunctionEntry), + + .flush_pending_cb = pgstat_function_flush_cb, + }, + + [PGSTAT_KIND_REPLSLOT] = { + .name = "replslot", + + .fixed_amount = false, + + .accessed_across_databases = true, + .named_on_disk = true, + + .shared_size = sizeof(PgStatShared_ReplSlot), + .shared_data_off = offsetof(PgStatShared_ReplSlot, stats), + .shared_data_len = sizeof(((PgStatShared_ReplSlot *) 0)->stats), + + .reset_timestamp_cb = pgstat_replslot_reset_timestamp_cb, + .to_serialized_name = pgstat_replslot_to_serialized_name_cb, + .from_serialized_name = pgstat_replslot_from_serialized_name_cb, + }, + + 
[PGSTAT_KIND_SUBSCRIPTION] = { + .name = "subscription", + + .fixed_amount = false, + /* so pg_stat_subscription_stats entries can be seen in all databases */ + .accessed_across_databases = true, + + .shared_size = sizeof(PgStatShared_Subscription), + .shared_data_off = offsetof(PgStatShared_Subscription, stats), + .shared_data_len = sizeof(((PgStatShared_Subscription *) 0)->stats), + .pending_size = sizeof(PgStat_BackendSubEntry), + + .flush_pending_cb = pgstat_subscription_flush_cb, + .reset_timestamp_cb = pgstat_subscription_reset_timestamp_cb, + }, + + + /* stats for fixed-numbered (mostly 1) objects */ + + [PGSTAT_KIND_ARCHIVER] = { + .name = "archiver", + + .fixed_amount = true, + + .reset_all_cb = pgstat_archiver_reset_all_cb, + .snapshot_cb = pgstat_archiver_snapshot_cb, + }, + + [PGSTAT_KIND_BGWRITER] = { + .name = "bgwriter", + + .fixed_amount = true, + + .reset_all_cb = pgstat_bgwriter_reset_all_cb, + .snapshot_cb = pgstat_bgwriter_snapshot_cb, + }, + + [PGSTAT_KIND_CHECKPOINTER] = { + .name = "checkpointer", + + .fixed_amount = true, + + .reset_all_cb = pgstat_checkpointer_reset_all_cb, + .snapshot_cb = pgstat_checkpointer_snapshot_cb, + }, + + [PGSTAT_KIND_SLRU] = { + .name = "slru", + + .fixed_amount = true, + + .reset_all_cb = pgstat_slru_reset_all_cb, + .snapshot_cb = pgstat_slru_snapshot_cb, + }, + + [PGSTAT_KIND_WAL] = { + .name = "wal", + + .fixed_amount = true, + + .reset_all_cb = pgstat_wal_reset_all_cb, + .snapshot_cb = pgstat_wal_snapshot_cb, + }, +}; + + +/* ------------------------------------------------------------ + * Functions managing the state of the stats system for all backends. + * ------------------------------------------------------------ + */ + +/* + * Read on-disk stats into memory at server start. + * + * Should only be called by the startup process or in single user mode. + */ +void +pgstat_restore_stats(void) +{ + pgstat_read_statsfile(); +} + +/* + * Remove the stats file. This is currently used only if WAL recovery is + * needed after a crash. + * + * Should only be called by the startup process or in single user mode. + */ +void +pgstat_discard_stats(void) +{ + int ret; + + /* NB: this needs to be done even in single user mode */ + + ret = unlink(PGSTAT_STAT_PERMANENT_FILENAME); + if (ret != 0) + { + if (errno == ENOENT) + elog(DEBUG2, + "didn't need to unlink permanent stats file \"%s\" - didn't exist", + PGSTAT_STAT_PERMANENT_FILENAME); + else + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not unlink permanent statistics file \"%s\": %m", + PGSTAT_STAT_PERMANENT_FILENAME))); + } + else + { + ereport(DEBUG2, + (errcode_for_file_access(), + errmsg_internal("unlinked permanent statistics file \"%s\"", + PGSTAT_STAT_PERMANENT_FILENAME))); + } + + /* + * Reset stats contents. This will set reset timestamps of fixed-numbered + * stats to the current time (no variable stats exist). + */ + pgstat_reset_after_failure(); +} + +/* + * pgstat_before_server_shutdown() needs to be called by exactly one process + * during regular server shutdowns. Otherwise all stats will be lost. + * + * We currently only write out stats for proc_exit(0). We might want to change + * that at some point... But right now pgstat_discard_stats() would be called + * during the start after a disorderly shutdown, anyway. + */ +void +pgstat_before_server_shutdown(int code, Datum arg) +{ + Assert(pgStatLocal.shmem != NULL); + Assert(!pgStatLocal.shmem->is_shutdown); + + /* + * Stats should only be reported after pgstat_initialize() and before + * pgstat_shutdown(). 
This is a convenient point to catch most violations + * of this rule. + */ + Assert(pgstat_is_initialized && !pgstat_is_shutdown); + + /* flush out our own pending changes before writing out */ + pgstat_report_stat(true); + + /* + * Only write out file during normal shutdown. Don't even signal that + * we've shutdown during irregular shutdowns, because the shutdown + * sequence isn't coordinated to ensure this backend shuts down last. + */ + if (code == 0) + { + pgStatLocal.shmem->is_shutdown = true; + pgstat_write_statsfile(); + } +} + + +/* ------------------------------------------------------------ + * Backend initialization / shutdown functions + * ------------------------------------------------------------ + */ + +/* + * Shut down a single backend's statistics reporting at process exit. + * + * Flush out any remaining statistics counts. Without this, operations + * triggered during backend exit (such as temp table deletions) won't be + * counted. + */ +static void +pgstat_shutdown_hook(int code, Datum arg) +{ + Assert(!pgstat_is_shutdown); + Assert(IsUnderPostmaster || !IsPostmasterEnvironment); + + /* + * If we got as far as discovering our own database ID, we can flush out + * what we did so far. Otherwise, we'd be reporting an invalid database + * ID, so forget it. (This means that accesses to pg_database during + * failed backend starts might never get counted.) + */ + if (OidIsValid(MyDatabaseId)) + pgstat_report_disconnect(MyDatabaseId); + + pgstat_report_stat(true); + + /* there shouldn't be any pending changes left */ + Assert(dlist_is_empty(&pgStatPending)); + dlist_init(&pgStatPending); + + pgstat_detach_shmem(); + +#ifdef USE_ASSERT_CHECKING + pgstat_is_shutdown = true; +#endif +} + +/* + * Initialize pgstats state, and set up our on-proc-exit hook. Called from + * BaseInit(). + * + * NOTE: MyDatabaseId isn't set yet; so the shutdown hook has to be careful. + */ +void +pgstat_initialize(void) +{ + Assert(!pgstat_is_initialized); + + pgstat_attach_shmem(); + + pgstat_init_wal(); + + /* Set up a process-exit hook to clean up */ + before_shmem_exit(pgstat_shutdown_hook, 0); + +#ifdef USE_ASSERT_CHECKING + pgstat_is_initialized = true; +#endif +} + + +/* ------------------------------------------------------------ + * Public functions used by backends follow + * ------------------------------------------------------------ + */ + +/* + * Must be called by processes that performs DML: tcop/postgres.c, logical + * receiver processes, SPI worker, etc. to flush pending statistics updates to + * shared memory. + * + * Unless called with 'force', pending stats updates are flushed happen once + * per PGSTAT_MIN_INTERVAL (1000ms). When not forced, stats flushes do not + * block on lock acquisition, except if stats updates have been pending for + * longer than PGSTAT_MAX_INTERVAL (60000ms). + * + * Whenever pending stats updates remain at the end of pgstat_report_stat() a + * suggested idle timeout is returned. Currently this is always + * PGSTAT_IDLE_INTERVAL (10000ms). Callers can use the returned time to set up + * a timeout after which to call pgstat_report_stat(true), but are not + * required to to do so. + * + * Note that this is called only when not within a transaction, so it is fair + * to use transaction stop time as an approximation of current time. 
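+ *
+ * A sketch of the intended caller pattern (illustrative only; the
+ * variable name is hypothetical):
+ *
+ *     idle_timeout = pgstat_report_stat(false);
+ *     if (idle_timeout != 0)
+ *         ... arrange for pgstat_report_stat(true) to run after
+ *         idle_timeout ms of further idleness ...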
+ */ +long +pgstat_report_stat(bool force) +{ + static TimestampTz pending_since = 0; + static TimestampTz last_flush = 0; + bool partial_flush; + TimestampTz now; + bool nowait; + + pgstat_assert_is_up(); + Assert(!IsTransactionOrTransactionBlock()); + + /* "absorb" the forced flush even if there's nothing to flush */ + if (pgStatForceNextFlush) + { + force = true; + pgStatForceNextFlush = false; + } + + /* Don't expend a clock check if nothing to do */ + if (dlist_is_empty(&pgStatPending) && + !have_slrustats && + !pgstat_have_pending_wal()) + { + Assert(pending_since == 0); + return 0; + } + + /* + * There should never be stats to report once stats are shut down. Can't + * assert that before the checks above, as there is an unconditional + * pgstat_report_stat() call in pgstat_shutdown_hook() - which at least + * the process that ran pgstat_before_server_shutdown() will still call. + */ + Assert(!pgStatLocal.shmem->is_shutdown); + + now = GetCurrentTransactionStopTimestamp(); + + if (!force) + { + if (pending_since > 0 && + TimestampDifferenceExceeds(pending_since, now, PGSTAT_MAX_INTERVAL)) + { + /* don't keep pending updates longer than PGSTAT_MAX_INTERVAL */ + force = true; + } + else if (last_flush > 0 && + !TimestampDifferenceExceeds(last_flush, now, PGSTAT_MIN_INTERVAL)) + { + /* don't flush too frequently */ + if (pending_since == 0) + pending_since = now; + + return PGSTAT_IDLE_INTERVAL; + } + } + + pgstat_update_dbstats(now); + + /* don't wait for lock acquisition when !force */ + nowait = !force; + + partial_flush = false; + + /* flush database / relation / function / ... stats */ + partial_flush |= pgstat_flush_pending_entries(nowait); + + /* flush wal stats */ + partial_flush |= pgstat_flush_wal(nowait); + + /* flush SLRU stats */ + partial_flush |= pgstat_slru_flush(nowait); + + last_flush = now; + + /* + * If some of the pending stats could not be flushed due to lock + * contention, let the caller know when to retry. + */ + if (partial_flush) + { + /* force should have prevented us from getting here */ + Assert(!force); + + /* remember since when stats have been pending */ + if (pending_since == 0) + pending_since = now; + + return PGSTAT_IDLE_INTERVAL; + } + + pending_since = 0; + + return 0; +} + +/* + * Force locally pending stats to be flushed during the next + * pgstat_report_stat() call. This is useful for writing tests. + */ +void +pgstat_force_next_flush(void) +{ + pgStatForceNextFlush = true; +} + +/* + * Only for use by pgstat_reset_counters() + */ +static bool +match_db_entries(PgStatShared_HashEntry *entry, Datum match_data) +{ + return entry->key.dboid == DatumGetObjectId(MyDatabaseId); +} + +/* + * Reset counters for our database. + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +void +pgstat_reset_counters(void) +{ + TimestampTz ts = GetCurrentTimestamp(); + + pgstat_reset_matching_entries(match_db_entries, + ObjectIdGetDatum(MyDatabaseId), + ts); +} + +/* + * Reset a single variable-numbered entry. + * + * If the stats kind is within a database, also reset the database's + * stat_reset_timestamp. + * + * Permission checking for this function is managed through the normal + * GRANT system. 
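+ *
+ * For example (sketch; cf. the SQL-callable wrappers in pgstatfuncs.c),
+ * resetting the counters of a single table boils down to:
+ *
+ *     pgstat_reset(PGSTAT_KIND_RELATION, MyDatabaseId, taboid);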
+ */ +void +pgstat_reset(PgStat_Kind kind, Oid dboid, Oid objoid) +{ + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + TimestampTz ts = GetCurrentTimestamp(); + + /* not needed atm, and doesn't make sense with the current signature */ + Assert(!pgstat_get_kind_info(kind)->fixed_amount); + + /* reset the "single counter" */ + pgstat_reset_entry(kind, dboid, objoid, ts); + + if (!kind_info->accessed_across_databases) + pgstat_reset_database_timestamp(dboid, ts); +} + +/* + * Reset stats for all entries of a kind. + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +void +pgstat_reset_of_kind(PgStat_Kind kind) +{ + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + TimestampTz ts = GetCurrentTimestamp(); + + if (kind_info->fixed_amount) + kind_info->reset_all_cb(ts); + else + pgstat_reset_entries_of_kind(kind, ts); +} + + +/* ------------------------------------------------------------ + * Fetching of stats + * ------------------------------------------------------------ + */ + +/* + * Discard any data collected in the current transaction. Any subsequent + * request will cause new snapshots to be read. + * + * This is also invoked during transaction commit or abort to discard + * the no-longer-wanted snapshot. Updates of stats_fetch_consistency can + * cause this routine to be called. + */ +void +pgstat_clear_snapshot(void) +{ + pgstat_assert_is_up(); + + memset(&pgStatLocal.snapshot.fixed_valid, 0, + sizeof(pgStatLocal.snapshot.fixed_valid)); + pgStatLocal.snapshot.stats = NULL; + pgStatLocal.snapshot.mode = PGSTAT_FETCH_CONSISTENCY_NONE; + + /* Release memory, if any was allocated */ + if (pgStatLocal.snapshot.context) + { + MemoryContextDelete(pgStatLocal.snapshot.context); + + /* Reset variables */ + pgStatLocal.snapshot.context = NULL; + } + + /* + * Historically the backend_status.c facilities lived in this file, and + * were reset with the same function. For now keep it that way, and + * forward the reset request. + */ + pgstat_clear_backend_activity_snapshot(); + + /* Reset this flag, as it may be possible that a cleanup was forced. */ + force_stats_snapshot_clear = false; +} + +void * +pgstat_fetch_entry(PgStat_Kind kind, Oid dboid, Oid objoid) +{ + PgStat_HashKey key; + PgStat_EntryRef *entry_ref; + void *stats_data; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + /* should be called from backends */ + Assert(IsUnderPostmaster || !IsPostmasterEnvironment); + AssertArg(!kind_info->fixed_amount); + + pgstat_prep_snapshot(); + + key.kind = kind; + key.dboid = dboid; + key.objoid = objoid; + + /* if we need to build a full snapshot, do so */ + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + pgstat_build_snapshot(); + + /* if caching is desired, look up in cache */ + if (pgstat_fetch_consistency > PGSTAT_FETCH_CONSISTENCY_NONE) + { + PgStat_SnapshotEntry *entry = NULL; + + entry = pgstat_snapshot_lookup(pgStatLocal.snapshot.stats, key); + + if (entry) + return entry->data; + + /* + * If we built a full snapshot and the key is not in + * pgStatLocal.snapshot.stats, there are no matching stats. 
+ */ + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + return NULL; + } + + pgStatLocal.snapshot.mode = pgstat_fetch_consistency; + + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, false, NULL); + + if (entry_ref == NULL || entry_ref->shared_entry->dropped) + { + /* create empty entry when using PGSTAT_FETCH_CONSISTENCY_CACHE */ + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_CACHE) + { + PgStat_SnapshotEntry *entry = NULL; + bool found; + + entry = pgstat_snapshot_insert(pgStatLocal.snapshot.stats, key, &found); + Assert(!found); + entry->data = NULL; + } + return NULL; + } + + /* + * Allocate in caller's context for PGSTAT_FETCH_CONSISTENCY_NONE, + * otherwise we could quickly end up with a fair bit of memory used due to + * repeated accesses. + */ + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_NONE) + stats_data = palloc(kind_info->shared_data_len); + else + stats_data = MemoryContextAlloc(pgStatLocal.snapshot.context, + kind_info->shared_data_len); + + pgstat_lock_entry_shared(entry_ref, false); + memcpy(stats_data, + pgstat_get_entry_data(kind, entry_ref->shared_stats), + kind_info->shared_data_len); + pgstat_unlock_entry(entry_ref); + + if (pgstat_fetch_consistency > PGSTAT_FETCH_CONSISTENCY_NONE) + { + PgStat_SnapshotEntry *entry = NULL; + bool found; + + entry = pgstat_snapshot_insert(pgStatLocal.snapshot.stats, key, &found); + entry->data = stats_data; + } + + return stats_data; +} + +/* + * If a stats snapshot has been taken, return the timestamp at which that was + * done, and set *have_snapshot to true. Otherwise *have_snapshot is set to + * false. + */ +TimestampTz +pgstat_get_stat_snapshot_timestamp(bool *have_snapshot) +{ + if (force_stats_snapshot_clear) + pgstat_clear_snapshot(); + + if (pgStatLocal.snapshot.mode == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + { + *have_snapshot = true; + return pgStatLocal.snapshot.snapshot_timestamp; + } + + *have_snapshot = false; + + return 0; +} + +bool +pgstat_have_entry(PgStat_Kind kind, Oid dboid, Oid objoid) +{ + /* fixed-numbered stats always exist */ + if (pgstat_get_kind_info(kind)->fixed_amount) + return true; + + return pgstat_get_entry_ref(kind, dboid, objoid, false, NULL) != NULL; +} + +/* + * Ensure snapshot for fixed-numbered 'kind' exists. + * + * Typically used by the pgstat_fetch_* functions for a kind of stats, before + * massaging the data into the desired format. 
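+ *
+ * A typical caller is a thin wrapper like pgstat_fetch_stat_archiver() in
+ * pgstat_archiver.c:
+ *
+ *     pgstat_snapshot_fixed(PGSTAT_KIND_ARCHIVER);
+ *     return &pgStatLocal.snapshot.archiver;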
+ */ +void +pgstat_snapshot_fixed(PgStat_Kind kind) +{ + AssertArg(pgstat_is_kind_valid(kind)); + AssertArg(pgstat_get_kind_info(kind)->fixed_amount); + + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + pgstat_build_snapshot(); + else + pgstat_build_snapshot_fixed(kind); + + Assert(pgStatLocal.snapshot.fixed_valid[kind]); +} + +static void +pgstat_prep_snapshot(void) +{ + if (force_stats_snapshot_clear) + pgstat_clear_snapshot(); + + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_NONE || + pgStatLocal.snapshot.stats != NULL) + return; + + if (!pgStatLocal.snapshot.context) + pgStatLocal.snapshot.context = AllocSetContextCreate(TopMemoryContext, + "PgStat Snapshot", + ALLOCSET_SMALL_SIZES); + + pgStatLocal.snapshot.stats = + pgstat_snapshot_create(pgStatLocal.snapshot.context, + PGSTAT_SNAPSHOT_HASH_SIZE, + NULL); +} + +static void +pgstat_build_snapshot(void) +{ + dshash_seq_status hstat; + PgStatShared_HashEntry *p; + + /* should only be called when we need a snapshot */ + Assert(pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT); + + /* snapshot already built */ + if (pgStatLocal.snapshot.mode == PGSTAT_FETCH_CONSISTENCY_SNAPSHOT) + return; + + pgstat_prep_snapshot(); + + Assert(pgStatLocal.snapshot.stats->members == 0); + + pgStatLocal.snapshot.snapshot_timestamp = GetCurrentTimestamp(); + + /* + * Snapshot all variable stats. + */ + dshash_seq_init(&hstat, pgStatLocal.shared_hash, false); + while ((p = dshash_seq_next(&hstat)) != NULL) + { + PgStat_Kind kind = p->key.kind; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + bool found; + PgStat_SnapshotEntry *entry; + PgStatShared_Common *stats_data; + + /* + * Check if the stats object should be included in the snapshot. + * Unless the stats kind can be accessed from all databases (e.g., + * database stats themselves), we only include stats for the current + * database or objects not associated with a database (e.g. shared + * relations). + */ + if (p->key.dboid != MyDatabaseId && + p->key.dboid != InvalidOid && + !kind_info->accessed_across_databases) + continue; + + if (p->dropped) + continue; + + Assert(pg_atomic_read_u32(&p->refcount) > 0); + + stats_data = dsa_get_address(pgStatLocal.dsa, p->body); + Assert(stats_data); + + entry = pgstat_snapshot_insert(pgStatLocal.snapshot.stats, p->key, &found); + Assert(!found); + + entry->data = MemoryContextAlloc(pgStatLocal.snapshot.context, + kind_info->shared_size); + /* + * Acquire the LWLock directly instead of using + * pg_stat_lock_entry_shared() which requires a reference. + */ + LWLockAcquire(&stats_data->lock, LW_SHARED); + memcpy(entry->data, + pgstat_get_entry_data(kind, stats_data), + kind_info->shared_size); + LWLockRelease(&stats_data->lock); + } + dshash_seq_term(&hstat); + + /* + * Build snapshot of all fixed-numbered stats. 
+ */ + for (int kind = PGSTAT_KIND_FIRST_VALID; kind <= PGSTAT_KIND_LAST; kind++) + { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + if (!kind_info->fixed_amount) + { + Assert(kind_info->snapshot_cb == NULL); + continue; + } + + pgstat_build_snapshot_fixed(kind); + } + + pgStatLocal.snapshot.mode = PGSTAT_FETCH_CONSISTENCY_SNAPSHOT; +} + +static void +pgstat_build_snapshot_fixed(PgStat_Kind kind) +{ + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + Assert(kind_info->fixed_amount); + Assert(kind_info->snapshot_cb != NULL); + + if (pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_NONE) + { + /* rebuild every time */ + pgStatLocal.snapshot.fixed_valid[kind] = false; + } + else if (pgStatLocal.snapshot.fixed_valid[kind]) + { + /* in snapshot mode we shouldn't get called again */ + Assert(pgstat_fetch_consistency == PGSTAT_FETCH_CONSISTENCY_CACHE); + return; + } + + Assert(!pgStatLocal.snapshot.fixed_valid[kind]); + + kind_info->snapshot_cb(); + + Assert(!pgStatLocal.snapshot.fixed_valid[kind]); + pgStatLocal.snapshot.fixed_valid[kind] = true; +} + + +/* ------------------------------------------------------------ + * Backend-local pending stats infrastructure + * ------------------------------------------------------------ + */ + +/* + * Returns the appropriate PgStat_EntryRef, preparing it to receive pending + * stats if not already done. + * + * If created_entry is non-NULL, it'll be set to true if the entry is newly + * created, false otherwise. + */ +PgStat_EntryRef * +pgstat_prep_pending_entry(PgStat_Kind kind, Oid dboid, Oid objoid, bool *created_entry) +{ + PgStat_EntryRef *entry_ref; + + /* need to be able to flush out */ + Assert(pgstat_get_kind_info(kind)->flush_pending_cb != NULL); + + if (unlikely(!pgStatPendingContext)) + { + pgStatPendingContext = + AllocSetContextCreate(TopMemoryContext, + "PgStat Pending", + ALLOCSET_SMALL_SIZES); + } + + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, + true, created_entry); + + if (entry_ref->pending == NULL) + { + size_t entrysize = pgstat_get_kind_info(kind)->pending_size; + + Assert(entrysize != (size_t) -1); + + entry_ref->pending = MemoryContextAllocZero(pgStatPendingContext, entrysize); + dlist_push_tail(&pgStatPending, &entry_ref->pending_node); + } + + return entry_ref; +} + +/* + * Return an existing stats entry, or NULL. + * + * This should only be used as a helper for pgstatfuncs.c - outside of + * that it shouldn't be needed. + */ +PgStat_EntryRef * +pgstat_fetch_pending_entry(PgStat_Kind kind, Oid dboid, Oid objoid) +{ + PgStat_EntryRef *entry_ref; + + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, false, NULL); + + if (entry_ref == NULL || entry_ref->pending == NULL) + return NULL; + + return entry_ref; +} + +void +pgstat_delete_pending_entry(PgStat_EntryRef *entry_ref) +{ + PgStat_Kind kind = entry_ref->shared_entry->key.kind; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + void *pending_data = entry_ref->pending; + + Assert(pending_data != NULL); + /* !fixed_amount stats should be handled explicitly */ + Assert(!pgstat_get_kind_info(kind)->fixed_amount); + + if (kind_info->delete_pending_cb) + kind_info->delete_pending_cb(entry_ref); + + pfree(pending_data); + entry_ref->pending = NULL; + + dlist_delete(&entry_ref->pending_node); +} + +/* + * Flush out pending stats for database objects (databases, relations, + * functions). 
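+ *
+ * As an illustration of the pending-entry lifecycle: a deadlock is first
+ * counted into a backend-local pending entry (pgstat_report_deadlock() in
+ * pgstat_database.c):
+ *
+ *     dbent = pgstat_prep_database_pending(MyDatabaseId);
+ *     dbent->n_deadlocks++;
+ *
+ * and only reaches shared memory when this function invokes the kind's
+ * flush_pending_cb (pgstat_database_flush_cb() for databases).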
+ */ +static bool +pgstat_flush_pending_entries(bool nowait) +{ + bool have_pending = false; + dlist_node *cur = NULL; + + /* + * Need to be a bit careful iterating over the list of pending entries. + * Processing a pending entry may queue further pending entries to the end + * of the list that we want to process, so a simple iteration won't do. + * Further complicating matters is that we want to delete the current + * entry in each iteration from the list if we flushed successfully. + * + * So we just keep track of the next pointer in each loop iteration. + */ + if (!dlist_is_empty(&pgStatPending)) + cur = dlist_head_node(&pgStatPending); + + while (cur) + { + PgStat_EntryRef *entry_ref = + dlist_container(PgStat_EntryRef, pending_node, cur); + PgStat_HashKey key = entry_ref->shared_entry->key; + PgStat_Kind kind = key.kind; + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + bool did_flush; + dlist_node *next; + + Assert(!kind_info->fixed_amount); + Assert(kind_info->flush_pending_cb != NULL); + + /* flush the stats, if possible */ + did_flush = kind_info->flush_pending_cb(entry_ref, nowait); + + Assert(did_flush || nowait); + + /* determine next entry, before deleting the pending entry */ + if (dlist_has_next(&pgStatPending, cur)) + next = dlist_next_node(&pgStatPending, cur); + else + next = NULL; + + /* if successfully flushed, remove entry */ + if (did_flush) + pgstat_delete_pending_entry(entry_ref); + else + have_pending = true; + + cur = next; + } + + Assert(dlist_is_empty(&pgStatPending) == !have_pending); + + return have_pending; +} + + +/* ------------------------------------------------------------ + * Helper / infrastructure functions + * ------------------------------------------------------------ + */ + +PgStat_Kind +pgstat_get_kind_from_str(char *kind_str) +{ + for (int kind = PGSTAT_KIND_FIRST_VALID; kind <= PGSTAT_KIND_LAST; kind++) + { + if (pg_strcasecmp(kind_str, pgstat_kind_infos[kind].name) == 0) + return kind; + } + + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid statistics kind: \"%s\"", kind_str))); + return PGSTAT_KIND_DATABASE; /* avoid compiler warnings */ +} + +static inline bool +pgstat_is_kind_valid(int ikind) +{ + return ikind >= PGSTAT_KIND_FIRST_VALID && ikind <= PGSTAT_KIND_LAST; +} + +const PgStat_KindInfo * +pgstat_get_kind_info(PgStat_Kind kind) +{ + AssertArg(pgstat_is_kind_valid(kind)); + + return &pgstat_kind_infos[kind]; +} + +/* + * Stats should only be reported after pgstat_initialize() and before + * pgstat_shutdown(). This check is put in a few central places to catch + * violations of this rule more easily. + */ +#ifdef USE_ASSERT_CHECKING +void +pgstat_assert_is_up(void) +{ + Assert(pgstat_is_initialized && !pgstat_is_shutdown); +} +#endif + + +/* ------------------------------------------------------------ + * reading and writing of on-disk stats file + * ------------------------------------------------------------ + */ + +/* helpers for pgstat_write_statsfile() */ +static void +write_chunk(FILE *fpout, void *ptr, size_t len) +{ + int rc; + + rc = fwrite(ptr, len, 1, fpout); + + /* we'll check for errors with ferror once at the end */ + (void) rc; +} + +#define write_chunk_s(fpout, ptr) write_chunk(fpout, ptr, sizeof(*ptr)) + +/* + * This function is called in the last process that is accessing the shared + * stats so locking is not required. 
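+ *
+ * Schematically, the file written below contains (editor's summary of the
+ * code that follows):
+ *
+ *     int32 format_id (PGSTAT_FILE_FORMAT_ID)
+ *     fixed-numbered stats structs: archiver, bgwriter, checkpointer,
+ *         SLRU, WAL
+ *     per variable-numbered entry, either
+ *         'S', PgStat_HashKey, entry data
+ *     or  'N', PgStat_Kind, NameData, entry data
+ *     'E' end-of-file marker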
+ */ +static void +pgstat_write_statsfile(void) +{ + FILE *fpout; + int32 format_id; + const char *tmpfile = PGSTAT_STAT_PERMANENT_TMPFILE; + const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME; + dshash_seq_status hstat; + PgStatShared_HashEntry *ps; + + pgstat_assert_is_up(); + + /* we're shutting down, so it's ok to just override this */ + pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_NONE; + + elog(DEBUG2, "writing stats file \"%s\"", statfile); + + /* + * Open the statistics temp file to write out the current values. + */ + fpout = AllocateFile(tmpfile, PG_BINARY_W); + if (fpout == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open temporary statistics file \"%s\": %m", + tmpfile))); + return; + } + + /* + * Write the file header --- currently just a format ID. + */ + format_id = PGSTAT_FILE_FORMAT_ID; + write_chunk_s(fpout, &format_id); + + /* + * XXX: The following could now be generalized to just iterate over + * pgstat_kind_infos instead of knowing about the different kinds of + * stats. + */ + + /* + * Write archiver stats struct + */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_ARCHIVER); + write_chunk_s(fpout, &pgStatLocal.snapshot.archiver); + + /* + * Write bgwriter stats struct + */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_BGWRITER); + write_chunk_s(fpout, &pgStatLocal.snapshot.bgwriter); + + /* + * Write checkpointer stats struct + */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_CHECKPOINTER); + write_chunk_s(fpout, &pgStatLocal.snapshot.checkpointer); + + /* + * Write SLRU stats struct + */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_SLRU); + write_chunk_s(fpout, &pgStatLocal.snapshot.slru); + + /* + * Write WAL stats struct + */ + pgstat_build_snapshot_fixed(PGSTAT_KIND_WAL); + write_chunk_s(fpout, &pgStatLocal.snapshot.wal); + + /* + * Walk through the stats entries + */ + dshash_seq_init(&hstat, pgStatLocal.shared_hash, false); + while ((ps = dshash_seq_next(&hstat)) != NULL) + { + PgStatShared_Common *shstats; + const PgStat_KindInfo *kind_info = NULL; + + CHECK_FOR_INTERRUPTS(); + + /* we may have some "dropped" entries not yet removed, skip them */ + Assert(!ps->dropped); + if (ps->dropped) + continue; + + shstats = (PgStatShared_Common *) dsa_get_address(pgStatLocal.dsa, ps->body); + + kind_info = pgstat_get_kind_info(ps->key.kind); + + /* if not dropped, the valid-entry refcount should exist */ + Assert(pg_atomic_read_u32(&ps->refcount) > 0); + + if (!kind_info->to_serialized_name) + { + /* normal stats entry, identified by PgStat_HashKey */ + fputc('S', fpout); + write_chunk_s(fpout, &ps->key); + } + else + { + /* stats entry identified by name on disk (e.g. slots) */ + NameData name; + + kind_info->to_serialized_name(&ps->key, shstats, &name); + + fputc('N', fpout); + write_chunk_s(fpout, &ps->key.kind); + write_chunk_s(fpout, &name); + } + + /* Write the entry, except for its header part */ + write_chunk(fpout, + pgstat_get_entry_data(ps->key.kind, shstats), + pgstat_get_entry_len(ps->key.kind)); + } + dshash_seq_term(&hstat); + + /* + * No more output to be done. Close the temp file and replace the old + * pgstat.stat with it. The ferror() check replaces testing for error + * after each individual fputc or fwrite (in write_chunk()) above. 
+ */ + fputc('E', fpout); + + if (ferror(fpout)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write temporary statistics file \"%s\": %m", + tmpfile))); + FreeFile(fpout); + unlink(tmpfile); + } + else if (FreeFile(fpout) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not close temporary statistics file \"%s\": %m", + tmpfile))); + unlink(tmpfile); + } + else if (rename(tmpfile, statfile) < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not rename temporary statistics file \"%s\" to \"%s\": %m", + tmpfile, statfile))); + unlink(tmpfile); + } +} + +/* helpers for pgstat_read_statsfile() */ +static bool +read_chunk(FILE *fpin, void *ptr, size_t len) +{ + return fread(ptr, 1, len, fpin) == len; +} + +#define read_chunk_s(fpin, ptr) read_chunk(fpin, ptr, sizeof(*ptr)) + +/* + * Reads an existing statistics file into the shared stats hash. + * + * This function is called in the only process that is accessing the shared + * stats, so locking is not required. + */ +static void +pgstat_read_statsfile(void) +{ + FILE *fpin; + int32 format_id; + bool found; + const char *statfile = PGSTAT_STAT_PERMANENT_FILENAME; + PgStat_ShmemControl *shmem = pgStatLocal.shmem; + + /* shouldn't be called from postmaster */ + Assert(IsUnderPostmaster || !IsPostmasterEnvironment); + + elog(DEBUG2, "reading stats file \"%s\"", statfile); + + /* + * Try to open the stats file. If it doesn't exist, the backend simply + * returns zero for anything and statistics simply start from scratch + * with empty counters. + * + * ENOENT is a possibility if stats collection was previously disabled or + * the stats file has not yet been written for the first time. Any other + * failure condition is suspicious. + */ + if ((fpin = AllocateFile(statfile, PG_BINARY_R)) == NULL) + { + if (errno != ENOENT) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open statistics file \"%s\": %m", + statfile))); + pgstat_reset_after_failure(); + return; + } + + /* + * Verify it's of the expected format. + */ + if (!read_chunk_s(fpin, &format_id) || + format_id != PGSTAT_FILE_FORMAT_ID) + goto error; + + /* + * XXX: The following could now be generalized to just iterate over + * pgstat_kind_infos instead of knowing about the different kinds of + * stats. + */ + + /* + * Read archiver stats struct + */ + if (!read_chunk_s(fpin, &shmem->archiver.stats)) + goto error; + + /* + * Read bgwriter stats struct + */ + if (!read_chunk_s(fpin, &shmem->bgwriter.stats)) + goto error; + + /* + * Read checkpointer stats struct + */ + if (!read_chunk_s(fpin, &shmem->checkpointer.stats)) + goto error; + + /* + * Read SLRU stats struct + */ + if (!read_chunk_s(fpin, &shmem->slru.stats)) + goto error; + + /* + * Read WAL stats struct + */ + if (!read_chunk_s(fpin, &shmem->wal.stats)) + goto error; + + /* + * We found an existing statistics file. Read it and put all the hash + * table entries into place. + */ + for (;;) + { + int t = fgetc(fpin); + + switch (t) + { + case 'S': + case 'N': + { + PgStat_HashKey key; + PgStatShared_HashEntry *p; + PgStatShared_Common *header; + + CHECK_FOR_INTERRUPTS(); + + if (t == 'S') + { + /* normal stats entry, identified by PgStat_HashKey */ + if (!read_chunk_s(fpin, &key)) + goto error; + + if (!pgstat_is_kind_valid(key.kind)) + goto error; + } + else + { + /* stats entry identified by name on disk (e.g. 
slots) */ + const PgStat_KindInfo *kind_info = NULL; + PgStat_Kind kind; + NameData name; + + if (!read_chunk_s(fpin, &kind)) + goto error; + if (!read_chunk_s(fpin, &name)) + goto error; + if (!pgstat_is_kind_valid(kind)) + goto error; + + kind_info = pgstat_get_kind_info(kind); + + if (!kind_info->from_serialized_name) + goto error; + + if (!kind_info->from_serialized_name(&name, &key)) + { + /* skip over data for entry we don't care about */ + if (fseek(fpin, pgstat_get_entry_len(kind), SEEK_CUR) != 0) + goto error; + + continue; + } + + Assert(key.kind == kind); + } + + /* + * This intentionally doesn't use pgstat_get_entry_ref() - + * putting all stats into checkpointer's + * pgStatEntryRefHash would be wasted effort and memory. + */ + p = dshash_find_or_insert(pgStatLocal.shared_hash, &key, &found); + + /* don't allow duplicate entries */ + if (found) + { + dshash_release_lock(pgStatLocal.shared_hash, p); + elog(WARNING, "found duplicate stats entry %d/%u/%u", + key.kind, key.dboid, key.objoid); + goto error; + } + + header = pgstat_init_entry(key.kind, p); + dshash_release_lock(pgStatLocal.shared_hash, p); + + if (!read_chunk(fpin, + pgstat_get_entry_data(key.kind, header), + pgstat_get_entry_len(key.kind))) + goto error; + + break; + } + case 'E': + /* check that 'E' actually signals end of file */ + if (fgetc(fpin) != EOF) + goto error; + + goto done; + + default: + goto error; + } + } + +done: + FreeFile(fpin); + + elog(DEBUG2, "removing permanent stats file \"%s\"", statfile); + unlink(statfile); + + return; + +error: + ereport(LOG, + (errmsg("corrupted statistics file \"%s\"", statfile))); + + pgstat_reset_after_failure(); + + goto done; +} + +/* + * Helper to reset / drop stats after a crash or after restoring stats from + * disk failed, potentially after already loading parts. + */ +static void +pgstat_reset_after_failure(void) +{ + TimestampTz ts = GetCurrentTimestamp(); + + /* reset fixed-numbered stats */ + for (int kind = PGSTAT_KIND_FIRST_VALID; kind <= PGSTAT_KIND_LAST; kind++) + { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + + if (!kind_info->fixed_amount) + continue; + + kind_info->reset_all_cb(ts); + } + + /* and drop variable-numbered ones */ + pgstat_drop_all_entries(); +} + +/* + * GUC assign_hook for stats_fetch_consistency. + */ +void +assign_stats_fetch_consistency(int newval, void *extra) +{ + /* + * Changing this value in a transaction may cause snapshot state + * inconsistencies, so force a clear of the current snapshot on the next + * snapshot build attempt. + */ + if (pgstat_fetch_consistency != newval) + force_stats_snapshot_clear = true; +} diff --git a/src/backend/utils/activity/pgstat_archiver.c b/src/backend/utils/activity/pgstat_archiver.c new file mode 100644 index 0000000..851726f --- /dev/null +++ b/src/backend/utils/activity/pgstat_archiver.c @@ -0,0 +1,111 @@ +/* ------------------------------------------------------------------------- + * + * pgstat_archiver.c + * Implementation of archiver statistics. + * + * This file contains the implementation of archiver statistics. It is kept + * separate from pgstat.c to enforce the line between the statistics access / + * storage implementation and the details about individual types of + * statistics. 
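+ *
+ * The archiver process reports one event per processed WAL segment; a
+ * successful archival ends up here as (sketch, cf. pgarch.c):
+ *
+ *     pgstat_report_archiver(xlog, false);
+ *
+ * with 'failed' set to true instead when an archival attempt failed.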
+ * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/activity/pgstat_archiver.c + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "utils/pgstat_internal.h" +#include "utils/timestamp.h" + + +/* + * Report archiver statistics + */ +void +pgstat_report_archiver(const char *xlog, bool failed) +{ + PgStatShared_Archiver *stats_shmem = &pgStatLocal.shmem->archiver; + TimestampTz now = GetCurrentTimestamp(); + + pgstat_begin_changecount_write(&stats_shmem->changecount); + + if (failed) + { + ++stats_shmem->stats.failed_count; + memcpy(&stats_shmem->stats.last_failed_wal, xlog, + sizeof(stats_shmem->stats.last_failed_wal)); + stats_shmem->stats.last_failed_timestamp = now; + } + else + { + ++stats_shmem->stats.archived_count; + memcpy(&stats_shmem->stats.last_archived_wal, xlog, + sizeof(stats_shmem->stats.last_archived_wal)); + stats_shmem->stats.last_archived_timestamp = now; + } + + pgstat_end_changecount_write(&stats_shmem->changecount); +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the archiver statistics struct. + */ +PgStat_ArchiverStats * +pgstat_fetch_stat_archiver(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_ARCHIVER); + + return &pgStatLocal.snapshot.archiver; +} + +void +pgstat_archiver_reset_all_cb(TimestampTz ts) +{ + PgStatShared_Archiver *stats_shmem = &pgStatLocal.shmem->archiver; + + /* see explanation above PgStatShared_Archiver for the reset protocol */ + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + pgstat_copy_changecounted_stats(&stats_shmem->reset_offset, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + stats_shmem->stats.stat_reset_timestamp = ts; + LWLockRelease(&stats_shmem->lock); +} + +void +pgstat_archiver_snapshot_cb(void) +{ + PgStatShared_Archiver *stats_shmem = &pgStatLocal.shmem->archiver; + PgStat_ArchiverStats *stat_snap = &pgStatLocal.snapshot.archiver; + PgStat_ArchiverStats *reset_offset = &stats_shmem->reset_offset; + PgStat_ArchiverStats reset; + + pgstat_copy_changecounted_stats(stat_snap, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + memcpy(&reset, reset_offset, sizeof(stats_shmem->stats)); + LWLockRelease(&stats_shmem->lock); + + /* compensate by reset offsets */ + if (stat_snap->archived_count == reset.archived_count) + { + stat_snap->last_archived_wal[0] = 0; + stat_snap->last_archived_timestamp = 0; + } + stat_snap->archived_count -= reset.archived_count; + + if (stat_snap->failed_count == reset.failed_count) + { + stat_snap->last_failed_wal[0] = 0; + stat_snap->last_failed_timestamp = 0; + } + stat_snap->failed_count -= reset.failed_count; +} diff --git a/src/backend/utils/activity/pgstat_bgwriter.c b/src/backend/utils/activity/pgstat_bgwriter.c new file mode 100644 index 0000000..fbb1edc --- /dev/null +++ b/src/backend/utils/activity/pgstat_bgwriter.c @@ -0,0 +1,110 @@ +/* ------------------------------------------------------------------------- + * + * pgstat_bgwriter.c + * Implementation of bgwriter statistics. + * + * This file contains the implementation of bgwriter statistics. It is kept + * separate from pgstat.c to enforce the line between the statistics access / + * storage implementation and the details about individual types of + * statistics. 
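+ *
+ * Like the other fixed-numbered stats, these are protected by a
+ * changecount protocol: writers bracket updates with
+ * pgstat_begin_changecount_write() / pgstat_end_changecount_write(), while
+ * readers go through pgstat_copy_changecounted_stats(), which retries its
+ * copy until the counter is unchanged, roughly (editor's sketch):
+ *
+ *     do {
+ *         before = pgstat_begin_changecount_read(cc);
+ *         memcpy(dst, src, len);
+ *     } while (!pgstat_end_changecount_read(cc, before));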
+ * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/activity/pgstat_bgwriter.c + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "utils/pgstat_internal.h" + + +PgStat_BgWriterStats PendingBgWriterStats = {0}; + + +/* + * Report bgwriter statistics + */ +void +pgstat_report_bgwriter(void) +{ + PgStatShared_BgWriter *stats_shmem = &pgStatLocal.shmem->bgwriter; + static const PgStat_BgWriterStats all_zeroes; + + Assert(!pgStatLocal.shmem->is_shutdown); + pgstat_assert_is_up(); + + /* + * This function can be called even if nothing at all has happened. In + * this case, avoid unnecessarily modifying the stats entry. + */ + if (memcmp(&PendingBgWriterStats, &all_zeroes, sizeof(all_zeroes)) == 0) + return; + + pgstat_begin_changecount_write(&stats_shmem->changecount); + +#define BGWRITER_ACC(fld) stats_shmem->stats.fld += PendingBgWriterStats.fld + BGWRITER_ACC(buf_written_clean); + BGWRITER_ACC(maxwritten_clean); + BGWRITER_ACC(buf_alloc); +#undef BGWRITER_ACC + + pgstat_end_changecount_write(&stats_shmem->changecount); + + /* + * Clear out the statistics buffer, so it can be re-used. + */ + MemSet(&PendingBgWriterStats, 0, sizeof(PendingBgWriterStats)); +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the bgwriter statistics struct. + */ +PgStat_BgWriterStats * +pgstat_fetch_stat_bgwriter(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_BGWRITER); + + return &pgStatLocal.snapshot.bgwriter; +} + +void +pgstat_bgwriter_reset_all_cb(TimestampTz ts) +{ + PgStatShared_BgWriter *stats_shmem = &pgStatLocal.shmem->bgwriter; + + /* see explanation above PgStatShared_BgWriter for the reset protocol */ + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + pgstat_copy_changecounted_stats(&stats_shmem->reset_offset, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + stats_shmem->stats.stat_reset_timestamp = ts; + LWLockRelease(&stats_shmem->lock); +} + +void +pgstat_bgwriter_snapshot_cb(void) +{ + PgStatShared_BgWriter *stats_shmem = &pgStatLocal.shmem->bgwriter; + PgStat_BgWriterStats *reset_offset = &stats_shmem->reset_offset; + PgStat_BgWriterStats reset; + + pgstat_copy_changecounted_stats(&pgStatLocal.snapshot.bgwriter, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + memcpy(&reset, reset_offset, sizeof(stats_shmem->stats)); + LWLockRelease(&stats_shmem->lock); + + /* compensate by reset offsets */ +#define BGWRITER_COMP(fld) pgStatLocal.snapshot.bgwriter.fld -= reset.fld; + BGWRITER_COMP(buf_written_clean); + BGWRITER_COMP(maxwritten_clean); + BGWRITER_COMP(buf_alloc); +#undef BGWRITER_COMP +} diff --git a/src/backend/utils/activity/pgstat_checkpointer.c b/src/backend/utils/activity/pgstat_checkpointer.c new file mode 100644 index 0000000..af8d513 --- /dev/null +++ b/src/backend/utils/activity/pgstat_checkpointer.c @@ -0,0 +1,121 @@ +/* ------------------------------------------------------------------------- + * + * pgstat_checkpointer.c + * Implementation of checkpoint statistics. + * + * This file contains the implementation of checkpoint statistics. It is kept + * separate from pgstat.c to enforce the line between the statistics access / + * storage implementation and the details about individual types of + * statistics. 
+ * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/activity/pgstat_checkpointer.c + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "utils/pgstat_internal.h" + + +PgStat_CheckpointerStats PendingCheckpointerStats = {0}; + + +/* + * Report checkpointer statistics + */ +void +pgstat_report_checkpointer(void) +{ + /* We assume this initializes to zeroes */ + static const PgStat_CheckpointerStats all_zeroes; + PgStatShared_Checkpointer *stats_shmem = &pgStatLocal.shmem->checkpointer; + + Assert(!pgStatLocal.shmem->is_shutdown); + pgstat_assert_is_up(); + + /* + * This function can be called even if nothing at all has happened. In + * this case, avoid unnecessarily modifying the stats entry. + */ + if (memcmp(&PendingCheckpointerStats, &all_zeroes, + sizeof(all_zeroes)) == 0) + return; + + pgstat_begin_changecount_write(&stats_shmem->changecount); + +#define CHECKPOINTER_ACC(fld) stats_shmem->stats.fld += PendingCheckpointerStats.fld + CHECKPOINTER_ACC(timed_checkpoints); + CHECKPOINTER_ACC(requested_checkpoints); + CHECKPOINTER_ACC(checkpoint_write_time); + CHECKPOINTER_ACC(checkpoint_sync_time); + CHECKPOINTER_ACC(buf_written_checkpoints); + CHECKPOINTER_ACC(buf_written_backend); + CHECKPOINTER_ACC(buf_fsync_backend); +#undef CHECKPOINTER_ACC + + pgstat_end_changecount_write(&stats_shmem->changecount); + + /* + * Clear out the statistics buffer, so it can be re-used. + */ + MemSet(&PendingCheckpointerStats, 0, sizeof(PendingCheckpointerStats)); +} + +/* + * pgstat_fetch_stat_checkpointer() - + * + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the checkpointer statistics struct. + */ +PgStat_CheckpointerStats * +pgstat_fetch_stat_checkpointer(void) +{ + pgstat_snapshot_fixed(PGSTAT_KIND_CHECKPOINTER); + + return &pgStatLocal.snapshot.checkpointer; +} + +void +pgstat_checkpointer_reset_all_cb(TimestampTz ts) +{ + PgStatShared_Checkpointer *stats_shmem = &pgStatLocal.shmem->checkpointer; + + /* see explanation above PgStatShared_Checkpointer for the reset protocol */ + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + pgstat_copy_changecounted_stats(&stats_shmem->reset_offset, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + LWLockRelease(&stats_shmem->lock); +} + +void +pgstat_checkpointer_snapshot_cb(void) +{ + PgStatShared_Checkpointer *stats_shmem = &pgStatLocal.shmem->checkpointer; + PgStat_CheckpointerStats *reset_offset = &stats_shmem->reset_offset; + PgStat_CheckpointerStats reset; + + pgstat_copy_changecounted_stats(&pgStatLocal.snapshot.checkpointer, + &stats_shmem->stats, + sizeof(stats_shmem->stats), + &stats_shmem->changecount); + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + memcpy(&reset, reset_offset, sizeof(stats_shmem->stats)); + LWLockRelease(&stats_shmem->lock); + + /* compensate by reset offsets */ +#define CHECKPOINTER_COMP(fld) pgStatLocal.snapshot.checkpointer.fld -= reset.fld; + CHECKPOINTER_COMP(timed_checkpoints); + CHECKPOINTER_COMP(requested_checkpoints); + CHECKPOINTER_COMP(checkpoint_write_time); + CHECKPOINTER_COMP(checkpoint_sync_time); + CHECKPOINTER_COMP(buf_written_checkpoints); + CHECKPOINTER_COMP(buf_written_backend); + CHECKPOINTER_COMP(buf_fsync_backend); +#undef CHECKPOINTER_COMP +} diff --git a/src/backend/utils/activity/pgstat_database.c b/src/backend/utils/activity/pgstat_database.c new file mode 100644 index 0000000..4235fa0 --- /dev/null +++ 
b/src/backend/utils/activity/pgstat_database.c @@ -0,0 +1,437 @@ +/* ------------------------------------------------------------------------- + * + * pgstat_database.c + * Implementation of database statistics. + * + * This file contains the implementation of database statistics. It is kept + * separate from pgstat.c to enforce the line between the statistics access / + * storage implementation and the details about individual types of + * statistics. + * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/activity/pgstat_database.c + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "utils/pgstat_internal.h" +#include "utils/timestamp.h" +#include "storage/procsignal.h" + + +static bool pgstat_should_report_connstat(void); + + +PgStat_Counter pgStatBlockReadTime = 0; +PgStat_Counter pgStatBlockWriteTime = 0; +PgStat_Counter pgStatActiveTime = 0; +PgStat_Counter pgStatTransactionIdleTime = 0; +SessionEndType pgStatSessionEndCause = DISCONNECT_NORMAL; + + +static int pgStatXactCommit = 0; +static int pgStatXactRollback = 0; +static PgStat_Counter pgLastSessionReportTime = 0; + + +/* + * Remove entry for the database being dropped. + */ +void +pgstat_drop_database(Oid databaseid) +{ + pgstat_drop_transactional(PGSTAT_KIND_DATABASE, databaseid, InvalidOid); +} + +/* + * Called from autovacuum.c to report startup of an autovacuum process. + * We are called before InitPostgres is done, so can't rely on MyDatabaseId; + * the db OID must be passed in, instead. + */ +void +pgstat_report_autovac(Oid dboid) +{ + PgStat_EntryRef *entry_ref; + PgStatShared_Database *dbentry; + + /* can't get here in single user mode */ + Assert(IsUnderPostmaster); + + /* + * End-of-vacuum is reported instantly. Report the start the same way for + * consistency. Vacuum doesn't run frequently and is a long-lasting + * operation so it doesn't matter if we get blocked here a little. + */ + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, + dboid, InvalidOid, false); + + dbentry = (PgStatShared_Database *) entry_ref->shared_stats; + dbentry->stats.last_autovac_time = GetCurrentTimestamp(); + + pgstat_unlock_entry(entry_ref); +} + +/* + * Report a Hot Standby recovery conflict. + */ +void +pgstat_report_recovery_conflict(int reason) +{ + PgStat_StatDBEntry *dbentry; + + Assert(IsUnderPostmaster); + if (!pgstat_track_counts) + return; + + dbentry = pgstat_prep_database_pending(MyDatabaseId); + + switch (reason) + { + case PROCSIG_RECOVERY_CONFLICT_DATABASE: + + /* + * Since we drop the information about the database as soon as it + * replicates, there is no point in counting these conflicts. + */ + break; + case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: + dbentry->n_conflict_tablespace++; + break; + case PROCSIG_RECOVERY_CONFLICT_LOCK: + dbentry->n_conflict_lock++; + break; + case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: + dbentry->n_conflict_snapshot++; + break; + case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: + dbentry->n_conflict_bufferpin++; + break; + case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: + dbentry->n_conflict_startup_deadlock++; + break; + } +} + +/* + * Report a detected deadlock. + */ +void +pgstat_report_deadlock(void) +{ + PgStat_StatDBEntry *dbent; + + if (!pgstat_track_counts) + return; + + dbent = pgstat_prep_database_pending(MyDatabaseId); + dbent->n_deadlocks++; +} + +/* + * Report one or more checksum failures. 
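+ *
+ * Callers that detect a single failure in the current database can use the
+ * pgstat_report_checksum_failure() wrapper below, which is equivalent to:
+ *
+ *     pgstat_report_checksum_failures_in_db(MyDatabaseId, 1);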
+ */ +void +pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount) +{ + PgStat_EntryRef *entry_ref; + PgStatShared_Database *sharedent; + + if (!pgstat_track_counts) + return; + + /* + * Update the shared stats directly - checksum failures should never be + * common enough for that to be a problem. + */ + entry_ref = + pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, dboid, InvalidOid, false); + + sharedent = (PgStatShared_Database *) entry_ref->shared_stats; + sharedent->stats.n_checksum_failures += failurecount; + sharedent->stats.last_checksum_failure = GetCurrentTimestamp(); + + pgstat_unlock_entry(entry_ref); +} + +/* + * Report one checksum failure in the current database. + */ +void +pgstat_report_checksum_failure(void) +{ + pgstat_report_checksum_failures_in_db(MyDatabaseId, 1); +} + +/* + * Report creation of temporary file. + */ +void +pgstat_report_tempfile(size_t filesize) +{ + PgStat_StatDBEntry *dbent; + + if (!pgstat_track_counts) + return; + + dbent = pgstat_prep_database_pending(MyDatabaseId); + dbent->n_temp_bytes += filesize; + dbent->n_temp_files++; +} + +/* + * Notify stats system of a new connection. + */ +void +pgstat_report_connect(Oid dboid) +{ + PgStat_StatDBEntry *dbentry; + + if (!pgstat_should_report_connstat()) + return; + + pgLastSessionReportTime = MyStartTimestamp; + + dbentry = pgstat_prep_database_pending(MyDatabaseId); + dbentry->n_sessions++; +} + +/* + * Notify the stats system of a disconnect. + */ +void +pgstat_report_disconnect(Oid dboid) +{ + PgStat_StatDBEntry *dbentry; + + if (!pgstat_should_report_connstat()) + return; + + dbentry = pgstat_prep_database_pending(MyDatabaseId); + + switch (pgStatSessionEndCause) + { + case DISCONNECT_NOT_YET: + case DISCONNECT_NORMAL: + /* we don't collect these */ + break; + case DISCONNECT_CLIENT_EOF: + dbentry->n_sessions_abandoned++; + break; + case DISCONNECT_FATAL: + dbentry->n_sessions_fatal++; + break; + case DISCONNECT_KILLED: + dbentry->n_sessions_killed++; + break; + } +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one database or NULL. NULL doesn't mean + * that the database doesn't exist, just that there are no statistics, so the + * caller is better off to report ZERO instead. + */ +PgStat_StatDBEntry * +pgstat_fetch_stat_dbentry(Oid dboid) +{ + return (PgStat_StatDBEntry *) + pgstat_fetch_entry(PGSTAT_KIND_DATABASE, dboid, InvalidOid); +} + +void +AtEOXact_PgStat_Database(bool isCommit, bool parallel) +{ + /* Don't count parallel worker transaction stats */ + if (!parallel) + { + /* + * Count transaction commit or abort. (We use counters, not just + * bools, in case the reporting message isn't sent right away.) + */ + if (isCommit) + pgStatXactCommit++; + else + pgStatXactRollback++; + } +} + +/* + * Subroutine for pgstat_report_stat(): Handle xact commit/rollback and I/O + * timings. + */ +void +pgstat_update_dbstats(TimestampTz ts) +{ + PgStat_StatDBEntry *dbentry; + + /* + * If not connected to a database yet, don't attribute time to "shared + * state" (InvalidOid is used to track stats for shared relations, etc.). + */ + if (!OidIsValid(MyDatabaseId)) + return; + + dbentry = pgstat_prep_database_pending(MyDatabaseId); + + /* + * Accumulate xact commit/rollback and I/O timings to stats entry of the + * current database. 
+ */ + dbentry->n_xact_commit += pgStatXactCommit; + dbentry->n_xact_rollback += pgStatXactRollback; + dbentry->n_block_read_time += pgStatBlockReadTime; + dbentry->n_block_write_time += pgStatBlockWriteTime; + + if (pgstat_should_report_connstat()) + { + long secs; + int usecs; + + /* + * pgLastSessionReportTime is initialized to MyStartTimestamp by + * pgstat_report_connect(). + */ + TimestampDifference(pgLastSessionReportTime, ts, &secs, &usecs); + pgLastSessionReportTime = ts; + dbentry->total_session_time += (PgStat_Counter) secs * 1000000 + usecs; + dbentry->total_active_time += pgStatActiveTime; + dbentry->total_idle_in_xact_time += pgStatTransactionIdleTime; + } + + pgStatXactCommit = 0; + pgStatXactRollback = 0; + pgStatBlockReadTime = 0; + pgStatBlockWriteTime = 0; + pgStatActiveTime = 0; + pgStatTransactionIdleTime = 0; +} + +/* + * We report session statistics only for normal backend processes. Parallel + * workers run in parallel, so they don't contribute to session times, even + * though they use CPU time. Walsender processes could be considered here, + * but they have different session characteristics from normal backends (for + * example, they are always "active"), so they would skew session statistics. + */ +static bool +pgstat_should_report_connstat(void) +{ + return MyBackendType == B_BACKEND; +} + +/* + * Find or create a local PgStat_StatDBEntry entry for dboid. + */ +PgStat_StatDBEntry * +pgstat_prep_database_pending(Oid dboid) +{ + PgStat_EntryRef *entry_ref; + + /* + * This should not report stats on database objects before having + * connected to a database. + */ + Assert(!OidIsValid(dboid) || OidIsValid(MyDatabaseId)); + + entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_DATABASE, dboid, InvalidOid, + NULL); + + return entry_ref->pending; +} + +/* + * Reset the database's reset timestamp, without resetting the contents of the + * database stats. + */ +void +pgstat_reset_database_timestamp(Oid dboid, TimestampTz ts) +{ + PgStat_EntryRef *dbref; + PgStatShared_Database *dbentry; + + dbref = pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, MyDatabaseId, InvalidOid, + false); + + dbentry = (PgStatShared_Database *) dbref->shared_stats; + dbentry->stats.stat_reset_timestamp = ts; + + pgstat_unlock_entry(dbref); +} + +/* + * Flush out pending stats for the entry + * + * If nowait is true, this function returns false if the lock could not be + * acquired immediately; otherwise true is returned. 
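+ *
+ * All flush_pending_cb implementations share this shape (editor's sketch;
+ * compare pgstat_function_flush_cb() in pgstat_function.c):
+ *
+ *     if (!pgstat_lock_entry(entry_ref, nowait))
+ *         return false;    -- lock contended, caller retries later
+ *     ... accumulate entry_ref->pending into entry_ref->shared_stats ...
+ *     pgstat_unlock_entry(entry_ref);
+ *     ... zero out the pending entry ...
+ *     return true;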
+ */ +bool +pgstat_database_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) +{ + PgStatShared_Database *sharedent; + PgStat_StatDBEntry *pendingent; + + pendingent = (PgStat_StatDBEntry *) entry_ref->pending; + sharedent = (PgStatShared_Database *) entry_ref->shared_stats; + + if (!pgstat_lock_entry(entry_ref, nowait)) + return false; + +#define PGSTAT_ACCUM_DBCOUNT(item) \ + (sharedent)->stats.item += (pendingent)->item + + PGSTAT_ACCUM_DBCOUNT(n_xact_commit); + PGSTAT_ACCUM_DBCOUNT(n_xact_rollback); + PGSTAT_ACCUM_DBCOUNT(n_blocks_fetched); + PGSTAT_ACCUM_DBCOUNT(n_blocks_hit); + + PGSTAT_ACCUM_DBCOUNT(n_tuples_returned); + PGSTAT_ACCUM_DBCOUNT(n_tuples_fetched); + PGSTAT_ACCUM_DBCOUNT(n_tuples_inserted); + PGSTAT_ACCUM_DBCOUNT(n_tuples_updated); + PGSTAT_ACCUM_DBCOUNT(n_tuples_deleted); + + /* last_autovac_time is reported immediately */ + Assert(pendingent->last_autovac_time == 0); + + PGSTAT_ACCUM_DBCOUNT(n_conflict_tablespace); + PGSTAT_ACCUM_DBCOUNT(n_conflict_lock); + PGSTAT_ACCUM_DBCOUNT(n_conflict_snapshot); + PGSTAT_ACCUM_DBCOUNT(n_conflict_bufferpin); + PGSTAT_ACCUM_DBCOUNT(n_conflict_startup_deadlock); + + PGSTAT_ACCUM_DBCOUNT(n_temp_bytes); + PGSTAT_ACCUM_DBCOUNT(n_temp_files); + PGSTAT_ACCUM_DBCOUNT(n_deadlocks); + + /* checksum failures are reported immediately */ + Assert(pendingent->n_checksum_failures == 0); + Assert(pendingent->last_checksum_failure == 0); + + PGSTAT_ACCUM_DBCOUNT(n_block_read_time); + PGSTAT_ACCUM_DBCOUNT(n_block_write_time); + + PGSTAT_ACCUM_DBCOUNT(n_sessions); + PGSTAT_ACCUM_DBCOUNT(total_session_time); + PGSTAT_ACCUM_DBCOUNT(total_active_time); + PGSTAT_ACCUM_DBCOUNT(total_idle_in_xact_time); + PGSTAT_ACCUM_DBCOUNT(n_sessions_abandoned); + PGSTAT_ACCUM_DBCOUNT(n_sessions_fatal); + PGSTAT_ACCUM_DBCOUNT(n_sessions_killed); +#undef PGSTAT_ACCUM_DBCOUNT + + pgstat_unlock_entry(entry_ref); + + memset(pendingent, 0, sizeof(*pendingent)); + + return true; +} + +void +pgstat_database_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts) +{ + ((PgStatShared_Database *) header)->stats.stat_reset_timestamp = ts; +} diff --git a/src/backend/utils/activity/pgstat_function.c b/src/backend/utils/activity/pgstat_function.c new file mode 100644 index 0000000..427d8c4 --- /dev/null +++ b/src/backend/utils/activity/pgstat_function.c @@ -0,0 +1,243 @@ +/* ------------------------------------------------------------------------- + * + * pgstat_function.c + * Implementation of function statistics. + * + * This file contains the implementation of function statistics. It is kept + * separate from pgstat.c to enforce the line between the statistics access / + * storage implementation and the details about individual types of + * statistics. + * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/activity/pgstat_function.c + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" +#include "utils/inval.h" +#include "utils/pgstat_internal.h" +#include "utils/syscache.h" + + +/* ---------- + * GUC parameters + * ---------- + */ +int pgstat_track_functions = TRACK_FUNC_OFF; + + +/* + * Total time charged to functions so far in the current backend. + * We use this to help separate "self" and "other" time charges. + * (We assume this initializes to zero.) + */ +static instr_time total_func_time; + + +/* + * Ensure that stats are dropped if transaction aborts. 
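+ *
+ * Sketch of the DDL pairing (hedged; the exact call sites live in the
+ * function DDL code):
+ *
+ *     pgstat_create_function(proid);   -- at CREATE FUNCTION; undone on abort
+ *     pgstat_drop_function(proid);     -- at DROP FUNCTION; applied at commit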
+ */ +void +pgstat_create_function(Oid proid) +{ + pgstat_create_transactional(PGSTAT_KIND_FUNCTION, + MyDatabaseId, + proid); +} + +/* + * Ensure that stats are dropped if transaction commits. + * + * NB: This is only reliable because pgstat_init_function_usage() does some + * extra work. If other places start emitting function stats they likely need + * similar logic. + */ +void +pgstat_drop_function(Oid proid) +{ + pgstat_drop_transactional(PGSTAT_KIND_FUNCTION, + MyDatabaseId, + proid); +} + +/* + * Initialize function call usage data. + * Called by the executor before invoking a function. + */ +void +pgstat_init_function_usage(FunctionCallInfo fcinfo, + PgStat_FunctionCallUsage *fcu) +{ + PgStat_EntryRef *entry_ref; + PgStat_BackendFunctionEntry *pending; + bool created_entry; + + if (pgstat_track_functions <= fcinfo->flinfo->fn_stats) + { + /* stats not wanted */ + fcu->fs = NULL; + return; + } + + entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_FUNCTION, + MyDatabaseId, + fcinfo->flinfo->fn_oid, + &created_entry); + + /* + * If no shared entry already exists, check if the function has been + * deleted concurrently. This can go unnoticed until here because + * executing a statement that just calls a function does not trigger + * cache invalidation processing. The reason we care about this case is + * that otherwise we could create a new stats entry for an already dropped + * function (for relations etc. this is not possible because emitting stats + * requires that a lock on the relation has already been acquired). + * + * It's somewhat ugly to have a behavioral difference based on + * track_functions being enabled/disabled. But it seems acceptable, given + * that there are already behavioral differences depending on whether the + * function is in the caches etc. + * + * For correctness it'd be sufficient to set ->dropped to true. However, + * the accepted invalidation will commonly cause "low level" failures in + * PL code, with an OID in the error message, making this harder to + * test... + */ + if (created_entry) + { + AcceptInvalidationMessages(); + if (!SearchSysCacheExists1(PROCOID, ObjectIdGetDatum(fcinfo->flinfo->fn_oid))) + { + pgstat_drop_entry(PGSTAT_KIND_FUNCTION, MyDatabaseId, + fcinfo->flinfo->fn_oid); + ereport(ERROR, errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("function call to dropped function")); + } + } + + pending = entry_ref->pending; + + fcu->fs = &pending->f_counts; + + /* save stats for this function, later used to compensate for recursion */ + fcu->save_f_total_time = pending->f_counts.f_total_time; + + /* save current backend-wide total time */ + fcu->save_total = total_func_time; + + /* get clock time as of function start */ + INSTR_TIME_SET_CURRENT(fcu->f_start); +} + +/* + * Calculate function call usage and update stat counters. + * Called by the executor after invoking a function. + * + * In the case of a set-returning function that runs in value-per-call mode, + * we will see multiple pgstat_init_function_usage/pgstat_end_function_usage + * calls for what the user considers a single call of the function. The + * finalize flag should be TRUE on the last call. + */ +void +pgstat_end_function_usage(PgStat_FunctionCallUsage *fcu, bool finalize) +{ + PgStat_FunctionCounts *fs = fcu->fs; + instr_time f_total; + instr_time f_others; + instr_time f_self; + + /* stats not wanted? 
*/ + if (fs == NULL) + return; + + /* total elapsed time in this function call */ + INSTR_TIME_SET_CURRENT(f_total); + INSTR_TIME_SUBTRACT(f_total, fcu->f_start); + + /* self usage: elapsed minus anything already charged to other calls */ + f_others = total_func_time; + INSTR_TIME_SUBTRACT(f_others, fcu->save_total); + f_self = f_total; + INSTR_TIME_SUBTRACT(f_self, f_others); + + /* update backend-wide total time */ + INSTR_TIME_ADD(total_func_time, f_self); + + /* + * Compute the new f_total_time as the total elapsed time added to the + * pre-call value of f_total_time. This is necessary to avoid + * double-counting any time taken by recursive calls of myself. (We do + * not need any similar kluge for self time, since that already excludes + * any recursive calls.) + */ + INSTR_TIME_ADD(f_total, fcu->save_f_total_time); + + /* update counters in function stats table */ + if (finalize) + fs->f_numcalls++; + fs->f_total_time = f_total; + INSTR_TIME_ADD(fs->f_self_time, f_self); +} + +/* + * Flush out pending stats for the entry + * + * If nowait is true, this function returns false if the lock could not be + * acquired immediately; otherwise true is returned. + */ +bool +pgstat_function_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) +{ + PgStat_BackendFunctionEntry *localent; + PgStatShared_Function *shfuncent; + + localent = (PgStat_BackendFunctionEntry *) entry_ref->pending; + shfuncent = (PgStatShared_Function *) entry_ref->shared_stats; + + /* localent always has non-zero content */ + + if (!pgstat_lock_entry(entry_ref, nowait)) + return false; + + shfuncent->stats.f_numcalls += localent->f_counts.f_numcalls; + shfuncent->stats.f_total_time += + INSTR_TIME_GET_MICROSEC(localent->f_counts.f_total_time); + shfuncent->stats.f_self_time += + INSTR_TIME_GET_MICROSEC(localent->f_counts.f_self_time); + + pgstat_unlock_entry(entry_ref); + + return true; +} + +/* + * Find any existing PgStat_BackendFunctionEntry for the specified function. + * + * If there's no entry, return NULL; don't create a new one. + */ +PgStat_BackendFunctionEntry * +find_funcstat_entry(Oid func_id) +{ + PgStat_EntryRef *entry_ref; + + entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_FUNCTION, MyDatabaseId, func_id); + + if (entry_ref) + return entry_ref->pending; + return NULL; +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one function or NULL. + */ +PgStat_StatFuncEntry * +pgstat_fetch_stat_funcentry(Oid func_id) +{ + return (PgStat_StatFuncEntry *) + pgstat_fetch_entry(PGSTAT_KIND_FUNCTION, MyDatabaseId, func_id); +} diff --git a/src/backend/utils/activity/pgstat_relation.c b/src/backend/utils/activity/pgstat_relation.c new file mode 100644 index 0000000..a846d9f --- /dev/null +++ b/src/backend/utils/activity/pgstat_relation.c @@ -0,0 +1,938 @@ +/* ------------------------------------------------------------------------- + * + * pgstat_relation.c + * Implementation of relation statistics. + * + * This file contains the implementation of relation statistics. It is kept + * separate from pgstat.c to enforce the line between the statistics access / + * storage implementation and the details about individual types of + * statistics. 
+ * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/activity/pgstat_relation.c + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/twophase_rmgr.h" +#include "access/xact.h" +#include "catalog/partition.h" +#include "postmaster/autovacuum.h" +#include "utils/memutils.h" +#include "utils/pgstat_internal.h" +#include "utils/rel.h" +#include "utils/timestamp.h" + + +/* Record that's written to 2PC state file when pgstat state is persisted */ +typedef struct TwoPhasePgStatRecord +{ + PgStat_Counter tuples_inserted; /* tuples inserted in xact */ + PgStat_Counter tuples_updated; /* tuples updated in xact */ + PgStat_Counter tuples_deleted; /* tuples deleted in xact */ + /* tuples i/u/d prior to truncate/drop */ + PgStat_Counter inserted_pre_truncdrop; + PgStat_Counter updated_pre_truncdrop; + PgStat_Counter deleted_pre_truncdrop; + Oid t_id; /* table's OID */ + bool t_shared; /* is it a shared catalog? */ + bool t_truncdropped; /* was the relation truncated/dropped? */ +} TwoPhasePgStatRecord; + + +static PgStat_TableStatus *pgstat_prep_relation_pending(Oid rel_id, bool isshared); +static void add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level); +static void ensure_tabstat_xact_level(PgStat_TableStatus *pgstat_info); +static void save_truncdrop_counters(PgStat_TableXactStatus *trans, bool is_drop); +static void restore_truncdrop_counters(PgStat_TableXactStatus *trans); + + +/* + * Copy stats between relations. This is used for things like REINDEX + * CONCURRENTLY. + */ +void +pgstat_copy_relation_stats(Relation dst, Relation src) +{ + PgStat_StatTabEntry *srcstats; + PgStatShared_Relation *dstshstats; + PgStat_EntryRef *dst_ref; + + srcstats = pgstat_fetch_stat_tabentry_ext(src->rd_rel->relisshared, + RelationGetRelid(src)); + if (!srcstats) + return; + + dst_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, + dst->rd_rel->relisshared ? InvalidOid : MyDatabaseId, + RelationGetRelid(dst), + false); + + dstshstats = (PgStatShared_Relation *) dst_ref->shared_stats; + dstshstats->stats = *srcstats; + + pgstat_unlock_entry(dst_ref); +} + +/* + * Initialize a relcache entry to count access statistics. Called whenever a + * relation is opened. + * + * We assume that a relcache entry's pgstat_info field is zeroed by relcache.c + * when the relcache entry is made; thereafter it is long-lived data. + * + * This does not create a reference to a stats entry in shared memory, nor + * allocate memory for the pending stats. That happens in + * pgstat_assoc_relation(). + */ +void +pgstat_init_relation(Relation rel) +{ + char relkind = rel->rd_rel->relkind; + + /* + * We only count stats for relations with storage and partitioned tables + */ + if (!RELKIND_HAS_STORAGE(relkind) && relkind != RELKIND_PARTITIONED_TABLE) + { + rel->pgstat_enabled = false; + rel->pgstat_info = NULL; + return; + } + + if (!pgstat_track_counts) + { + if (rel->pgstat_info) + pgstat_unlink_relation(rel); + + /* We're not counting at all */ + rel->pgstat_enabled = false; + rel->pgstat_info = NULL; + return; + } + + rel->pgstat_enabled = true; +} + +/* + * Prepare for statistics for this relation to be collected. + * + * This ensures we have a reference to the stats entry before stats can be + * generated. 
That is important because a relation drop in another connection
+ * could otherwise lead to the stats entry being dropped, which then later
+ * would get recreated when flushing stats.
+ *
+ * This is separate from pgstat_init_relation() as it is not uncommon for
+ * relcache entries to be opened without ever getting stats reported.
+ */
+void
+pgstat_assoc_relation(Relation rel)
+{
+ Assert(rel->pgstat_enabled);
+ Assert(rel->pgstat_info == NULL);
+
+ /* Find or make the PgStat_TableStatus entry, and update the link */
+ rel->pgstat_info = pgstat_prep_relation_pending(RelationGetRelid(rel),
+ rel->rd_rel->relisshared);
+
+ /* don't allow linking a stats entry to multiple relcache entries */
+ Assert(rel->pgstat_info->relation == NULL);
+
+ /* mark this relation as the owner */
+ rel->pgstat_info->relation = rel;
+}
+
+/*
+ * Break the mutual link between a relcache entry and pending stats entry.
+ * This must be called whenever one end of the link is removed.
+ */
+void
+pgstat_unlink_relation(Relation rel)
+{
+ /* remove the link to stats info if any */
+ if (rel->pgstat_info == NULL)
+ return;
+
+ /* link sanity check */
+ Assert(rel->pgstat_info->relation == rel);
+ rel->pgstat_info->relation = NULL;
+ rel->pgstat_info = NULL;
+}
+
+/*
+ * Ensure that stats are dropped if transaction aborts.
+ */
+void
+pgstat_create_relation(Relation rel)
+{
+ pgstat_create_transactional(PGSTAT_KIND_RELATION,
+ rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId,
+ RelationGetRelid(rel));
+}
+
+/*
+ * Ensure that stats are dropped if transaction commits.
+ */
+void
+pgstat_drop_relation(Relation rel)
+{
+ int nest_level = GetCurrentTransactionNestLevel();
+ PgStat_TableStatus *pgstat_info;
+
+ pgstat_drop_transactional(PGSTAT_KIND_RELATION,
+ rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId,
+ RelationGetRelid(rel));
+
+ if (!pgstat_should_count_relation(rel))
+ return;
+
+ /*
+ * Transactionally set counters to 0. That ensures that accesses to
+ * pg_stat_xact_all_tables inside the transaction show 0.
+ */
+ pgstat_info = rel->pgstat_info;
+ if (pgstat_info->trans &&
+ pgstat_info->trans->nest_level == nest_level)
+ {
+ save_truncdrop_counters(pgstat_info->trans, true);
+ pgstat_info->trans->tuples_inserted = 0;
+ pgstat_info->trans->tuples_updated = 0;
+ pgstat_info->trans->tuples_deleted = 0;
+ }
+}
+
+/*
+ * Report that the table was just vacuumed.
+ */
+void
+pgstat_report_vacuum(Oid tableoid, bool shared,
+ PgStat_Counter livetuples, PgStat_Counter deadtuples)
+{
+ PgStat_EntryRef *entry_ref;
+ PgStatShared_Relation *shtabentry;
+ PgStat_StatTabEntry *tabentry;
+ Oid dboid = (shared ? InvalidOid : MyDatabaseId);
+ TimestampTz ts;
+
+ if (!pgstat_track_counts)
+ return;
+
+ /* Store the data in the table's hash table entry. */
+ ts = GetCurrentTimestamp();
+
+ /* block acquiring lock for the same reason as pgstat_report_autovac() */
+ entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION,
+ dboid, tableoid, false);
+
+ shtabentry = (PgStatShared_Relation *) entry_ref->shared_stats;
+ tabentry = &shtabentry->stats;
+
+ tabentry->n_live_tuples = livetuples;
+ tabentry->n_dead_tuples = deadtuples;
+
+ /*
+ * It is quite possible that a non-aggressive VACUUM ended up skipping
+ * various pages; however, we'll zero the insert counter here regardless.
+ * It's currently used only to track when we need to perform an "insert"
+ * autovacuum, which is mainly intended to freeze newly inserted tuples.
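+ * (For reference, from autovacuum rather than this file: an "insert"
+ * autovacuum is triggered roughly when inserts_since_vacuum exceeds
+ * autovacuum_vacuum_insert_threshold +
+ * autovacuum_vacuum_insert_scale_factor * reltuples.)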
+ * Zeroing this may just mean we'll not try to vacuum the table again + * until enough tuples have been inserted to trigger another insert + * autovacuum. An anti-wraparound autovacuum will catch any persistent + * stragglers. + */ + tabentry->inserts_since_vacuum = 0; + + if (IsAutoVacuumWorkerProcess()) + { + tabentry->autovac_vacuum_timestamp = ts; + tabentry->autovac_vacuum_count++; + } + else + { + tabentry->vacuum_timestamp = ts; + tabentry->vacuum_count++; + } + + pgstat_unlock_entry(entry_ref); +} + +/* + * Report that the table was just analyzed. + * + * Caller must provide new live- and dead-tuples estimates, as well as a + * flag indicating whether to reset the changes_since_analyze counter. + */ +void +pgstat_report_analyze(Relation rel, + PgStat_Counter livetuples, PgStat_Counter deadtuples, + bool resetcounter) +{ + PgStat_EntryRef *entry_ref; + PgStatShared_Relation *shtabentry; + PgStat_StatTabEntry *tabentry; + Oid dboid = (rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId); + + if (!pgstat_track_counts) + return; + + /* + * Unlike VACUUM, ANALYZE might be running inside a transaction that has + * already inserted and/or deleted rows in the target table. ANALYZE will + * have counted such rows as live or dead respectively. Because we will + * report our counts of such rows at transaction end, we should subtract + * off these counts from the update we're making now, else they'll be + * double-counted after commit. (This approach also ensures that the + * shared stats entry ends up with the right numbers if we abort instead + * of committing.) + * + * Waste no time on partitioned tables, though. + */ + if (pgstat_should_count_relation(rel) && + rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + { + PgStat_TableXactStatus *trans; + + for (trans = rel->pgstat_info->trans; trans; trans = trans->upper) + { + livetuples -= trans->tuples_inserted - trans->tuples_deleted; + deadtuples -= trans->tuples_updated + trans->tuples_deleted; + } + /* count stuff inserted by already-aborted subxacts, too */ + deadtuples -= rel->pgstat_info->t_counts.t_delta_dead_tuples; + /* Since ANALYZE's counts are estimates, we could have underflowed */ + livetuples = Max(livetuples, 0); + deadtuples = Max(deadtuples, 0); + } + + /* block acquiring lock for the same reason as pgstat_report_autovac() */ + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, dboid, + RelationGetRelid(rel), + false); + /* can't get dropped while accessed */ + Assert(entry_ref != NULL && entry_ref->shared_stats != NULL); + + shtabentry = (PgStatShared_Relation *) entry_ref->shared_stats; + tabentry = &shtabentry->stats; + + tabentry->n_live_tuples = livetuples; + tabentry->n_dead_tuples = deadtuples; + + /* + * If commanded, reset changes_since_analyze to zero. This forgets any + * changes that were committed while the ANALYZE was in progress, but we + * have no good way to estimate how many of those there were. 
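+ * (For reference, from autovacuum rather than this file:
+ * changes_since_analyze is compared against autovacuum_analyze_threshold +
+ * autovacuum_analyze_scale_factor * reltuples when deciding whether to
+ * launch an ANALYZE.)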
+ */ + if (resetcounter) + tabentry->changes_since_analyze = 0; + + if (IsAutoVacuumWorkerProcess()) + { + tabentry->autovac_analyze_timestamp = GetCurrentTimestamp(); + tabentry->autovac_analyze_count++; + } + else + { + tabentry->analyze_timestamp = GetCurrentTimestamp(); + tabentry->analyze_count++; + } + + pgstat_unlock_entry(entry_ref); +} + +/* + * count a tuple insertion of n tuples + */ +void +pgstat_count_heap_insert(Relation rel, PgStat_Counter n) +{ + if (pgstat_should_count_relation(rel)) + { + PgStat_TableStatus *pgstat_info = rel->pgstat_info; + + ensure_tabstat_xact_level(pgstat_info); + pgstat_info->trans->tuples_inserted += n; + } +} + +/* + * count a tuple update + */ +void +pgstat_count_heap_update(Relation rel, bool hot) +{ + if (pgstat_should_count_relation(rel)) + { + PgStat_TableStatus *pgstat_info = rel->pgstat_info; + + ensure_tabstat_xact_level(pgstat_info); + pgstat_info->trans->tuples_updated++; + + /* t_tuples_hot_updated is nontransactional, so just advance it */ + if (hot) + pgstat_info->t_counts.t_tuples_hot_updated++; + } +} + +/* + * count a tuple deletion + */ +void +pgstat_count_heap_delete(Relation rel) +{ + if (pgstat_should_count_relation(rel)) + { + PgStat_TableStatus *pgstat_info = rel->pgstat_info; + + ensure_tabstat_xact_level(pgstat_info); + pgstat_info->trans->tuples_deleted++; + } +} + +/* + * update tuple counters due to truncate + */ +void +pgstat_count_truncate(Relation rel) +{ + if (pgstat_should_count_relation(rel)) + { + PgStat_TableStatus *pgstat_info = rel->pgstat_info; + + ensure_tabstat_xact_level(pgstat_info); + save_truncdrop_counters(pgstat_info->trans, false); + pgstat_info->trans->tuples_inserted = 0; + pgstat_info->trans->tuples_updated = 0; + pgstat_info->trans->tuples_deleted = 0; + } +} + +/* + * update dead-tuples count + * + * The semantics of this are that we are reporting the nontransactional + * recovery of "delta" dead tuples; so t_delta_dead_tuples decreases + * rather than increasing, and the change goes straight into the per-table + * counter, not into transactional state. + */ +void +pgstat_update_heap_dead_tuples(Relation rel, int delta) +{ + if (pgstat_should_count_relation(rel)) + { + PgStat_TableStatus *pgstat_info = rel->pgstat_info; + + pgstat_info->t_counts.t_delta_dead_tuples -= delta; + } +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * the collected statistics for one table or NULL. NULL doesn't mean + * that the table doesn't exist, just that there are no statistics, so the + * caller is better off to report ZERO instead. + */ +PgStat_StatTabEntry * +pgstat_fetch_stat_tabentry(Oid relid) +{ + PgStat_StatTabEntry *tabentry; + + tabentry = pgstat_fetch_stat_tabentry_ext(false, relid); + if (tabentry != NULL) + return tabentry; + + /* + * If we didn't find it, maybe it's a shared table. + */ + tabentry = pgstat_fetch_stat_tabentry_ext(true, relid); + return tabentry; +} + +/* + * More efficient version of pgstat_fetch_stat_tabentry(), allowing to specify + * whether the to-be-accessed table is a shared relation or not. + */ +PgStat_StatTabEntry * +pgstat_fetch_stat_tabentry_ext(bool shared, Oid reloid) +{ + Oid dboid = (shared ? InvalidOid : MyDatabaseId); + + return (PgStat_StatTabEntry *) + pgstat_fetch_entry(PGSTAT_KIND_RELATION, dboid, reloid); +} + +/* + * find any existing PgStat_TableStatus entry for rel + * + * Find any existing PgStat_TableStatus entry for rel_id in the current + * database. If not found, try finding from shared tables. 
+ * + * If no entry found, return NULL, don't create a new one + */ +PgStat_TableStatus * +find_tabstat_entry(Oid rel_id) +{ + PgStat_EntryRef *entry_ref; + + entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_RELATION, MyDatabaseId, rel_id); + if (!entry_ref) + entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_RELATION, InvalidOid, rel_id); + + if (entry_ref) + return entry_ref->pending; + return NULL; +} + +/* + * Perform relation stats specific end-of-transaction work. Helper for + * AtEOXact_PgStat. + * + * Transfer transactional insert/update counts into the base tabstat entries. + * We don't bother to free any of the transactional state, since it's all in + * TopTransactionContext and will go away anyway. + */ +void +AtEOXact_PgStat_Relations(PgStat_SubXactStatus *xact_state, bool isCommit) +{ + PgStat_TableXactStatus *trans; + + for (trans = xact_state->first; trans != NULL; trans = trans->next) + { + PgStat_TableStatus *tabstat; + + Assert(trans->nest_level == 1); + Assert(trans->upper == NULL); + tabstat = trans->parent; + Assert(tabstat->trans == trans); + /* restore pre-truncate/drop stats (if any) in case of aborted xact */ + if (!isCommit) + restore_truncdrop_counters(trans); + /* count attempted actions regardless of commit/abort */ + tabstat->t_counts.t_tuples_inserted += trans->tuples_inserted; + tabstat->t_counts.t_tuples_updated += trans->tuples_updated; + tabstat->t_counts.t_tuples_deleted += trans->tuples_deleted; + if (isCommit) + { + tabstat->t_counts.t_truncdropped = trans->truncdropped; + if (trans->truncdropped) + { + /* forget live/dead stats seen by backend thus far */ + tabstat->t_counts.t_delta_live_tuples = 0; + tabstat->t_counts.t_delta_dead_tuples = 0; + } + /* insert adds a live tuple, delete removes one */ + tabstat->t_counts.t_delta_live_tuples += + trans->tuples_inserted - trans->tuples_deleted; + /* update and delete each create a dead tuple */ + tabstat->t_counts.t_delta_dead_tuples += + trans->tuples_updated + trans->tuples_deleted; + /* insert, update, delete each count as one change event */ + tabstat->t_counts.t_changed_tuples += + trans->tuples_inserted + trans->tuples_updated + + trans->tuples_deleted; + } + else + { + /* inserted tuples are dead, deleted tuples are unaffected */ + tabstat->t_counts.t_delta_dead_tuples += + trans->tuples_inserted + trans->tuples_updated; + /* an aborted xact generates no changed_tuple events */ + } + tabstat->trans = NULL; + } +} + +/* + * Perform relation stats specific end-of-sub-transaction work. Helper for + * AtEOSubXact_PgStat. + * + * Transfer transactional insert/update counts into the next higher + * subtransaction state. 
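+ * For example, if a subxact at nest level 2 inserts 10 tuples and commits,
+ * those counts are carried into the level-1 state (merging with an existing
+ * level-1 record if there is one); if it aborts instead, the tuples are
+ * counted as dead in the top-level tabstat entry.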
+ */ +void +AtEOSubXact_PgStat_Relations(PgStat_SubXactStatus *xact_state, bool isCommit, int nestDepth) +{ + PgStat_TableXactStatus *trans; + PgStat_TableXactStatus *next_trans; + + for (trans = xact_state->first; trans != NULL; trans = next_trans) + { + PgStat_TableStatus *tabstat; + + next_trans = trans->next; + Assert(trans->nest_level == nestDepth); + tabstat = trans->parent; + Assert(tabstat->trans == trans); + + if (isCommit) + { + if (trans->upper && trans->upper->nest_level == nestDepth - 1) + { + if (trans->truncdropped) + { + /* propagate the truncate/drop status one level up */ + save_truncdrop_counters(trans->upper, false); + /* replace upper xact stats with ours */ + trans->upper->tuples_inserted = trans->tuples_inserted; + trans->upper->tuples_updated = trans->tuples_updated; + trans->upper->tuples_deleted = trans->tuples_deleted; + } + else + { + trans->upper->tuples_inserted += trans->tuples_inserted; + trans->upper->tuples_updated += trans->tuples_updated; + trans->upper->tuples_deleted += trans->tuples_deleted; + } + tabstat->trans = trans->upper; + pfree(trans); + } + else + { + /* + * When there isn't an immediate parent state, we can just + * reuse the record instead of going through a palloc/pfree + * pushup (this works since it's all in TopTransactionContext + * anyway). We have to re-link it into the parent level, + * though, and that might mean pushing a new entry into the + * pgStatXactStack. + */ + PgStat_SubXactStatus *upper_xact_state; + + upper_xact_state = pgstat_get_xact_stack_level(nestDepth - 1); + trans->next = upper_xact_state->first; + upper_xact_state->first = trans; + trans->nest_level = nestDepth - 1; + } + } + else + { + /* + * On abort, update top-level tabstat counts, then forget the + * subtransaction + */ + + /* first restore values obliterated by truncate/drop */ + restore_truncdrop_counters(trans); + /* count attempted actions regardless of commit/abort */ + tabstat->t_counts.t_tuples_inserted += trans->tuples_inserted; + tabstat->t_counts.t_tuples_updated += trans->tuples_updated; + tabstat->t_counts.t_tuples_deleted += trans->tuples_deleted; + /* inserted tuples are dead, deleted tuples are unaffected */ + tabstat->t_counts.t_delta_dead_tuples += + trans->tuples_inserted + trans->tuples_updated; + tabstat->trans = trans->upper; + pfree(trans); + } + } +} + +/* + * Generate 2PC records for all the pending transaction-dependent relation + * stats. + */ +void +AtPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state) +{ + PgStat_TableXactStatus *trans; + + for (trans = xact_state->first; trans != NULL; trans = trans->next) + { + PgStat_TableStatus *tabstat PG_USED_FOR_ASSERTS_ONLY; + TwoPhasePgStatRecord record; + + Assert(trans->nest_level == 1); + Assert(trans->upper == NULL); + tabstat = trans->parent; + Assert(tabstat->trans == trans); + + record.tuples_inserted = trans->tuples_inserted; + record.tuples_updated = trans->tuples_updated; + record.tuples_deleted = trans->tuples_deleted; + record.inserted_pre_truncdrop = trans->inserted_pre_truncdrop; + record.updated_pre_truncdrop = trans->updated_pre_truncdrop; + record.deleted_pre_truncdrop = trans->deleted_pre_truncdrop; + record.t_id = tabstat->t_id; + record.t_shared = tabstat->t_shared; + record.t_truncdropped = trans->truncdropped; + + RegisterTwoPhaseRecord(TWOPHASE_RM_PGSTAT_ID, 0, + &record, sizeof(TwoPhasePgStatRecord)); + } +} + +/* + * All we need do here is unlink the transaction stats state from the + * nontransactional state. 
The nontransactional action counts will be
+ * reported to the stats system immediately, while the effects on live and
+ * dead tuple counts are preserved in the 2PC state file.
+ *
+ * Note: AtEOXact_PgStat_Relations is not called during PREPARE.
+ */
+void
+PostPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state)
+{
+ PgStat_TableXactStatus *trans;
+
+ for (trans = xact_state->first; trans != NULL; trans = trans->next)
+ {
+ PgStat_TableStatus *tabstat;
+
+ tabstat = trans->parent;
+ tabstat->trans = NULL;
+ }
+}
+
+/*
+ * 2PC processing routine for COMMIT PREPARED case.
+ *
+ * Load the saved counts into our local pgstats state.
+ */
+void
+pgstat_twophase_postcommit(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+ PgStat_TableStatus *pgstat_info;
+
+ /* Find or create a tabstat entry for the rel */
+ pgstat_info = pgstat_prep_relation_pending(rec->t_id, rec->t_shared);
+
+ /* Same math as in AtEOXact_PgStat, commit case */
+ pgstat_info->t_counts.t_tuples_inserted += rec->tuples_inserted;
+ pgstat_info->t_counts.t_tuples_updated += rec->tuples_updated;
+ pgstat_info->t_counts.t_tuples_deleted += rec->tuples_deleted;
+ pgstat_info->t_counts.t_truncdropped = rec->t_truncdropped;
+ if (rec->t_truncdropped)
+ {
+ /* forget live/dead stats seen by backend thus far */
+ pgstat_info->t_counts.t_delta_live_tuples = 0;
+ pgstat_info->t_counts.t_delta_dead_tuples = 0;
+ }
+ pgstat_info->t_counts.t_delta_live_tuples +=
+ rec->tuples_inserted - rec->tuples_deleted;
+ pgstat_info->t_counts.t_delta_dead_tuples +=
+ rec->tuples_updated + rec->tuples_deleted;
+ pgstat_info->t_counts.t_changed_tuples +=
+ rec->tuples_inserted + rec->tuples_updated +
+ rec->tuples_deleted;
+}
+
+/*
+ * 2PC processing routine for ROLLBACK PREPARED case.
+ *
+ * Load the saved counts into our local pgstats state, but treat them
+ * as aborted.
+ */
+void
+pgstat_twophase_postabort(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata;
+ PgStat_TableStatus *pgstat_info;
+
+ /* Find or create a tabstat entry for the rel */
+ pgstat_info = pgstat_prep_relation_pending(rec->t_id, rec->t_shared);
+
+ /* Same math as in AtEOXact_PgStat, abort case */
+ if (rec->t_truncdropped)
+ {
+ rec->tuples_inserted = rec->inserted_pre_truncdrop;
+ rec->tuples_updated = rec->updated_pre_truncdrop;
+ rec->tuples_deleted = rec->deleted_pre_truncdrop;
+ }
+ pgstat_info->t_counts.t_tuples_inserted += rec->tuples_inserted;
+ pgstat_info->t_counts.t_tuples_updated += rec->tuples_updated;
+ pgstat_info->t_counts.t_tuples_deleted += rec->tuples_deleted;
+ pgstat_info->t_counts.t_delta_dead_tuples +=
+ rec->tuples_inserted + rec->tuples_updated;
+}
+
+/*
+ * Flush out pending stats for the entry
+ *
+ * If nowait is true, this function returns false if the lock could not be
+ * acquired immediately; otherwise true is returned.
+ *
+ * Some of the stats are copied to the corresponding pending database stats
+ * entry when successfully flushing.
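+ * Callers passing nowait = true (such as the pending-stats flush loop) are
+ * expected to keep the entry pending and retry in a later flush cycle when
+ * false is returned.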
+ */ +bool +pgstat_relation_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) +{ + static const PgStat_TableCounts all_zeroes; + Oid dboid; + PgStat_TableStatus *lstats; /* pending stats entry */ + PgStatShared_Relation *shtabstats; + PgStat_StatTabEntry *tabentry; /* table entry of shared stats */ + PgStat_StatDBEntry *dbentry; /* pending database entry */ + + dboid = entry_ref->shared_entry->key.dboid; + lstats = (PgStat_TableStatus *) entry_ref->pending; + shtabstats = (PgStatShared_Relation *) entry_ref->shared_stats; + + /* + * Ignore entries that didn't accumulate any actual counts, such as + * indexes that were opened by the planner but not used. + */ + if (memcmp(&lstats->t_counts, &all_zeroes, + sizeof(PgStat_TableCounts)) == 0) + { + return true; + } + + if (!pgstat_lock_entry(entry_ref, nowait)) + return false; + + /* add the values to the shared entry. */ + tabentry = &shtabstats->stats; + + tabentry->numscans += lstats->t_counts.t_numscans; + tabentry->tuples_returned += lstats->t_counts.t_tuples_returned; + tabentry->tuples_fetched += lstats->t_counts.t_tuples_fetched; + tabentry->tuples_inserted += lstats->t_counts.t_tuples_inserted; + tabentry->tuples_updated += lstats->t_counts.t_tuples_updated; + tabentry->tuples_deleted += lstats->t_counts.t_tuples_deleted; + tabentry->tuples_hot_updated += lstats->t_counts.t_tuples_hot_updated; + + /* + * If table was truncated/dropped, first reset the live/dead counters. + */ + if (lstats->t_counts.t_truncdropped) + { + tabentry->n_live_tuples = 0; + tabentry->n_dead_tuples = 0; + tabentry->inserts_since_vacuum = 0; + } + + tabentry->n_live_tuples += lstats->t_counts.t_delta_live_tuples; + tabentry->n_dead_tuples += lstats->t_counts.t_delta_dead_tuples; + tabentry->changes_since_analyze += lstats->t_counts.t_changed_tuples; + tabentry->inserts_since_vacuum += lstats->t_counts.t_tuples_inserted; + tabentry->blocks_fetched += lstats->t_counts.t_blocks_fetched; + tabentry->blocks_hit += lstats->t_counts.t_blocks_hit; + + /* Clamp n_live_tuples in case of negative delta_live_tuples */ + tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0); + /* Likewise for n_dead_tuples */ + tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0); + + pgstat_unlock_entry(entry_ref); + + /* The entry was successfully flushed, add the same to database stats */ + dbentry = pgstat_prep_database_pending(dboid); + dbentry->n_tuples_returned += lstats->t_counts.t_tuples_returned; + dbentry->n_tuples_fetched += lstats->t_counts.t_tuples_fetched; + dbentry->n_tuples_inserted += lstats->t_counts.t_tuples_inserted; + dbentry->n_tuples_updated += lstats->t_counts.t_tuples_updated; + dbentry->n_tuples_deleted += lstats->t_counts.t_tuples_deleted; + dbentry->n_blocks_fetched += lstats->t_counts.t_blocks_fetched; + dbentry->n_blocks_hit += lstats->t_counts.t_blocks_hit; + + return true; +} + +void +pgstat_relation_delete_pending_cb(PgStat_EntryRef *entry_ref) +{ + PgStat_TableStatus *pending = (PgStat_TableStatus *) entry_ref->pending; + + if (pending->relation) + pgstat_unlink_relation(pending->relation); +} + +/* + * Find or create a PgStat_TableStatus entry for rel. New entry is created and + * initialized if not exists. + */ +static PgStat_TableStatus * +pgstat_prep_relation_pending(Oid rel_id, bool isshared) +{ + PgStat_EntryRef *entry_ref; + PgStat_TableStatus *pending; + + entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_RELATION, + isshared ? 
InvalidOid : MyDatabaseId,
+ rel_id, NULL);
+
+ pending = entry_ref->pending;
+ pending->t_id = rel_id;
+ pending->t_shared = isshared;
+
+ return pending;
+}
+
+/*
+ * add a new (sub)transaction state record
+ */
+static void
+add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level)
+{
+ PgStat_SubXactStatus *xact_state;
+ PgStat_TableXactStatus *trans;
+
+ /*
+ * If this is the first rel to be modified at the current nest level, we
+ * first have to push a transaction stack entry.
+ */
+ xact_state = pgstat_get_xact_stack_level(nest_level);
+
+ /* Now make a per-table stack entry */
+ trans = (PgStat_TableXactStatus *)
+ MemoryContextAllocZero(TopTransactionContext,
+ sizeof(PgStat_TableXactStatus));
+ trans->nest_level = nest_level;
+ trans->upper = pgstat_info->trans;
+ trans->parent = pgstat_info;
+ trans->next = xact_state->first;
+ xact_state->first = trans;
+ pgstat_info->trans = trans;
+}
+
+/*
+ * Add a new (sub)transaction record if needed.
+ */
+static void
+ensure_tabstat_xact_level(PgStat_TableStatus *pgstat_info)
+{
+ int nest_level = GetCurrentTransactionNestLevel();
+
+ if (pgstat_info->trans == NULL ||
+ pgstat_info->trans->nest_level != nest_level)
+ add_tabstat_xact_level(pgstat_info, nest_level);
+}
+
+/*
+ * Whenever a table is truncated/dropped, we save its i/u/d counters so that
+ * they can be cleared, and if the (sub)xact that executed the truncate/drop
+ * later aborts, the counters can be restored to the saved (pre-truncate/drop)
+ * values.
+ *
+ * Note that for truncate we do this on the first truncate in any particular
+ * subxact level only.
+ */
+static void
+save_truncdrop_counters(PgStat_TableXactStatus *trans, bool is_drop)
+{
+ if (!trans->truncdropped || is_drop)
+ {
+ trans->inserted_pre_truncdrop = trans->tuples_inserted;
+ trans->updated_pre_truncdrop = trans->tuples_updated;
+ trans->deleted_pre_truncdrop = trans->tuples_deleted;
+ trans->truncdropped = true;
+ }
+}
+
+/*
+ * restore counters when a truncate aborts
+ */
+static void
+restore_truncdrop_counters(PgStat_TableXactStatus *trans)
+{
+ if (trans->truncdropped)
+ {
+ trans->tuples_inserted = trans->inserted_pre_truncdrop;
+ trans->tuples_updated = trans->updated_pre_truncdrop;
+ trans->tuples_deleted = trans->deleted_pre_truncdrop;
+ }
+}
diff --git a/src/backend/utils/activity/pgstat_replslot.c b/src/backend/utils/activity/pgstat_replslot.c
new file mode 100644
index 0000000..1a8473b
--- /dev/null
+++ b/src/backend/utils/activity/pgstat_replslot.c
@@ -0,0 +1,224 @@
+/* -------------------------------------------------------------------------
+ *
+ * pgstat_replslot.c
+ * Implementation of replication slot statistics.
+ *
+ * This file contains the implementation of replication slot statistics. It
+ * is kept separate from pgstat.c to enforce the line between the statistics
+ * access / storage implementation and the details about individual types of
+ * statistics.
+ *
+ * Replication slot stats work a bit differently than other variable-numbered
+ * stats. Slots do not have oids (so they can be created on physical
+ * replicas). We use the slot index as object id while running. However, the
+ * slot index can change when restarting. That is addressed by using the
+ * name when (de-)serializing. After a restart it is possible for slots to
+ * have been dropped while shut down, which is addressed by not restoring
+ * stats for slots that cannot be found by name when starting up.
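+ * For example, a slot occupying index 3 while running is serialized under
+ * its name; if after a restart a slot with that name sits at index 1, its
+ * stats are reattached at index 1, and if no slot with that name exists
+ * anymore, the stats are discarded.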
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/activity/pgstat_replslot.c
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "replication/slot.h"
+#include "utils/builtins.h" /* for namestrcpy() */
+#include "utils/pgstat_internal.h"
+
+
+static int get_replslot_index(const char *name);
+
+
+/*
+ * Reset counters for a single replication slot.
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
+ */
+void
+pgstat_reset_replslot(const char *name)
+{
+ ReplicationSlot *slot;
+
+ AssertArg(name != NULL);
+
+ /* Check if a slot exists with the given name. */
+ slot = SearchNamedReplicationSlot(name, true);
+
+ if (!slot)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("replication slot \"%s\" does not exist",
+ name)));
+
+ /*
+ * Nothing to do for physical slots as we collect stats only for logical
+ * slots.
+ */
+ if (SlotIsPhysical(slot))
+ return;
+
+ /* reset this one entry */
+ pgstat_reset(PGSTAT_KIND_REPLSLOT, InvalidOid,
+ ReplicationSlotIndex(slot));
+}
+
+/*
+ * Report replication slot statistics.
+ *
+ * We can rely on the stats for the slot to exist and to belong to this
+ * slot. We can only get here if pgstat_create_replslot() or
+ * pgstat_acquire_replslot() has already been called.
+ */
+void
+pgstat_report_replslot(ReplicationSlot *slot, const PgStat_StatReplSlotEntry *repSlotStat)
+{
+ PgStat_EntryRef *entry_ref;
+ PgStatShared_ReplSlot *shstatent;
+ PgStat_StatReplSlotEntry *statent;
+
+ entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_REPLSLOT, InvalidOid,
+ ReplicationSlotIndex(slot), false);
+ shstatent = (PgStatShared_ReplSlot *) entry_ref->shared_stats;
+ statent = &shstatent->stats;
+
+ /* Update the replication slot statistics */
+#define REPLSLOT_ACC(fld) statent->fld += repSlotStat->fld
+ REPLSLOT_ACC(spill_txns);
+ REPLSLOT_ACC(spill_count);
+ REPLSLOT_ACC(spill_bytes);
+ REPLSLOT_ACC(stream_txns);
+ REPLSLOT_ACC(stream_count);
+ REPLSLOT_ACC(stream_bytes);
+ REPLSLOT_ACC(total_txns);
+ REPLSLOT_ACC(total_bytes);
+#undef REPLSLOT_ACC
+
+ pgstat_unlock_entry(entry_ref);
+}
+
+/*
+ * Report replication slot creation.
+ *
+ * NB: This gets called with ReplicationSlotAllocationLock already held; be
+ * careful about calling back into slot.c.
+ */
+void
+pgstat_create_replslot(ReplicationSlot *slot)
+{
+ PgStat_EntryRef *entry_ref;
+ PgStatShared_ReplSlot *shstatent;
+
+ entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_REPLSLOT, InvalidOid,
+ ReplicationSlotIndex(slot), false);
+ shstatent = (PgStatShared_ReplSlot *) entry_ref->shared_stats;
+
+ /*
+ * NB: need to accept that there might be stats from an older slot, e.g.
+ * if we previously crashed after dropping a slot.
+ */
+ memset(&shstatent->stats, 0, sizeof(shstatent->stats));
+
+ pgstat_unlock_entry(entry_ref);
+}
+
+/*
+ * Report that a replication slot has been acquired.
+ *
+ * This guarantees that a stats entry exists during later
+ * pgstat_report_replslot() calls.
+ *
+ * If we previously crashed, no stats data exists.
But if we did not crash, + * the stats do belong to this slot: + * - the stats cannot belong to a dropped slot, pgstat_drop_replslot() would + * have been called + * - if the slot was removed while shut down, + * pgstat_replslot_from_serialized_name_cb() returning false would have + * caused the stats to be dropped + */ +void +pgstat_acquire_replslot(ReplicationSlot *slot) +{ + pgstat_get_entry_ref(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot), true, NULL); +} + +/* + * Report replication slot drop. + */ +void +pgstat_drop_replslot(ReplicationSlot *slot) +{ + pgstat_drop_entry(PGSTAT_KIND_REPLSLOT, InvalidOid, + ReplicationSlotIndex(slot)); +} + +/* + * Support function for the SQL-callable pgstat* functions. Returns + * a pointer to the replication slot statistics struct. + */ +PgStat_StatReplSlotEntry * +pgstat_fetch_replslot(NameData slotname) +{ + int idx = get_replslot_index(NameStr(slotname)); + + if (idx == -1) + return NULL; + + return (PgStat_StatReplSlotEntry *) + pgstat_fetch_entry(PGSTAT_KIND_REPLSLOT, InvalidOid, idx); +} + +void +pgstat_replslot_to_serialized_name_cb(const PgStat_HashKey *key, const PgStatShared_Common *header, NameData *name) +{ + /* + * This is only called late during shutdown. The set of existing slots + * isn't allowed to change at this point, we can assume that a slot exists + * at the offset. + */ + if (!ReplicationSlotName(key->objoid, name)) + elog(ERROR, "could not find name for replication slot index %u", + key->objoid); +} + +bool +pgstat_replslot_from_serialized_name_cb(const NameData *name, PgStat_HashKey *key) +{ + int idx = get_replslot_index(NameStr(*name)); + + /* slot might have been deleted */ + if (idx == -1) + return false; + + key->kind = PGSTAT_KIND_REPLSLOT; + key->dboid = InvalidOid; + key->objoid = idx; + + return true; +} + +void +pgstat_replslot_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts) +{ + ((PgStatShared_ReplSlot *) header)->stats.stat_reset_timestamp = ts; +} + +static int +get_replslot_index(const char *name) +{ + ReplicationSlot *slot; + + AssertArg(name != NULL); + + slot = SearchNamedReplicationSlot(name, true); + + if (!slot) + return -1; + + return ReplicationSlotIndex(slot); +} diff --git a/src/backend/utils/activity/pgstat_shmem.c b/src/backend/utils/activity/pgstat_shmem.c new file mode 100644 index 0000000..9a4f037 --- /dev/null +++ b/src/backend/utils/activity/pgstat_shmem.c @@ -0,0 +1,1003 @@ +/* ------------------------------------------------------------------------- + * + * pgstat_shmem.c + * Storage of stats entries in shared memory + * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/activity/pgstat_shmem.c + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "pgstat.h" +#include "storage/shmem.h" +#include "utils/memutils.h" +#include "utils/pgstat_internal.h" + + +#define PGSTAT_ENTRY_REF_HASH_SIZE 128 + +/* hash table entry for finding the PgStat_EntryRef for a key */ +typedef struct PgStat_EntryRefHashEntry +{ + PgStat_HashKey key; /* hash key */ + char status; /* for simplehash use */ + PgStat_EntryRef *entry_ref; +} PgStat_EntryRefHashEntry; + + +/* for references to shared statistics entries */ +#define SH_PREFIX pgstat_entry_ref_hash +#define SH_ELEMENT_TYPE PgStat_EntryRefHashEntry +#define SH_KEY_TYPE PgStat_HashKey +#define SH_KEY key +#define SH_HASH_KEY(tb, key) \ + pgstat_hash_hash_key(&key, sizeof(PgStat_HashKey), NULL) +#define 
SH_EQUAL(tb, a, b) \
+ pgstat_cmp_hash_key(&a, &b, sizeof(PgStat_HashKey), NULL) == 0
+#define SH_SCOPE static inline
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
+
+static void pgstat_drop_database_and_contents(Oid dboid);
+
+static void pgstat_free_entry(PgStatShared_HashEntry *shent, dshash_seq_status *hstat);
+
+static void pgstat_release_entry_ref(PgStat_HashKey key, PgStat_EntryRef *entry_ref, bool discard_pending);
+static bool pgstat_need_entry_refs_gc(void);
+static void pgstat_gc_entry_refs(void);
+static void pgstat_release_all_entry_refs(bool discard_pending);
+typedef bool (*ReleaseMatchCB) (PgStat_EntryRefHashEntry *, Datum data);
+static void pgstat_release_matching_entry_refs(bool discard_pending, ReleaseMatchCB match, Datum match_data);
+
+static void pgstat_setup_memcxt(void);
+
+
+/* parameter for the shared hash */
+static const dshash_parameters dsh_params = {
+ sizeof(PgStat_HashKey),
+ sizeof(PgStatShared_HashEntry),
+ pgstat_cmp_hash_key,
+ pgstat_hash_hash_key,
+ LWTRANCHE_PGSTATS_HASH
+};
+
+
+/*
+ * Backend local references to shared stats entries. If there are pending
+ * updates to a stats entry, the PgStat_EntryRef is added to the pgStatPending
+ * list.
+ *
+ * When a stats entry is dropped each backend needs to release its reference
+ * to it before the memory can be released. To trigger that,
+ * pgStatLocal.shmem->gc_request_count is incremented - which each backend
+ * compares to its copy of pgStatSharedRefAge on a regular basis.
+ */
+static pgstat_entry_ref_hash_hash *pgStatEntryRefHash = NULL;
+static int pgStatSharedRefAge = 0; /* cache age of pgStatEntryRefHash */
+
+/*
+ * Memory contexts containing the pgStatEntryRefHash table and the
+ * pgStatSharedRef entries respectively. Kept separate to make it easier to
+ * track / attribute memory usage.
+ */
+static MemoryContext pgStatSharedRefContext = NULL;
+static MemoryContext pgStatEntryRefHashContext = NULL;
+
+
+/* ------------------------------------------------------------
+ * Public functions called from postmaster follow
+ * ------------------------------------------------------------
+ */
+
+/*
+ * The size of the shared memory allocation for stats stored in the shared
+ * stats hash table. This allocation will be done as part of the main shared
+ * memory, rather than dynamic shared memory, allowing it to be initialized in
+ * postmaster.
+ */
+static Size
+pgstat_dsa_init_size(void)
+{
+ Size sz;
+
+ /*
+ * The dshash header / initial buckets array needs to fit into "plain"
+ * shared memory, but it's beneficial to not need dsm segments
+ * immediately. A size of 256kB seems to work well and is not
+ * disproportionate compared to other constant sized shared memory
+ * allocations. NB: To avoid DSMs further, the user can configure
+ * min_dynamic_shared_memory.
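+ * For instance, a hypothetical setting of min_dynamic_shared_memory = 1MB
+ * in postgresql.conf would preallocate shared memory at server start from
+ * which later dsa allocations for stats can be satisfied without creating
+ * new DSM segments.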
+ */ + sz = 256 * 1024; + Assert(dsa_minimum_size() <= sz); + return MAXALIGN(sz); +} + +/* + * Compute shared memory space needed for cumulative statistics + */ +Size +StatsShmemSize(void) +{ + Size sz; + + sz = MAXALIGN(sizeof(PgStat_ShmemControl)); + sz = add_size(sz, pgstat_dsa_init_size()); + + return sz; +} + +/* + * Initialize cumulative statistics system during startup + */ +void +StatsShmemInit(void) +{ + bool found; + Size sz; + + sz = StatsShmemSize(); + pgStatLocal.shmem = (PgStat_ShmemControl *) + ShmemInitStruct("Shared Memory Stats", sz, &found); + + if (!IsUnderPostmaster) + { + dsa_area *dsa; + dshash_table *dsh; + PgStat_ShmemControl *ctl = pgStatLocal.shmem; + char *p = (char *) ctl; + + Assert(!found); + + /* the allocation of pgStatLocal.shmem itself */ + p += MAXALIGN(sizeof(PgStat_ShmemControl)); + + /* + * Create a small dsa allocation in plain shared memory. This is + * required because postmaster cannot use dsm segments. It also + * provides a small efficiency win. + */ + ctl->raw_dsa_area = p; + p += MAXALIGN(pgstat_dsa_init_size()); + dsa = dsa_create_in_place(ctl->raw_dsa_area, + pgstat_dsa_init_size(), + LWTRANCHE_PGSTATS_DSA, 0); + dsa_pin(dsa); + + /* + * To ensure dshash is created in "plain" shared memory, temporarily + * limit size of dsa to the initial size of the dsa. + */ + dsa_set_size_limit(dsa, pgstat_dsa_init_size()); + + /* + * With the limit in place, create the dshash table. XXX: It'd be nice + * if there were dshash_create_in_place(). + */ + dsh = dshash_create(dsa, &dsh_params, 0); + ctl->hash_handle = dshash_get_hash_table_handle(dsh); + + /* lift limit set above */ + dsa_set_size_limit(dsa, -1); + + /* + * Postmaster will never access these again, thus free the local + * dsa/dshash references. + */ + dshash_detach(dsh); + dsa_detach(dsa); + + pg_atomic_init_u64(&ctl->gc_request_count, 1); + + + /* initialize fixed-numbered stats */ + LWLockInitialize(&ctl->archiver.lock, LWTRANCHE_PGSTATS_DATA); + LWLockInitialize(&ctl->bgwriter.lock, LWTRANCHE_PGSTATS_DATA); + LWLockInitialize(&ctl->checkpointer.lock, LWTRANCHE_PGSTATS_DATA); + LWLockInitialize(&ctl->slru.lock, LWTRANCHE_PGSTATS_DATA); + LWLockInitialize(&ctl->wal.lock, LWTRANCHE_PGSTATS_DATA); + } + else + { + Assert(found); + } +} + +void +pgstat_attach_shmem(void) +{ + MemoryContext oldcontext; + + Assert(pgStatLocal.dsa == NULL); + + /* stats shared memory persists for the backend lifetime */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + pgStatLocal.dsa = dsa_attach_in_place(pgStatLocal.shmem->raw_dsa_area, + NULL); + dsa_pin_mapping(pgStatLocal.dsa); + + pgStatLocal.shared_hash = dshash_attach(pgStatLocal.dsa, &dsh_params, + pgStatLocal.shmem->hash_handle, 0); + + MemoryContextSwitchTo(oldcontext); +} + +void +pgstat_detach_shmem(void) +{ + Assert(pgStatLocal.dsa); + + /* we shouldn't leave references to shared stats */ + pgstat_release_all_entry_refs(false); + + dshash_detach(pgStatLocal.shared_hash); + pgStatLocal.shared_hash = NULL; + + dsa_detach(pgStatLocal.dsa); + pgStatLocal.dsa = NULL; +} + + +/* ------------------------------------------------------------ + * Maintenance of shared memory stats entries + * ------------------------------------------------------------ + */ + +PgStatShared_Common * +pgstat_init_entry(PgStat_Kind kind, + PgStatShared_HashEntry *shhashent) +{ + /* Create new stats entry. */ + dsa_pointer chunk; + PgStatShared_Common *shheader; + + /* + * Initialize refcount to 1, marking it as valid / not dropped. 
The entry + * can't be freed before the initialization because it can't be found as + * long as we hold the dshash partition lock. Caller needs to increase + * further if a longer lived reference is needed. + */ + pg_atomic_init_u32(&shhashent->refcount, 1); + shhashent->dropped = false; + + chunk = dsa_allocate0(pgStatLocal.dsa, pgstat_get_kind_info(kind)->shared_size); + shheader = dsa_get_address(pgStatLocal.dsa, chunk); + shheader->magic = 0xdeadbeef; + + /* Link the new entry from the hash entry. */ + shhashent->body = chunk; + + LWLockInitialize(&shheader->lock, LWTRANCHE_PGSTATS_DATA); + + return shheader; +} + +static PgStatShared_Common * +pgstat_reinit_entry(PgStat_Kind kind, PgStatShared_HashEntry *shhashent) +{ + PgStatShared_Common *shheader; + + shheader = dsa_get_address(pgStatLocal.dsa, shhashent->body); + + /* mark as not dropped anymore */ + pg_atomic_fetch_add_u32(&shhashent->refcount, 1); + shhashent->dropped = false; + + /* reinitialize content */ + Assert(shheader->magic == 0xdeadbeef); + memset(pgstat_get_entry_data(kind, shheader), 0, + pgstat_get_entry_len(kind)); + + return shheader; +} + +static void +pgstat_setup_shared_refs(void) +{ + if (likely(pgStatEntryRefHash != NULL)) + return; + + pgStatEntryRefHash = + pgstat_entry_ref_hash_create(pgStatEntryRefHashContext, + PGSTAT_ENTRY_REF_HASH_SIZE, NULL); + pgStatSharedRefAge = pg_atomic_read_u64(&pgStatLocal.shmem->gc_request_count); + Assert(pgStatSharedRefAge != 0); +} + +/* + * Helper function for pgstat_get_entry_ref(). + */ +static void +pgstat_acquire_entry_ref(PgStat_EntryRef *entry_ref, + PgStatShared_HashEntry *shhashent, + PgStatShared_Common *shheader) +{ + Assert(shheader->magic == 0xdeadbeef); + Assert(pg_atomic_read_u32(&shhashent->refcount) > 0); + + pg_atomic_fetch_add_u32(&shhashent->refcount, 1); + + dshash_release_lock(pgStatLocal.shared_hash, shhashent); + + entry_ref->shared_stats = shheader; + entry_ref->shared_entry = shhashent; +} + +/* + * Helper function for pgstat_get_entry_ref(). + */ +static bool +pgstat_get_entry_ref_cached(PgStat_HashKey key, PgStat_EntryRef **entry_ref_p) +{ + bool found; + PgStat_EntryRefHashEntry *cache_entry; + + /* + * We immediately insert a cache entry, because it avoids 1) multiple + * hashtable lookups in case of a cache miss 2) having to deal with + * out-of-memory errors after incrementing PgStatShared_Common->refcount. + */ + + cache_entry = pgstat_entry_ref_hash_insert(pgStatEntryRefHash, key, &found); + + if (!found || !cache_entry->entry_ref) + { + PgStat_EntryRef *entry_ref; + + cache_entry->entry_ref = entry_ref = + MemoryContextAlloc(pgStatSharedRefContext, + sizeof(PgStat_EntryRef)); + entry_ref->shared_stats = NULL; + entry_ref->shared_entry = NULL; + entry_ref->pending = NULL; + + found = false; + } + else if (cache_entry->entry_ref->shared_stats == NULL) + { + Assert(cache_entry->entry_ref->pending == NULL); + found = false; + } + else + { + PgStat_EntryRef *entry_ref PG_USED_FOR_ASSERTS_ONLY; + + entry_ref = cache_entry->entry_ref; + Assert(entry_ref->shared_entry != NULL); + Assert(entry_ref->shared_stats != NULL); + + Assert(entry_ref->shared_stats->magic == 0xdeadbeef); + /* should have at least our reference */ + Assert(pg_atomic_read_u32(&entry_ref->shared_entry->refcount) > 0); + } + + *entry_ref_p = cache_entry->entry_ref; + return found; +} + +/* + * Get a shared stats reference. If create is true, the shared stats object is + * created if it does not exist. 
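+ * With create = false this doubles as a pure lookup; a sketch of a
+ * hypothetical caller:
+ *
+ * entry_ref = pgstat_get_entry_ref(PGSTAT_KIND_FUNCTION, MyDatabaseId,
+ * funcoid, false, NULL);
+ * if (entry_ref == NULL)
+ * ... no stats entry exists for funcoid ...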
+ *
+ * When create is true, and created_entry is non-NULL, it'll be set to true
+ * if the entry is newly created, false otherwise.
+ */
+PgStat_EntryRef *
+pgstat_get_entry_ref(PgStat_Kind kind, Oid dboid, Oid objoid, bool create,
+ bool *created_entry)
+{
+ PgStat_HashKey key = {.kind = kind,.dboid = dboid,.objoid = objoid};
+ PgStatShared_HashEntry *shhashent;
+ PgStatShared_Common *shheader = NULL;
+ PgStat_EntryRef *entry_ref;
+
+ /*
+ * Passing in created_entry only makes sense if we possibly could create
+ * the entry.
+ */
+ AssertArg(create || created_entry == NULL);
+ pgstat_assert_is_up();
+ Assert(pgStatLocal.shared_hash != NULL);
+ Assert(!pgStatLocal.shmem->is_shutdown);
+
+ pgstat_setup_memcxt();
+ pgstat_setup_shared_refs();
+
+ if (created_entry != NULL)
+ *created_entry = false;
+
+ /*
+ * Check if other backends dropped stats that could not be deleted because
+ * somebody held references to them. If so, check this backend's
+ * references. This is not expected to happen often. The location of the
+ * check is a bit random, but this is a relatively frequently called path,
+ * so better than most.
+ */
+ if (pgstat_need_entry_refs_gc())
+ pgstat_gc_entry_refs();
+
+ /*
+ * First check the lookup cache hashtable in local memory. If we find a
+ * match here we can avoid taking locks / causing contention.
+ */
+ if (pgstat_get_entry_ref_cached(key, &entry_ref))
+ return entry_ref;
+
+ Assert(entry_ref != NULL);
+
+ /*
+ * Do a lookup in the hash table first - it's quite likely that the entry
+ * already exists, and that way we only need a shared lock.
+ */
+ shhashent = dshash_find(pgStatLocal.shared_hash, &key, false);
+
+ if (create && !shhashent)
+ {
+ bool shfound;
+
+ /*
+ * It's possible that somebody created the entry since the above
+ * lookup. If so, fall through to the same path we would have taken
+ * if it had already been created before the dshash_find() call.
+ */
+ shhashent = dshash_find_or_insert(pgStatLocal.shared_hash, &key, &shfound);
+ if (!shfound)
+ {
+ shheader = pgstat_init_entry(kind, shhashent);
+ pgstat_acquire_entry_ref(entry_ref, shhashent, shheader);
+
+ if (created_entry != NULL)
+ *created_entry = true;
+
+ return entry_ref;
+ }
+ }
+
+ if (!shhashent)
+ {
+ /*
+ * If we're not creating, delete the reference again. In all
+ * likelihood it's just a stats lookup - no point wasting memory for a
+ * shared ref to nothing...
+ */
+ pgstat_release_entry_ref(key, entry_ref, false);
+
+ return NULL;
+ }
+ else
+ {
+ /*
+ * Can get here either because dshash_find() found a match, or if
+ * dshash_find_or_insert() found a concurrently inserted entry.
+ */
+
+ if (shhashent->dropped && create)
+ {
+ /*
+ * There are legitimate cases where the old stats entry might not
+ * yet have been dropped by the time it's reused. The most obvious
+ * case is replication slot stats, where a new slot can be
+ * created with the same index just after dropping. But oid
+ * wraparound can lead to other cases as well. We just reset the
+ * stats to their plain state.
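+ * (pgstat_reinit_entry() keeps the existing DSA allocation and merely
+ * zeroes the entry's data, so such reuse is cheap.)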
+ */ + shheader = pgstat_reinit_entry(kind, shhashent); + pgstat_acquire_entry_ref(entry_ref, shhashent, shheader); + + if (created_entry != NULL) + *created_entry = true; + + return entry_ref; + } + else if (shhashent->dropped) + { + dshash_release_lock(pgStatLocal.shared_hash, shhashent); + pgstat_release_entry_ref(key, entry_ref, false); + + return NULL; + } + else + { + shheader = dsa_get_address(pgStatLocal.dsa, shhashent->body); + pgstat_acquire_entry_ref(entry_ref, shhashent, shheader); + + return entry_ref; + } + } +} + +static void +pgstat_release_entry_ref(PgStat_HashKey key, PgStat_EntryRef *entry_ref, + bool discard_pending) +{ + if (entry_ref && entry_ref->pending) + { + if (discard_pending) + pgstat_delete_pending_entry(entry_ref); + else + elog(ERROR, "releasing ref with pending data"); + } + + if (entry_ref && entry_ref->shared_stats) + { + Assert(entry_ref->shared_stats->magic == 0xdeadbeef); + Assert(entry_ref->pending == NULL); + + /* + * This can't race with another backend looking up the stats entry and + * increasing the refcount because it is not "legal" to create + * additional references to dropped entries. + */ + if (pg_atomic_fetch_sub_u32(&entry_ref->shared_entry->refcount, 1) == 1) + { + PgStatShared_HashEntry *shent; + + /* + * We're the last referrer to this entry, try to drop the shared + * entry. + */ + + /* only dropped entries can reach a 0 refcount */ + Assert(entry_ref->shared_entry->dropped); + + shent = dshash_find(pgStatLocal.shared_hash, + &entry_ref->shared_entry->key, + true); + if (!shent) + elog(ERROR, "could not find just referenced shared stats entry"); + + Assert(pg_atomic_read_u32(&entry_ref->shared_entry->refcount) == 0); + Assert(entry_ref->shared_entry == shent); + + pgstat_free_entry(shent, NULL); + } + } + + if (!pgstat_entry_ref_hash_delete(pgStatEntryRefHash, key)) + elog(ERROR, "entry ref vanished before deletion"); + + if (entry_ref) + pfree(entry_ref); +} + +bool +pgstat_lock_entry(PgStat_EntryRef *entry_ref, bool nowait) +{ + LWLock *lock = &entry_ref->shared_stats->lock; + + if (nowait) + return LWLockConditionalAcquire(lock, LW_EXCLUSIVE); + + LWLockAcquire(lock, LW_EXCLUSIVE); + return true; +} + +/* + * Separate from pgstat_lock_entry() as most callers will need to lock + * exclusively. + */ +bool +pgstat_lock_entry_shared(PgStat_EntryRef *entry_ref, bool nowait) +{ + LWLock *lock = &entry_ref->shared_stats->lock; + + if (nowait) + return LWLockConditionalAcquire(lock, LW_SHARED); + + LWLockAcquire(lock, LW_SHARED); + return true; +} + +void +pgstat_unlock_entry(PgStat_EntryRef *entry_ref) +{ + LWLockRelease(&entry_ref->shared_stats->lock); +} + +/* + * Helper function to fetch and lock shared stats. 
+ */ +PgStat_EntryRef * +pgstat_get_entry_ref_locked(PgStat_Kind kind, Oid dboid, Oid objoid, + bool nowait) +{ + PgStat_EntryRef *entry_ref; + + /* find shared table stats entry corresponding to the local entry */ + entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, true, NULL); + + /* lock the shared entry to protect the content, skip if failed */ + if (!pgstat_lock_entry(entry_ref, nowait)) + return NULL; + + return entry_ref; +} + +void +pgstat_request_entry_refs_gc(void) +{ + pg_atomic_fetch_add_u64(&pgStatLocal.shmem->gc_request_count, 1); +} + +static bool +pgstat_need_entry_refs_gc(void) +{ + uint64 curage; + + if (!pgStatEntryRefHash) + return false; + + /* should have been initialized when creating pgStatEntryRefHash */ + Assert(pgStatSharedRefAge != 0); + + curage = pg_atomic_read_u64(&pgStatLocal.shmem->gc_request_count); + + return pgStatSharedRefAge != curage; +} + +static void +pgstat_gc_entry_refs(void) +{ + pgstat_entry_ref_hash_iterator i; + PgStat_EntryRefHashEntry *ent; + uint64 curage; + + curage = pg_atomic_read_u64(&pgStatLocal.shmem->gc_request_count); + Assert(curage != 0); + + /* + * Some entries have been dropped. Invalidate cache pointer to them. + */ + pgstat_entry_ref_hash_start_iterate(pgStatEntryRefHash, &i); + while ((ent = pgstat_entry_ref_hash_iterate(pgStatEntryRefHash, &i)) != NULL) + { + PgStat_EntryRef *entry_ref = ent->entry_ref; + + Assert(!entry_ref->shared_stats || + entry_ref->shared_stats->magic == 0xdeadbeef); + + if (!entry_ref->shared_entry->dropped) + continue; + + /* cannot gc shared ref that has pending data */ + if (entry_ref->pending != NULL) + continue; + + pgstat_release_entry_ref(ent->key, entry_ref, false); + } + + pgStatSharedRefAge = curage; +} + +static void +pgstat_release_matching_entry_refs(bool discard_pending, ReleaseMatchCB match, + Datum match_data) +{ + pgstat_entry_ref_hash_iterator i; + PgStat_EntryRefHashEntry *ent; + + if (pgStatEntryRefHash == NULL) + return; + + pgstat_entry_ref_hash_start_iterate(pgStatEntryRefHash, &i); + + while ((ent = pgstat_entry_ref_hash_iterate(pgStatEntryRefHash, &i)) + != NULL) + { + Assert(ent->entry_ref != NULL); + + if (match && !match(ent, match_data)) + continue; + + pgstat_release_entry_ref(ent->key, ent->entry_ref, discard_pending); + } +} + +/* + * Release all local references to shared stats entries. + * + * When a process exits it cannot do so while still holding references onto + * stats entries, otherwise the shared stats entries could never be freed. 
+ */ +static void +pgstat_release_all_entry_refs(bool discard_pending) +{ + if (pgStatEntryRefHash == NULL) + return; + + pgstat_release_matching_entry_refs(discard_pending, NULL, 0); + Assert(pgStatEntryRefHash->members == 0); + pgstat_entry_ref_hash_destroy(pgStatEntryRefHash); + pgStatEntryRefHash = NULL; +} + +static bool +match_db(PgStat_EntryRefHashEntry *ent, Datum match_data) +{ + Oid dboid = DatumGetObjectId(match_data); + + return ent->key.dboid == dboid; +} + +static void +pgstat_release_db_entry_refs(Oid dboid) +{ + pgstat_release_matching_entry_refs( /* discard pending = */ true, + match_db, + ObjectIdGetDatum(dboid)); +} + + +/* ------------------------------------------------------------ + * Dropping and resetting of stats entries + * ------------------------------------------------------------ + */ + +static void +pgstat_free_entry(PgStatShared_HashEntry *shent, dshash_seq_status *hstat) +{ + dsa_pointer pdsa; + + /* + * Fetch dsa pointer before deleting entry - that way we can free the + * memory after releasing the lock. + */ + pdsa = shent->body; + + if (!hstat) + dshash_delete_entry(pgStatLocal.shared_hash, shent); + else + dshash_delete_current(hstat); + + dsa_free(pgStatLocal.dsa, pdsa); +} + +/* + * Helper for both pgstat_drop_database_and_contents() and + * pgstat_drop_entry(). If hstat is non-null delete the shared entry using + * dshash_delete_current(), otherwise use dshash_delete_entry(). In either + * case the entry needs to be already locked. + */ +static bool +pgstat_drop_entry_internal(PgStatShared_HashEntry *shent, + dshash_seq_status *hstat) +{ + Assert(shent->body != InvalidDsaPointer); + + /* should already have released local reference */ + if (pgStatEntryRefHash) + Assert(!pgstat_entry_ref_hash_lookup(pgStatEntryRefHash, shent->key)); + + /* + * Signal that the entry is dropped - this will eventually cause other + * backends to release their references. + */ + if (shent->dropped) + elog(ERROR, "can only drop stats once"); + shent->dropped = true; + + /* release refcount marking entry as not dropped */ + if (pg_atomic_sub_fetch_u32(&shent->refcount, 1) == 0) + { + pgstat_free_entry(shent, hstat); + return true; + } + else + { + if (!hstat) + dshash_release_lock(pgStatLocal.shared_hash, shent); + return false; + } +} + +/* + * Drop stats for the database and all the objects inside that database. + */ +static void +pgstat_drop_database_and_contents(Oid dboid) +{ + dshash_seq_status hstat; + PgStatShared_HashEntry *p; + uint64 not_freed_count = 0; + + Assert(OidIsValid(dboid)); + + Assert(pgStatLocal.shared_hash != NULL); + + /* + * This backend might very well be the only backend holding a reference to + * about-to-be-dropped entries. Ensure that we're not preventing it from + * being cleaned up till later. + * + * Doing this separately from the dshash iteration below avoids having to + * do so while holding a partition lock on the shared hashtable. + */ + pgstat_release_db_entry_refs(dboid); + + /* some of the dshash entries are to be removed, take exclusive lock. */ + dshash_seq_init(&hstat, pgStatLocal.shared_hash, true); + while ((p = dshash_seq_next(&hstat)) != NULL) + { + if (p->dropped) + continue; + + if (p->key.dboid != dboid) + continue; + + if (!pgstat_drop_entry_internal(p, &hstat)) + { + /* + * Even statistics for a dropped database might currently be + * accessed (consider e.g. database stats for pg_stat_database). 
+ */
+			not_freed_count++;
+		}
+	}
+	dshash_seq_term(&hstat);
+
+	/*
+	 * If some of the stats data could not be freed, signal the reference
+	 * holders to garbage collect their cached entry refs
+	 * (pgStatEntryRefHash).
+	 */
+	if (not_freed_count > 0)
+		pgstat_request_entry_refs_gc();
+}
+
+bool
+pgstat_drop_entry(PgStat_Kind kind, Oid dboid, Oid objoid)
+{
+	PgStat_HashKey key = {.kind = kind,.dboid = dboid,.objoid = objoid};
+	PgStatShared_HashEntry *shent;
+	bool		freed = true;
+
+	/* delete local reference */
+	if (pgStatEntryRefHash)
+	{
+		PgStat_EntryRefHashEntry *lohashent =
+		pgstat_entry_ref_hash_lookup(pgStatEntryRefHash, key);
+
+		if (lohashent)
+			pgstat_release_entry_ref(lohashent->key, lohashent->entry_ref,
+									 true);
+	}
+
+	/* mark entry in shared hashtable as deleted, drop if possible */
+	shent = dshash_find(pgStatLocal.shared_hash, &key, true);
+	if (shent)
+	{
+		freed = pgstat_drop_entry_internal(shent, NULL);
+
+		/*
+		 * Database stats contain other stats. Drop those as well when
+		 * dropping the database. XXX: Perhaps this should be done in a
+		 * slightly more principled way? But not obvious what that'd look
+		 * like, and so far this is the only case...
+		 */
+		if (key.kind == PGSTAT_KIND_DATABASE)
+			pgstat_drop_database_and_contents(key.dboid);
+	}
+
+	return freed;
+}
+
+void
+pgstat_drop_all_entries(void)
+{
+	dshash_seq_status hstat;
+	PgStatShared_HashEntry *ps;
+	uint64		not_freed_count = 0;
+
+	dshash_seq_init(&hstat, pgStatLocal.shared_hash, true);
+	while ((ps = dshash_seq_next(&hstat)) != NULL)
+	{
+		if (ps->dropped)
+			continue;
+
+		if (!pgstat_drop_entry_internal(ps, &hstat))
+			not_freed_count++;
+	}
+	dshash_seq_term(&hstat);
+
+	if (not_freed_count > 0)
+		pgstat_request_entry_refs_gc();
+}
+
+static void
+shared_stat_reset_contents(PgStat_Kind kind, PgStatShared_Common *header,
+						   TimestampTz ts)
+{
+	const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind);
+
+	memset(pgstat_get_entry_data(kind, header), 0,
+		   pgstat_get_entry_len(kind));
+
+	if (kind_info->reset_timestamp_cb)
+		kind_info->reset_timestamp_cb(header, ts);
+}
+
+/*
+ * Reset one variable-numbered stats entry.
+ */
+void
+pgstat_reset_entry(PgStat_Kind kind, Oid dboid, Oid objoid, TimestampTz ts)
+{
+	PgStat_EntryRef *entry_ref;
+
+	Assert(!pgstat_get_kind_info(kind)->fixed_amount);
+
+	entry_ref = pgstat_get_entry_ref(kind, dboid, objoid, false, NULL);
+	if (!entry_ref || entry_ref->shared_entry->dropped)
+		return;
+
+	(void) pgstat_lock_entry(entry_ref, false);
+	shared_stat_reset_contents(kind, entry_ref->shared_stats, ts);
+	pgstat_unlock_entry(entry_ref);
+}
+
+/*
+ * Scan through the shared hashtable of stats, resetting statistics if
+ * approved by the provided do_reset() function.
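+ *
+ * [Editor's note -- illustrative sketch, not part of the upstream commit]
+ * The do_reset() callback receives the shared hash entry plus the caller's
+ * match_data, mirroring match_kind() below.  A hypothetical predicate that
+ * approves only one database's entries:
+ *
+ *		static bool
+ *		example_match_dboid(PgStatShared_HashEntry *p, Datum match_data)
+ *		{
+ *			return p->key.dboid == DatumGetObjectId(match_data);
+ *		}
+ *
+ * invoked as pgstat_reset_matching_entries(example_match_dboid,
+ * ObjectIdGetDatum(dboid), GetCurrentTimestamp()).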
+ */ +void +pgstat_reset_matching_entries(bool (*do_reset) (PgStatShared_HashEntry *, Datum), + Datum match_data, TimestampTz ts) +{ + dshash_seq_status hstat; + PgStatShared_HashEntry *p; + + /* dshash entry is not modified, take shared lock */ + dshash_seq_init(&hstat, pgStatLocal.shared_hash, false); + while ((p = dshash_seq_next(&hstat)) != NULL) + { + PgStatShared_Common *header; + + if (p->dropped) + continue; + + if (!do_reset(p, match_data)) + continue; + + header = dsa_get_address(pgStatLocal.dsa, p->body); + + LWLockAcquire(&header->lock, LW_EXCLUSIVE); + + shared_stat_reset_contents(p->key.kind, header, ts); + + LWLockRelease(&header->lock); + } + dshash_seq_term(&hstat); +} + +static bool +match_kind(PgStatShared_HashEntry *p, Datum match_data) +{ + return p->key.kind == DatumGetInt32(match_data); +} + +void +pgstat_reset_entries_of_kind(PgStat_Kind kind, TimestampTz ts) +{ + pgstat_reset_matching_entries(match_kind, Int32GetDatum(kind), ts); +} + +static void +pgstat_setup_memcxt(void) +{ + if (unlikely(!pgStatSharedRefContext)) + pgStatSharedRefContext = + AllocSetContextCreate(TopMemoryContext, + "PgStat Shared Ref", + ALLOCSET_SMALL_SIZES); + if (unlikely(!pgStatEntryRefHashContext)) + pgStatEntryRefHashContext = + AllocSetContextCreate(TopMemoryContext, + "PgStat Shared Ref Hash", + ALLOCSET_SMALL_SIZES); +} diff --git a/src/backend/utils/activity/pgstat_slru.c b/src/backend/utils/activity/pgstat_slru.c new file mode 100644 index 0000000..28ef736 --- /dev/null +++ b/src/backend/utils/activity/pgstat_slru.c @@ -0,0 +1,248 @@ +/* ------------------------------------------------------------------------- + * + * pgstat_slru.c + * Implementation of SLRU statistics. + * + * This file contains the implementation of SLRU statistics. It is kept + * separate from pgstat.c to enforce the line between the statistics access / + * storage implementation and the details about individual types of + * statistics. + * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/activity/pgstat_slru.c + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "utils/pgstat_internal.h" +#include "utils/timestamp.h" + + +static inline PgStat_SLRUStats *get_slru_entry(int slru_idx); +static void pgstat_reset_slru_counter_internal(int index, TimestampTz ts); + + +/* + * SLRU statistics counts waiting to be flushed out. We assume this variable + * inits to zeroes. Entries are one-to-one with slru_names[]. Changes of + * SLRU counters are reported within critical sections so we use static memory + * in order to avoid memory allocation. + */ +static PgStat_SLRUStats pending_SLRUStats[SLRU_NUM_ELEMENTS]; +bool have_slrustats = false; + + +/* + * Reset counters for a single SLRU. + * + * Permission checking for this function is managed through the normal + * GRANT system. 
+ */
+void
+pgstat_reset_slru(const char *name)
+{
+	TimestampTz ts = GetCurrentTimestamp();
+
+	AssertArg(name != NULL);
+
+	pgstat_reset_slru_counter_internal(pgstat_get_slru_index(name), ts);
+}
+
+/*
+ * SLRU statistics count accumulation functions --- called from slru.c
+ */
+
+void
+pgstat_count_slru_page_zeroed(int slru_idx)
+{
+	get_slru_entry(slru_idx)->blocks_zeroed += 1;
+}
+
+void
+pgstat_count_slru_page_hit(int slru_idx)
+{
+	get_slru_entry(slru_idx)->blocks_hit += 1;
+}
+
+void
+pgstat_count_slru_page_exists(int slru_idx)
+{
+	get_slru_entry(slru_idx)->blocks_exists += 1;
+}
+
+void
+pgstat_count_slru_page_read(int slru_idx)
+{
+	get_slru_entry(slru_idx)->blocks_read += 1;
+}
+
+void
+pgstat_count_slru_page_written(int slru_idx)
+{
+	get_slru_entry(slru_idx)->blocks_written += 1;
+}
+
+void
+pgstat_count_slru_flush(int slru_idx)
+{
+	get_slru_entry(slru_idx)->flush += 1;
+}
+
+void
+pgstat_count_slru_truncate(int slru_idx)
+{
+	get_slru_entry(slru_idx)->truncate += 1;
+}
+
+/*
+ * Support function for the SQL-callable pgstat* functions. Returns
+ * a pointer to the SLRU statistics struct.
+ */
+PgStat_SLRUStats *
+pgstat_fetch_slru(void)
+{
+	pgstat_snapshot_fixed(PGSTAT_KIND_SLRU);
+
+	return pgStatLocal.snapshot.slru;
+}
+
+/*
+ * Returns the SLRU name for an index. The index may be above
+ * SLRU_NUM_ELEMENTS, in which case this returns NULL. This allows writing
+ * code that does not know the number of entries in advance.
+ */
+const char *
+pgstat_get_slru_name(int slru_idx)
+{
+	if (slru_idx < 0 || slru_idx >= SLRU_NUM_ELEMENTS)
+		return NULL;
+
+	return slru_names[slru_idx];
+}
+
+/*
+ * Determine the index of the entry for an SLRU with a given name. If there's
+ * no exact match, returns the index of the last entry, the "other" entry used
+ * for SLRUs defined in external projects.
+ */
+int
+pgstat_get_slru_index(const char *name)
+{
+	int			i;
+
+	for (i = 0; i < SLRU_NUM_ELEMENTS; i++)
+	{
+		if (strcmp(slru_names[i], name) == 0)
+			return i;
+	}
+
+	/* return index of the last entry (which is the "other" one) */
+	return (SLRU_NUM_ELEMENTS - 1);
+}
+
+/*
+ * Flush out locally pending SLRU stats entries.
+ *
+ * If nowait is true, this function returns true if the lock could not be
+ * acquired. Otherwise returns false (also when there was nothing to flush).
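+ *
+ * [Editor's note -- illustrative sketch, not part of the upstream commit]
+ * This is the nowait contract shared by the flush callbacks in this
+ * commit: "true" means the lock was contended and some data remains
+ * pending.  A non-blocking caller, in the style of pgstat_report_stat(),
+ * would accumulate that signal and retry later:
+ *
+ *		partial_flush |= pgstat_slru_flush(true);	// never sleeps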
+ */ +bool +pgstat_slru_flush(bool nowait) +{ + PgStatShared_SLRU *stats_shmem = &pgStatLocal.shmem->slru; + int i; + + if (!have_slrustats) + return false; + + if (!nowait) + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + else if (!LWLockConditionalAcquire(&stats_shmem->lock, LW_EXCLUSIVE)) + return true; + + for (i = 0; i < SLRU_NUM_ELEMENTS; i++) + { + PgStat_SLRUStats *sharedent = &stats_shmem->stats[i]; + PgStat_SLRUStats *pendingent = &pending_SLRUStats[i]; + +#define SLRU_ACC(fld) sharedent->fld += pendingent->fld + SLRU_ACC(blocks_zeroed); + SLRU_ACC(blocks_hit); + SLRU_ACC(blocks_read); + SLRU_ACC(blocks_written); + SLRU_ACC(blocks_exists); + SLRU_ACC(flush); + SLRU_ACC(truncate); +#undef SLRU_ACC + } + + /* done, clear the pending entry */ + MemSet(pending_SLRUStats, 0, sizeof(pending_SLRUStats)); + + LWLockRelease(&stats_shmem->lock); + + have_slrustats = false; + + return false; +} + +void +pgstat_slru_reset_all_cb(TimestampTz ts) +{ + for (int i = 0; i < SLRU_NUM_ELEMENTS; i++) + pgstat_reset_slru_counter_internal(i, ts); +} + +void +pgstat_slru_snapshot_cb(void) +{ + PgStatShared_SLRU *stats_shmem = &pgStatLocal.shmem->slru; + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + + memcpy(pgStatLocal.snapshot.slru, &stats_shmem->stats, + sizeof(stats_shmem->stats)); + + LWLockRelease(&stats_shmem->lock); +} + +/* + * Returns pointer to entry with counters for given SLRU (based on the name + * stored in SlruCtl as lwlock tranche name). + */ +static inline PgStat_SLRUStats * +get_slru_entry(int slru_idx) +{ + pgstat_assert_is_up(); + + /* + * The postmaster should never register any SLRU statistics counts; if it + * did, the counts would be duplicated into child processes via fork(). + */ + Assert(IsUnderPostmaster || !IsPostmasterEnvironment); + + Assert((slru_idx >= 0) && (slru_idx < SLRU_NUM_ELEMENTS)); + + have_slrustats = true; + + return &pending_SLRUStats[slru_idx]; +} + +static void +pgstat_reset_slru_counter_internal(int index, TimestampTz ts) +{ + PgStatShared_SLRU *stats_shmem = &pgStatLocal.shmem->slru; + + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + + memset(&stats_shmem->stats[index], 0, sizeof(PgStat_SLRUStats)); + stats_shmem->stats[index].stat_reset_timestamp = ts; + + LWLockRelease(&stats_shmem->lock); +} diff --git a/src/backend/utils/activity/pgstat_subscription.c b/src/backend/utils/activity/pgstat_subscription.c new file mode 100644 index 0000000..e1072bd --- /dev/null +++ b/src/backend/utils/activity/pgstat_subscription.c @@ -0,0 +1,110 @@ +/* ------------------------------------------------------------------------- + * + * pgstat_subscription.c + * Implementation of subscription statistics. + * + * This file contains the implementation of subscription statistics. It is kept + * separate from pgstat.c to enforce the line between the statistics access / + * storage implementation and the details about individual types of + * statistics. + * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/activity/pgstat_subscription.c + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "utils/pgstat_internal.h" + + +/* + * Report a subscription error. 
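+ *
+ * [Editor's note -- illustrative sketch, not part of the upstream commit]
+ * Callers are the logical replication workers; e.g. an apply worker's
+ * error handler would count its failure roughly as:
+ *
+ *		pgstat_report_subscription_error(subid, true);	// true = apply,
+ *														// false = tablesync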
+ */
+void
+pgstat_report_subscription_error(Oid subid, bool is_apply_error)
+{
+	PgStat_EntryRef *entry_ref;
+	PgStat_BackendSubEntry *pending;
+
+	entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_SUBSCRIPTION,
+										  InvalidOid, subid, NULL);
+	pending = entry_ref->pending;
+
+	if (is_apply_error)
+		pending->apply_error_count++;
+	else
+		pending->sync_error_count++;
+}
+
+/*
+ * Report creating the subscription.
+ *
+ * Ensures that stats are dropped if the transaction rolls back.
+ */
+void
+pgstat_create_subscription(Oid subid)
+{
+	pgstat_create_transactional(PGSTAT_KIND_SUBSCRIPTION,
+								InvalidOid, subid);
+}
+
+/*
+ * Report dropping the subscription.
+ *
+ * Ensures that stats are dropped if the transaction commits.
+ */
+void
+pgstat_drop_subscription(Oid subid)
+{
+	pgstat_drop_transactional(PGSTAT_KIND_SUBSCRIPTION,
+							  InvalidOid, subid);
+}
+
+/*
+ * Support function for the SQL-callable pgstat* functions. Returns
+ * the collected statistics for one subscription, or NULL.
+ */
+PgStat_StatSubEntry *
+pgstat_fetch_stat_subscription(Oid subid)
+{
+	return (PgStat_StatSubEntry *)
+		pgstat_fetch_entry(PGSTAT_KIND_SUBSCRIPTION, InvalidOid, subid);
+}
+
+/*
+ * Flush out pending stats for the entry.
+ *
+ * If nowait is true, this function returns false if the lock could not be
+ * acquired immediately; otherwise true is returned.
+ */
+bool
+pgstat_subscription_flush_cb(PgStat_EntryRef *entry_ref, bool nowait)
+{
+	PgStat_BackendSubEntry *localent;
+	PgStatShared_Subscription *shsubent;
+
+	localent = (PgStat_BackendSubEntry *) entry_ref->pending;
+	shsubent = (PgStatShared_Subscription *) entry_ref->shared_stats;
+
+	/* localent always has non-zero content */
+
+	if (!pgstat_lock_entry(entry_ref, nowait))
+		return false;
+
+#define SUB_ACC(fld) shsubent->stats.fld += localent->fld
+	SUB_ACC(apply_error_count);
+	SUB_ACC(sync_error_count);
+#undef SUB_ACC
+
+	pgstat_unlock_entry(entry_ref);
+	return true;
+}
+
+void
+pgstat_subscription_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts)
+{
+	((PgStatShared_Subscription *) header)->stats.stat_reset_timestamp = ts;
+}
diff --git a/src/backend/utils/activity/pgstat_wal.c b/src/backend/utils/activity/pgstat_wal.c
new file mode 100644
index 0000000..305a925
--- /dev/null
+++ b/src/backend/utils/activity/pgstat_wal.c
@@ -0,0 +1,182 @@
+/* -------------------------------------------------------------------------
+ *
+ * pgstat_wal.c
+ *	  Implementation of WAL statistics.
+ *
+ * This file contains the implementation of WAL statistics. It is kept
+ * separate from pgstat.c to enforce the line between the statistics access /
+ * storage implementation and the details about individual types of
+ * statistics.
+ *
+ * Copyright (c) 2001-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/activity/pgstat_wal.c
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/pgstat_internal.h"
+#include "executor/instrument.h"
+
+
+PgStat_WalStats PendingWalStats = {0};
+
+/*
+ * WAL usage counters saved from pgWalUsage at the previous call to
+ * pgstat_report_wal(). This is used to calculate how much WAL usage
+ * happens between pgstat_report_wal() calls, by subtracting
+ * the previous counters from the current ones.
+ */
+static WalUsage prevWalUsage;
+
+
+/*
+ * Calculate how much the WAL usage counters have increased and update the
+ * shared statistics.
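+ *
+ * [Editor's note -- illustrative sketch, not part of the upstream commit]
+ * E.g. the walwriter can flush opportunistically in its main loop and
+ * force a flush on shutdown:
+ *
+ *		pgstat_report_wal(false);	// loop: skip if the lock is contended
+ *		pgstat_report_wal(true);	// exit: wait for the lock, flush all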
+ *
+ * Must be called by processes that generate WAL but do not call
+ * pgstat_report_stat(), like the walwriter.
+ *
+ * "force" set to true ensures that the statistics are flushed; note that
+ * this needs to acquire the pgstat shmem LWLock, waiting on it. When
+ * set to false, the statistics may not be flushed if the lock could not
+ * be acquired.
+ */
+void
+pgstat_report_wal(bool force)
+{
+	bool		nowait;
+
+	/* like in pgstat.c, don't wait for lock acquisition when !force */
+	nowait = !force;
+
+	/* flush wal stats */
+	pgstat_flush_wal(nowait);
+}
+
+/*
+ * Support function for the SQL-callable pgstat* functions. Returns
+ * a pointer to the WAL statistics struct.
+ */
+PgStat_WalStats *
+pgstat_fetch_stat_wal(void)
+{
+	pgstat_snapshot_fixed(PGSTAT_KIND_WAL);
+
+	return &pgStatLocal.snapshot.wal;
+}
+
+/*
+ * Calculate how much the WAL usage counters have increased by subtracting
+ * the previous counters from the current ones, and flush the result out to
+ * shared memory.
+ *
+ * If nowait is true, this function returns true if the lock could not be
+ * acquired. Otherwise returns false.
+ */
+bool
+pgstat_flush_wal(bool nowait)
+{
+	PgStatShared_Wal *stats_shmem = &pgStatLocal.shmem->wal;
+	WalUsage	diff = {0};
+
+	Assert(IsUnderPostmaster || !IsPostmasterEnvironment);
+	Assert(pgStatLocal.shmem != NULL &&
+		   !pgStatLocal.shmem->is_shutdown);
+
+	/*
+	 * This function can be called even if nothing at all has happened. Avoid
+	 * taking the lock for nothing in that case.
+	 */
+	if (!pgstat_have_pending_wal())
+		return false;
+
+	/*
+	 * We don't update the WAL usage portion of the local WalStats elsewhere.
+	 * Calculate how much the WAL usage counters have increased by subtracting
+	 * the previous counters from the current ones.
+	 */
+	WalUsageAccumDiff(&diff, &pgWalUsage, &prevWalUsage);
+	PendingWalStats.wal_records = diff.wal_records;
+	PendingWalStats.wal_fpi = diff.wal_fpi;
+	PendingWalStats.wal_bytes = diff.wal_bytes;
+
+	if (!nowait)
+		LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE);
+	else if (!LWLockConditionalAcquire(&stats_shmem->lock, LW_EXCLUSIVE))
+		return true;
+
+#define WALSTAT_ACC(fld) stats_shmem->stats.fld += PendingWalStats.fld
+	WALSTAT_ACC(wal_records);
+	WALSTAT_ACC(wal_fpi);
+	WALSTAT_ACC(wal_bytes);
+	WALSTAT_ACC(wal_buffers_full);
+	WALSTAT_ACC(wal_write);
+	WALSTAT_ACC(wal_sync);
+	WALSTAT_ACC(wal_write_time);
+	WALSTAT_ACC(wal_sync_time);
+#undef WALSTAT_ACC
+
+	LWLockRelease(&stats_shmem->lock);
+
+	/*
+	 * Save the current counters for the subsequent calculation of WAL usage.
+	 */
+	prevWalUsage = pgWalUsage;
+
+	/*
+	 * Clear out the statistics buffer, so it can be re-used.
+	 */
+	MemSet(&PendingWalStats, 0, sizeof(PendingWalStats));
+
+	return false;
+}
+
+void
+pgstat_init_wal(void)
+{
+	/*
+	 * Initialize prevWalUsage with pgWalUsage so that pgstat_flush_wal() can
+	 * calculate how much the pgWalUsage counters have increased by
+	 * subtracting prevWalUsage from pgWalUsage.
+	 */
+	prevWalUsage = pgWalUsage;
+}
+
+/*
+ * To determine whether any WAL activity has occurred since last time, not
+ * only the number of generated WAL records but also the numbers of WAL
+ * writes and syncs need to be checked, because even a transaction that
+ * generates no WAL records can write or sync WAL data when flushing data
+ * pages.
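+ *
+ * [Editor's note -- illustrative sketch, not part of the upstream commit]
+ * WalUsageAccumDiff() from executor/instrument.h, used in the flush above,
+ * accumulates (add - sub) field by field into dst; with a zeroed dst it is
+ * plain subtraction:
+ *
+ *		WalUsage	diff = {0};
+ *
+ *		WalUsageAccumDiff(&diff, &pgWalUsage, &prevWalUsage);
+ *		// diff.wal_records > 0 iff records were generated since last flush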
+ */ +bool +pgstat_have_pending_wal(void) +{ + return pgWalUsage.wal_records != prevWalUsage.wal_records || + PendingWalStats.wal_write != 0 || + PendingWalStats.wal_sync != 0; +} + +void +pgstat_wal_reset_all_cb(TimestampTz ts) +{ + PgStatShared_Wal *stats_shmem = &pgStatLocal.shmem->wal; + + LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE); + memset(&stats_shmem->stats, 0, sizeof(stats_shmem->stats)); + stats_shmem->stats.stat_reset_timestamp = ts; + LWLockRelease(&stats_shmem->lock); +} + +void +pgstat_wal_snapshot_cb(void) +{ + PgStatShared_Wal *stats_shmem = &pgStatLocal.shmem->wal; + + LWLockAcquire(&stats_shmem->lock, LW_SHARED); + memcpy(&pgStatLocal.snapshot.wal, &stats_shmem->stats, + sizeof(pgStatLocal.snapshot.wal)); + LWLockRelease(&stats_shmem->lock); +} diff --git a/src/backend/utils/activity/pgstat_xact.c b/src/backend/utils/activity/pgstat_xact.c new file mode 100644 index 0000000..d6f660e --- /dev/null +++ b/src/backend/utils/activity/pgstat_xact.c @@ -0,0 +1,391 @@ +/* ------------------------------------------------------------------------- + * + * pgstat_xact.c + * Transactional integration for the cumulative statistics system. + * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/activity/pgstat_xact.c + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/transam.h" +#include "access/xact.h" +#include "pgstat.h" +#include "utils/memutils.h" +#include "utils/pgstat_internal.h" + + +typedef struct PgStat_PendingDroppedStatsItem +{ + xl_xact_stats_item item; + bool is_create; + dlist_node node; +} PgStat_PendingDroppedStatsItem; + + +static void AtEOXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, bool isCommit); +static void AtEOSubXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, + bool isCommit, int nestDepth); + +static PgStat_SubXactStatus *pgStatXactStack = NULL; + + +/* + * Called from access/transam/xact.c at top-level transaction commit/abort. + */ +void +AtEOXact_PgStat(bool isCommit, bool parallel) +{ + PgStat_SubXactStatus *xact_state; + + AtEOXact_PgStat_Database(isCommit, parallel); + + /* handle transactional stats information */ + xact_state = pgStatXactStack; + if (xact_state != NULL) + { + Assert(xact_state->nest_level == 1); + Assert(xact_state->prev == NULL); + + AtEOXact_PgStat_Relations(xact_state, isCommit); + AtEOXact_PgStat_DroppedStats(xact_state, isCommit); + } + pgStatXactStack = NULL; + + /* Make sure any stats snapshot is thrown away */ + pgstat_clear_snapshot(); +} + +/* + * When committing, drop stats for objects dropped in the transaction. When + * aborting, drop stats for objects created in the transaction. + */ +static void +AtEOXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, bool isCommit) +{ + dlist_mutable_iter iter; + int not_freed_count = 0; + + if (xact_state->pending_drops_count == 0) + { + Assert(dlist_is_empty(&xact_state->pending_drops)); + return; + } + + dlist_foreach_modify(iter, &xact_state->pending_drops) + { + PgStat_PendingDroppedStatsItem *pending = + dlist_container(PgStat_PendingDroppedStatsItem, node, iter.cur); + xl_xact_stats_item *it = &pending->item; + + if (isCommit && !pending->is_create) + { + /* + * Transaction that dropped an object committed. Drop the stats + * too. 
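+			 *
+			 * [Editor's note -- not part of the upstream commit] This
+			 * branch and the one below form a two-case decision table;
+			 * as a single predicate:
+			 *
+			 *		drop = isCommit ? !pending->is_create : pending->is_create;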
+ */ + if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid)) + not_freed_count++; + } + else if (!isCommit && pending->is_create) + { + /* + * Transaction that created an object aborted. Drop the stats + * associated with the object. + */ + if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid)) + not_freed_count++; + } + + dlist_delete(&pending->node); + xact_state->pending_drops_count--; + pfree(pending); + } + + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); +} + +/* + * Called from access/transam/xact.c at subtransaction commit/abort. + */ +void +AtEOSubXact_PgStat(bool isCommit, int nestDepth) +{ + PgStat_SubXactStatus *xact_state; + + /* merge the sub-transaction's transactional stats into the parent */ + xact_state = pgStatXactStack; + if (xact_state != NULL && + xact_state->nest_level >= nestDepth) + { + /* delink xact_state from stack immediately to simplify reuse case */ + pgStatXactStack = xact_state->prev; + + AtEOSubXact_PgStat_Relations(xact_state, isCommit, nestDepth); + AtEOSubXact_PgStat_DroppedStats(xact_state, isCommit, nestDepth); + + pfree(xact_state); + } +} + +/* + * Like AtEOXact_PgStat_DroppedStats(), but for subtransactions. + */ +static void +AtEOSubXact_PgStat_DroppedStats(PgStat_SubXactStatus *xact_state, + bool isCommit, int nestDepth) +{ + PgStat_SubXactStatus *parent_xact_state; + dlist_mutable_iter iter; + int not_freed_count = 0; + + if (xact_state->pending_drops_count == 0) + return; + + parent_xact_state = pgstat_get_xact_stack_level(nestDepth - 1); + + dlist_foreach_modify(iter, &xact_state->pending_drops) + { + PgStat_PendingDroppedStatsItem *pending = + dlist_container(PgStat_PendingDroppedStatsItem, node, iter.cur); + xl_xact_stats_item *it = &pending->item; + + dlist_delete(&pending->node); + xact_state->pending_drops_count--; + + if (!isCommit && pending->is_create) + { + /* + * Subtransaction creating a new stats object aborted. Drop the + * stats object. + */ + if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid)) + not_freed_count++; + pfree(pending); + } + else if (isCommit) + { + /* + * Subtransaction dropping a stats object committed. Can't yet + * remove the stats object, the surrounding transaction might + * still abort. Pass it on to the parent. + */ + dlist_push_tail(&parent_xact_state->pending_drops, &pending->node); + parent_xact_state->pending_drops_count++; + } + else + { + pfree(pending); + } + } + + Assert(xact_state->pending_drops_count == 0); + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); +} + +/* + * Save the transactional stats state at 2PC transaction prepare. + */ +void +AtPrepare_PgStat(void) +{ + PgStat_SubXactStatus *xact_state; + + xact_state = pgStatXactStack; + if (xact_state != NULL) + { + Assert(xact_state->nest_level == 1); + Assert(xact_state->prev == NULL); + + AtPrepare_PgStat_Relations(xact_state); + } +} + +/* + * Clean up after successful PREPARE. + * + * Note: AtEOXact_PgStat is not called during PREPARE. + */ +void +PostPrepare_PgStat(void) +{ + PgStat_SubXactStatus *xact_state; + + /* + * We don't bother to free any of the transactional state, since it's all + * in TopTransactionContext and will go away anyway. 
+ */ + xact_state = pgStatXactStack; + if (xact_state != NULL) + { + Assert(xact_state->nest_level == 1); + Assert(xact_state->prev == NULL); + + PostPrepare_PgStat_Relations(xact_state); + } + pgStatXactStack = NULL; + + /* Make sure any stats snapshot is thrown away */ + pgstat_clear_snapshot(); +} + +/* + * Ensure (sub)transaction stack entry for the given nest_level exists, adding + * it if needed. + */ +PgStat_SubXactStatus * +pgstat_get_xact_stack_level(int nest_level) +{ + PgStat_SubXactStatus *xact_state; + + xact_state = pgStatXactStack; + if (xact_state == NULL || xact_state->nest_level != nest_level) + { + xact_state = (PgStat_SubXactStatus *) + MemoryContextAlloc(TopTransactionContext, + sizeof(PgStat_SubXactStatus)); + dlist_init(&xact_state->pending_drops); + xact_state->pending_drops_count = 0; + xact_state->nest_level = nest_level; + xact_state->prev = pgStatXactStack; + xact_state->first = NULL; + pgStatXactStack = xact_state; + } + return xact_state; +} + +/* + * Get stat items that need to be dropped at commit / abort. + * + * When committing, stats for objects that have been dropped in the + * transaction are returned. When aborting, stats for newly created objects are + * returned. + * + * Used by COMMIT / ABORT and 2PC PREPARE processing when building their + * respective WAL records, to ensure stats are dropped in case of a crash / on + * standbys. + * + * The list of items is allocated in CurrentMemoryContext and must be freed by + * the caller (directly or via memory context reset). + */ +int +pgstat_get_transactional_drops(bool isCommit, xl_xact_stats_item **items) +{ + PgStat_SubXactStatus *xact_state = pgStatXactStack; + int nitems = 0; + dlist_iter iter; + + if (xact_state == NULL) + return 0; + + /* + * We expect to be called for subtransaction abort (which logs a WAL + * record), but not for subtransaction commit (which doesn't). + */ + Assert(!isCommit || xact_state->nest_level == 1); + Assert(!isCommit || xact_state->prev == NULL); + + *items = palloc(xact_state->pending_drops_count + * sizeof(xl_xact_stats_item)); + + dlist_foreach(iter, &xact_state->pending_drops) + { + PgStat_PendingDroppedStatsItem *pending = + dlist_container(PgStat_PendingDroppedStatsItem, node, iter.cur); + + if (isCommit && pending->is_create) + continue; + if (!isCommit && !pending->is_create) + continue; + + Assert(nitems < xact_state->pending_drops_count); + (*items)[nitems++] = pending->item; + } + + return nitems; +} + +/* + * Execute scheduled drops post-commit. Called from xact_redo_commit() / + * xact_redo_abort() during recovery, and from FinishPreparedTransaction() + * during normal 2PC COMMIT/ABORT PREPARED processing. 
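+ *
+ * [Editor's note -- illustrative sketch, not part of the upstream commit]
+ * The items replayed here are the ones the record builder collected via
+ * pgstat_get_transactional_drops() above:
+ *
+ *		xl_xact_stats_item *items = NULL;
+ *		int			ndrops = pgstat_get_transactional_drops(isCommit, &items);
+ *
+ *		// ... attach ndrops/items to the commit/abort record; recovery
+ *		// then calls pgstat_execute_transactional_drops(ndrops, items, true)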
+ */ +void +pgstat_execute_transactional_drops(int ndrops, struct xl_xact_stats_item *items, bool is_redo) +{ + int not_freed_count = 0; + + if (ndrops == 0) + return; + + for (int i = 0; i < ndrops; i++) + { + xl_xact_stats_item *it = &items[i]; + + if (!pgstat_drop_entry(it->kind, it->dboid, it->objoid)) + not_freed_count++; + } + + if (not_freed_count > 0) + pgstat_request_entry_refs_gc(); +} + +static void +create_drop_transactional_internal(PgStat_Kind kind, Oid dboid, Oid objoid, bool is_create) +{ + int nest_level = GetCurrentTransactionNestLevel(); + PgStat_SubXactStatus *xact_state; + PgStat_PendingDroppedStatsItem *drop = (PgStat_PendingDroppedStatsItem *) + MemoryContextAlloc(TopTransactionContext, sizeof(PgStat_PendingDroppedStatsItem)); + + xact_state = pgstat_get_xact_stack_level(nest_level); + + drop->is_create = is_create; + drop->item.kind = kind; + drop->item.dboid = dboid; + drop->item.objoid = objoid; + + dlist_push_tail(&xact_state->pending_drops, &drop->node); + xact_state->pending_drops_count++; +} + +/* + * Create a stats entry for a newly created database object in a transactional + * manner. + * + * I.e. if the current (sub-)transaction aborts, the stats entry will also be + * dropped. + */ +void +pgstat_create_transactional(PgStat_Kind kind, Oid dboid, Oid objoid) +{ + if (pgstat_get_entry_ref(kind, dboid, objoid, false, NULL)) + { + ereport(WARNING, + errmsg("resetting existing statistics for kind %s, db=%u, oid=%u", + (pgstat_get_kind_info(kind))->name, dboid, objoid)); + + pgstat_reset(kind, dboid, objoid); + } + + create_drop_transactional_internal(kind, dboid, objoid, /* create */ true); +} + +/* + * Drop a stats entry for a just dropped database object in a transactional + * manner. + * + * I.e. if the current (sub-)transaction aborts, the stats entry will stay + * alive. + */ +void +pgstat_drop_transactional(PgStat_Kind kind, Oid dboid, Oid objoid) +{ + create_drop_transactional_internal(kind, dboid, objoid, /* create */ false); +} diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c new file mode 100644 index 0000000..87c15b9 --- /dev/null +++ b/src/backend/utils/activity/wait_event.c @@ -0,0 +1,749 @@ +/* ---------- + * wait_event.c + * Wait event reporting infrastructure. + * + * Copyright (c) 2001-2022, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/utils/activity/wait_event.c + * + * NOTES + * + * To make pgstat_report_wait_start() and pgstat_report_wait_end() as + * lightweight as possible, they do not check if shared memory (MyProc + * specifically, where the wait event is stored) is already available. Instead + * we initially set my_wait_event_info to a process local variable, which then + * is redirected to shared memory using pgstat_set_wait_event_storage(). For + * the same reason pgstat_track_activities is not checked - the check adds + * more work than it saves. 
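+ *
+ * [Editor's note -- illustrative sketch, not part of the upstream commit]
+ * The fast path this buys is the bracketing idiom used around blocking
+ * operations, via the inline helpers declared in utils/wait_event.h:
+ *
+ *		pgstat_report_wait_start(WAIT_EVENT_DATA_FILE_READ);
+ *		// ... blocking read() of a relation file ...
+ *		pgstat_report_wait_end();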
+ * + * ---------- + */ +#include "postgres.h" + +#include "storage/lmgr.h" /* for GetLockNameFromTagType */ +#include "storage/lwlock.h" /* for GetLWLockIdentifier */ +#include "utils/wait_event.h" + + +static const char *pgstat_get_wait_activity(WaitEventActivity w); +static const char *pgstat_get_wait_client(WaitEventClient w); +static const char *pgstat_get_wait_ipc(WaitEventIPC w); +static const char *pgstat_get_wait_timeout(WaitEventTimeout w); +static const char *pgstat_get_wait_io(WaitEventIO w); + + +static uint32 local_my_wait_event_info; +uint32 *my_wait_event_info = &local_my_wait_event_info; + + +/* + * Configure wait event reporting to report wait events to *wait_event_info. + * *wait_event_info needs to be valid until pgstat_reset_wait_event_storage() + * is called. + * + * Expected to be called during backend startup, to point my_wait_event_info + * into shared memory. + */ +void +pgstat_set_wait_event_storage(uint32 *wait_event_info) +{ + my_wait_event_info = wait_event_info; +} + +/* + * Reset wait event storage location. + * + * Expected to be called during backend shutdown, before the location set up + * pgstat_set_wait_event_storage() becomes invalid. + */ +void +pgstat_reset_wait_event_storage(void) +{ + my_wait_event_info = &local_my_wait_event_info; +} + +/* ---------- + * pgstat_get_wait_event_type() - + * + * Return a string representing the current wait event type, backend is + * waiting on. + */ +const char * +pgstat_get_wait_event_type(uint32 wait_event_info) +{ + uint32 classId; + const char *event_type; + + /* report process as not waiting. */ + if (wait_event_info == 0) + return NULL; + + classId = wait_event_info & 0xFF000000; + + switch (classId) + { + case PG_WAIT_LWLOCK: + event_type = "LWLock"; + break; + case PG_WAIT_LOCK: + event_type = "Lock"; + break; + case PG_WAIT_BUFFER_PIN: + event_type = "BufferPin"; + break; + case PG_WAIT_ACTIVITY: + event_type = "Activity"; + break; + case PG_WAIT_CLIENT: + event_type = "Client"; + break; + case PG_WAIT_EXTENSION: + event_type = "Extension"; + break; + case PG_WAIT_IPC: + event_type = "IPC"; + break; + case PG_WAIT_TIMEOUT: + event_type = "Timeout"; + break; + case PG_WAIT_IO: + event_type = "IO"; + break; + default: + event_type = "???"; + break; + } + + return event_type; +} + +/* ---------- + * pgstat_get_wait_event() - + * + * Return a string representing the current wait event, backend is + * waiting on. + */ +const char * +pgstat_get_wait_event(uint32 wait_event_info) +{ + uint32 classId; + uint16 eventId; + const char *event_name; + + /* report process as not waiting. 
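+	 *
+	 * [Editor's note -- not part of the upstream commit] As in
+	 * pgstat_get_wait_event_type() above, wait_event_info packs the class
+	 * into the top byte and the event into the low 16 bits; the middle
+	 * byte is currently unused.  Hence the two masks below:
+	 *
+	 *		classId = wait_event_info & 0xFF000000;
+	 *		eventId = wait_event_info & 0x0000FFFF;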
*/ + if (wait_event_info == 0) + return NULL; + + classId = wait_event_info & 0xFF000000; + eventId = wait_event_info & 0x0000FFFF; + + switch (classId) + { + case PG_WAIT_LWLOCK: + event_name = GetLWLockIdentifier(classId, eventId); + break; + case PG_WAIT_LOCK: + event_name = GetLockNameFromTagType(eventId); + break; + case PG_WAIT_BUFFER_PIN: + event_name = "BufferPin"; + break; + case PG_WAIT_ACTIVITY: + { + WaitEventActivity w = (WaitEventActivity) wait_event_info; + + event_name = pgstat_get_wait_activity(w); + break; + } + case PG_WAIT_CLIENT: + { + WaitEventClient w = (WaitEventClient) wait_event_info; + + event_name = pgstat_get_wait_client(w); + break; + } + case PG_WAIT_EXTENSION: + event_name = "Extension"; + break; + case PG_WAIT_IPC: + { + WaitEventIPC w = (WaitEventIPC) wait_event_info; + + event_name = pgstat_get_wait_ipc(w); + break; + } + case PG_WAIT_TIMEOUT: + { + WaitEventTimeout w = (WaitEventTimeout) wait_event_info; + + event_name = pgstat_get_wait_timeout(w); + break; + } + case PG_WAIT_IO: + { + WaitEventIO w = (WaitEventIO) wait_event_info; + + event_name = pgstat_get_wait_io(w); + break; + } + default: + event_name = "unknown wait event"; + break; + } + + return event_name; +} + +/* ---------- + * pgstat_get_wait_activity() - + * + * Convert WaitEventActivity to string. + * ---------- + */ +static const char * +pgstat_get_wait_activity(WaitEventActivity w) +{ + const char *event_name = "unknown wait event"; + + switch (w) + { + case WAIT_EVENT_ARCHIVER_MAIN: + event_name = "ArchiverMain"; + break; + case WAIT_EVENT_AUTOVACUUM_MAIN: + event_name = "AutoVacuumMain"; + break; + case WAIT_EVENT_BGWRITER_HIBERNATE: + event_name = "BgWriterHibernate"; + break; + case WAIT_EVENT_BGWRITER_MAIN: + event_name = "BgWriterMain"; + break; + case WAIT_EVENT_CHECKPOINTER_MAIN: + event_name = "CheckpointerMain"; + break; + case WAIT_EVENT_LOGICAL_APPLY_MAIN: + event_name = "LogicalApplyMain"; + break; + case WAIT_EVENT_LOGICAL_LAUNCHER_MAIN: + event_name = "LogicalLauncherMain"; + break; + case WAIT_EVENT_RECOVERY_WAL_STREAM: + event_name = "RecoveryWalStream"; + break; + case WAIT_EVENT_SYSLOGGER_MAIN: + event_name = "SysLoggerMain"; + break; + case WAIT_EVENT_WAL_RECEIVER_MAIN: + event_name = "WalReceiverMain"; + break; + case WAIT_EVENT_WAL_SENDER_MAIN: + event_name = "WalSenderMain"; + break; + case WAIT_EVENT_WAL_WRITER_MAIN: + event_name = "WalWriterMain"; + break; + /* no default case, so that compiler will warn */ + } + + return event_name; +} + +/* ---------- + * pgstat_get_wait_client() - + * + * Convert WaitEventClient to string. 
+ * ---------- + */ +static const char * +pgstat_get_wait_client(WaitEventClient w) +{ + const char *event_name = "unknown wait event"; + + switch (w) + { + case WAIT_EVENT_CLIENT_READ: + event_name = "ClientRead"; + break; + case WAIT_EVENT_CLIENT_WRITE: + event_name = "ClientWrite"; + break; + case WAIT_EVENT_GSS_OPEN_SERVER: + event_name = "GSSOpenServer"; + break; + case WAIT_EVENT_LIBPQWALRECEIVER_CONNECT: + event_name = "LibPQWalReceiverConnect"; + break; + case WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE: + event_name = "LibPQWalReceiverReceive"; + break; + case WAIT_EVENT_SSL_OPEN_SERVER: + event_name = "SSLOpenServer"; + break; + case WAIT_EVENT_WAL_SENDER_WAIT_WAL: + event_name = "WalSenderWaitForWAL"; + break; + case WAIT_EVENT_WAL_SENDER_WRITE_DATA: + event_name = "WalSenderWriteData"; + break; + /* no default case, so that compiler will warn */ + } + + return event_name; +} + +/* ---------- + * pgstat_get_wait_ipc() - + * + * Convert WaitEventIPC to string. + * ---------- + */ +static const char * +pgstat_get_wait_ipc(WaitEventIPC w) +{ + const char *event_name = "unknown wait event"; + + switch (w) + { + case WAIT_EVENT_APPEND_READY: + event_name = "AppendReady"; + break; + case WAIT_EVENT_ARCHIVE_CLEANUP_COMMAND: + event_name = "ArchiveCleanupCommand"; + break; + case WAIT_EVENT_ARCHIVE_COMMAND: + event_name = "ArchiveCommand"; + break; + case WAIT_EVENT_BACKEND_TERMINATION: + event_name = "BackendTermination"; + break; + case WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE: + event_name = "BackupWaitWalArchive"; + break; + case WAIT_EVENT_BGWORKER_SHUTDOWN: + event_name = "BgWorkerShutdown"; + break; + case WAIT_EVENT_BGWORKER_STARTUP: + event_name = "BgWorkerStartup"; + break; + case WAIT_EVENT_BTREE_PAGE: + event_name = "BtreePage"; + break; + case WAIT_EVENT_BUFFER_IO: + event_name = "BufferIO"; + break; + case WAIT_EVENT_CHECKPOINT_DONE: + event_name = "CheckpointDone"; + break; + case WAIT_EVENT_CHECKPOINT_START: + event_name = "CheckpointStart"; + break; + case WAIT_EVENT_EXECUTE_GATHER: + event_name = "ExecuteGather"; + break; + case WAIT_EVENT_HASH_BATCH_ALLOCATE: + event_name = "HashBatchAllocate"; + break; + case WAIT_EVENT_HASH_BATCH_ELECT: + event_name = "HashBatchElect"; + break; + case WAIT_EVENT_HASH_BATCH_LOAD: + event_name = "HashBatchLoad"; + break; + case WAIT_EVENT_HASH_BUILD_ALLOCATE: + event_name = "HashBuildAllocate"; + break; + case WAIT_EVENT_HASH_BUILD_ELECT: + event_name = "HashBuildElect"; + break; + case WAIT_EVENT_HASH_BUILD_HASH_INNER: + event_name = "HashBuildHashInner"; + break; + case WAIT_EVENT_HASH_BUILD_HASH_OUTER: + event_name = "HashBuildHashOuter"; + break; + case WAIT_EVENT_HASH_GROW_BATCHES_ALLOCATE: + event_name = "HashGrowBatchesAllocate"; + break; + case WAIT_EVENT_HASH_GROW_BATCHES_DECIDE: + event_name = "HashGrowBatchesDecide"; + break; + case WAIT_EVENT_HASH_GROW_BATCHES_ELECT: + event_name = "HashGrowBatchesElect"; + break; + case WAIT_EVENT_HASH_GROW_BATCHES_FINISH: + event_name = "HashGrowBatchesFinish"; + break; + case WAIT_EVENT_HASH_GROW_BATCHES_REPARTITION: + event_name = "HashGrowBatchesRepartition"; + break; + case WAIT_EVENT_HASH_GROW_BUCKETS_ALLOCATE: + event_name = "HashGrowBucketsAllocate"; + break; + case WAIT_EVENT_HASH_GROW_BUCKETS_ELECT: + event_name = "HashGrowBucketsElect"; + break; + case WAIT_EVENT_HASH_GROW_BUCKETS_REINSERT: + event_name = "HashGrowBucketsReinsert"; + break; + case WAIT_EVENT_LOGICAL_SYNC_DATA: + event_name = "LogicalSyncData"; + break; + case WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE: + event_name = 
"LogicalSyncStateChange"; + break; + case WAIT_EVENT_MQ_INTERNAL: + event_name = "MessageQueueInternal"; + break; + case WAIT_EVENT_MQ_PUT_MESSAGE: + event_name = "MessageQueuePutMessage"; + break; + case WAIT_EVENT_MQ_RECEIVE: + event_name = "MessageQueueReceive"; + break; + case WAIT_EVENT_MQ_SEND: + event_name = "MessageQueueSend"; + break; + case WAIT_EVENT_PARALLEL_BITMAP_SCAN: + event_name = "ParallelBitmapScan"; + break; + case WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN: + event_name = "ParallelCreateIndexScan"; + break; + case WAIT_EVENT_PARALLEL_FINISH: + event_name = "ParallelFinish"; + break; + case WAIT_EVENT_PROCARRAY_GROUP_UPDATE: + event_name = "ProcArrayGroupUpdate"; + break; + case WAIT_EVENT_PROC_SIGNAL_BARRIER: + event_name = "ProcSignalBarrier"; + break; + case WAIT_EVENT_PROMOTE: + event_name = "Promote"; + break; + case WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT: + event_name = "RecoveryConflictSnapshot"; + break; + case WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE: + event_name = "RecoveryConflictTablespace"; + break; + case WAIT_EVENT_RECOVERY_END_COMMAND: + event_name = "RecoveryEndCommand"; + break; + case WAIT_EVENT_RECOVERY_PAUSE: + event_name = "RecoveryPause"; + break; + case WAIT_EVENT_REPLICATION_ORIGIN_DROP: + event_name = "ReplicationOriginDrop"; + break; + case WAIT_EVENT_REPLICATION_SLOT_DROP: + event_name = "ReplicationSlotDrop"; + break; + case WAIT_EVENT_RESTORE_COMMAND: + event_name = "RestoreCommand"; + break; + case WAIT_EVENT_SAFE_SNAPSHOT: + event_name = "SafeSnapshot"; + break; + case WAIT_EVENT_SYNC_REP: + event_name = "SyncRep"; + break; + case WAIT_EVENT_WAL_RECEIVER_EXIT: + event_name = "WalReceiverExit"; + break; + case WAIT_EVENT_WAL_RECEIVER_WAIT_START: + event_name = "WalReceiverWaitStart"; + break; + case WAIT_EVENT_XACT_GROUP_UPDATE: + event_name = "XactGroupUpdate"; + break; + /* no default case, so that compiler will warn */ + } + + return event_name; +} + +/* ---------- + * pgstat_get_wait_timeout() - + * + * Convert WaitEventTimeout to string. + * ---------- + */ +static const char * +pgstat_get_wait_timeout(WaitEventTimeout w) +{ + const char *event_name = "unknown wait event"; + + switch (w) + { + case WAIT_EVENT_BASE_BACKUP_THROTTLE: + event_name = "BaseBackupThrottle"; + break; + case WAIT_EVENT_CHECKPOINT_WRITE_DELAY: + event_name = "CheckpointWriteDelay"; + break; + case WAIT_EVENT_PG_SLEEP: + event_name = "PgSleep"; + break; + case WAIT_EVENT_RECOVERY_APPLY_DELAY: + event_name = "RecoveryApplyDelay"; + break; + case WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL: + event_name = "RecoveryRetrieveRetryInterval"; + break; + case WAIT_EVENT_REGISTER_SYNC_REQUEST: + event_name = "RegisterSyncRequest"; + break; + case WAIT_EVENT_VACUUM_DELAY: + event_name = "VacuumDelay"; + break; + case WAIT_EVENT_VACUUM_TRUNCATE: + event_name = "VacuumTruncate"; + break; + /* no default case, so that compiler will warn */ + } + + return event_name; +} + +/* ---------- + * pgstat_get_wait_io() - + * + * Convert WaitEventIO to string. 
+ * ---------- + */ +static const char * +pgstat_get_wait_io(WaitEventIO w) +{ + const char *event_name = "unknown wait event"; + + switch (w) + { + case WAIT_EVENT_BASEBACKUP_READ: + event_name = "BaseBackupRead"; + break; + case WAIT_EVENT_BASEBACKUP_SYNC: + event_name = "BaseBackupSync"; + break; + case WAIT_EVENT_BASEBACKUP_WRITE: + event_name = "BaseBackupWrite"; + break; + case WAIT_EVENT_BUFFILE_READ: + event_name = "BufFileRead"; + break; + case WAIT_EVENT_BUFFILE_WRITE: + event_name = "BufFileWrite"; + break; + case WAIT_EVENT_BUFFILE_TRUNCATE: + event_name = "BufFileTruncate"; + break; + case WAIT_EVENT_CONTROL_FILE_READ: + event_name = "ControlFileRead"; + break; + case WAIT_EVENT_CONTROL_FILE_SYNC: + event_name = "ControlFileSync"; + break; + case WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE: + event_name = "ControlFileSyncUpdate"; + break; + case WAIT_EVENT_CONTROL_FILE_WRITE: + event_name = "ControlFileWrite"; + break; + case WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE: + event_name = "ControlFileWriteUpdate"; + break; + case WAIT_EVENT_COPY_FILE_READ: + event_name = "CopyFileRead"; + break; + case WAIT_EVENT_COPY_FILE_WRITE: + event_name = "CopyFileWrite"; + break; + case WAIT_EVENT_DATA_FILE_EXTEND: + event_name = "DataFileExtend"; + break; + case WAIT_EVENT_DATA_FILE_FLUSH: + event_name = "DataFileFlush"; + break; + case WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC: + event_name = "DataFileImmediateSync"; + break; + case WAIT_EVENT_DATA_FILE_PREFETCH: + event_name = "DataFilePrefetch"; + break; + case WAIT_EVENT_DATA_FILE_READ: + event_name = "DataFileRead"; + break; + case WAIT_EVENT_DATA_FILE_SYNC: + event_name = "DataFileSync"; + break; + case WAIT_EVENT_DATA_FILE_TRUNCATE: + event_name = "DataFileTruncate"; + break; + case WAIT_EVENT_DATA_FILE_WRITE: + event_name = "DataFileWrite"; + break; + case WAIT_EVENT_DSM_FILL_ZERO_WRITE: + event_name = "DSMFillZeroWrite"; + break; + case WAIT_EVENT_LOCK_FILE_ADDTODATADIR_READ: + event_name = "LockFileAddToDataDirRead"; + break; + case WAIT_EVENT_LOCK_FILE_ADDTODATADIR_SYNC: + event_name = "LockFileAddToDataDirSync"; + break; + case WAIT_EVENT_LOCK_FILE_ADDTODATADIR_WRITE: + event_name = "LockFileAddToDataDirWrite"; + break; + case WAIT_EVENT_LOCK_FILE_CREATE_READ: + event_name = "LockFileCreateRead"; + break; + case WAIT_EVENT_LOCK_FILE_CREATE_SYNC: + event_name = "LockFileCreateSync"; + break; + case WAIT_EVENT_LOCK_FILE_CREATE_WRITE: + event_name = "LockFileCreateWrite"; + break; + case WAIT_EVENT_LOCK_FILE_RECHECKDATADIR_READ: + event_name = "LockFileReCheckDataDirRead"; + break; + case WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC: + event_name = "LogicalRewriteCheckpointSync"; + break; + case WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC: + event_name = "LogicalRewriteMappingSync"; + break; + case WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE: + event_name = "LogicalRewriteMappingWrite"; + break; + case WAIT_EVENT_LOGICAL_REWRITE_SYNC: + event_name = "LogicalRewriteSync"; + break; + case WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE: + event_name = "LogicalRewriteTruncate"; + break; + case WAIT_EVENT_LOGICAL_REWRITE_WRITE: + event_name = "LogicalRewriteWrite"; + break; + case WAIT_EVENT_RELATION_MAP_READ: + event_name = "RelationMapRead"; + break; + case WAIT_EVENT_RELATION_MAP_SYNC: + event_name = "RelationMapSync"; + break; + case WAIT_EVENT_RELATION_MAP_WRITE: + event_name = "RelationMapWrite"; + break; + case WAIT_EVENT_REORDER_BUFFER_READ: + event_name = "ReorderBufferRead"; + break; + case WAIT_EVENT_REORDER_BUFFER_WRITE: + event_name = "ReorderBufferWrite"; + 
break;
+		case WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ:
+			event_name = "ReorderLogicalMappingRead";
+			break;
+		case WAIT_EVENT_REPLICATION_SLOT_READ:
+			event_name = "ReplicationSlotRead";
+			break;
+		case WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC:
+			event_name = "ReplicationSlotRestoreSync";
+			break;
+		case WAIT_EVENT_REPLICATION_SLOT_SYNC:
+			event_name = "ReplicationSlotSync";
+			break;
+		case WAIT_EVENT_REPLICATION_SLOT_WRITE:
+			event_name = "ReplicationSlotWrite";
+			break;
+		case WAIT_EVENT_SLRU_FLUSH_SYNC:
+			event_name = "SLRUFlushSync";
+			break;
+		case WAIT_EVENT_SLRU_READ:
+			event_name = "SLRURead";
+			break;
+		case WAIT_EVENT_SLRU_SYNC:
+			event_name = "SLRUSync";
+			break;
+		case WAIT_EVENT_SLRU_WRITE:
+			event_name = "SLRUWrite";
+			break;
+		case WAIT_EVENT_SNAPBUILD_READ:
+			event_name = "SnapbuildRead";
+			break;
+		case WAIT_EVENT_SNAPBUILD_SYNC:
+			event_name = "SnapbuildSync";
+			break;
+		case WAIT_EVENT_SNAPBUILD_WRITE:
+			event_name = "SnapbuildWrite";
+			break;
+		case WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC:
+			event_name = "TimelineHistoryFileSync";
+			break;
+		case WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE:
+			event_name = "TimelineHistoryFileWrite";
+			break;
+		case WAIT_EVENT_TIMELINE_HISTORY_READ:
+			event_name = "TimelineHistoryRead";
+			break;
+		case WAIT_EVENT_TIMELINE_HISTORY_SYNC:
+			event_name = "TimelineHistorySync";
+			break;
+		case WAIT_EVENT_TIMELINE_HISTORY_WRITE:
+			event_name = "TimelineHistoryWrite";
+			break;
+		case WAIT_EVENT_TWOPHASE_FILE_READ:
+			event_name = "TwophaseFileRead";
+			break;
+		case WAIT_EVENT_TWOPHASE_FILE_SYNC:
+			event_name = "TwophaseFileSync";
+			break;
+		case WAIT_EVENT_TWOPHASE_FILE_WRITE:
+			event_name = "TwophaseFileWrite";
+			break;
+		case WAIT_EVENT_VERSION_FILE_WRITE:
+			event_name = "VersionFileWrite";
+			break;
+		case WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ:
+			event_name = "WALSenderTimelineHistoryRead";
+			break;
+		case WAIT_EVENT_WAL_BOOTSTRAP_SYNC:
+			event_name = "WALBootstrapSync";
+			break;
+		case WAIT_EVENT_WAL_BOOTSTRAP_WRITE:
+			event_name = "WALBootstrapWrite";
+			break;
+		case WAIT_EVENT_WAL_COPY_READ:
+			event_name = "WALCopyRead";
+			break;
+		case WAIT_EVENT_WAL_COPY_SYNC:
+			event_name = "WALCopySync";
+			break;
+		case WAIT_EVENT_WAL_COPY_WRITE:
+			event_name = "WALCopyWrite";
+			break;
+		case WAIT_EVENT_WAL_INIT_SYNC:
+			event_name = "WALInitSync";
+			break;
+		case WAIT_EVENT_WAL_INIT_WRITE:
+			event_name = "WALInitWrite";
+			break;
+		case WAIT_EVENT_WAL_READ:
+			event_name = "WALRead";
+			break;
+		case WAIT_EVENT_WAL_SYNC:
+			event_name = "WALSync";
+			break;
+		case WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN:
+			event_name = "WALSyncMethodAssign";
+			break;
+		case WAIT_EVENT_WAL_WRITE:
+			event_name = "WALWrite";
+			break;
+
+			/* no default case, so that compiler will warn */
+	}
+
+	return event_name;
+}