/*------------------------------------------------------------------------- * * xlogfuncs.c * * PostgreSQL write-ahead log manager user interface functions * * This file contains WAL control and information functions. * * * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/backend/access/transam/xlogfuncs.c * *------------------------------------------------------------------------- */ #include "postgres.h" #include #include "access/htup_details.h" #include "access/xlog.h" #include "access/xlog_internal.h" #include "access/xlogutils.h" #include "catalog/pg_type.h" #include "funcapi.h" #include "miscadmin.h" #include "pgstat.h" #include "replication/walreceiver.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/smgr.h" #include "utils/builtins.h" #include "utils/guc.h" #include "utils/memutils.h" #include "utils/numeric.h" #include "utils/pg_lsn.h" #include "utils/timestamp.h" #include "utils/tuplestore.h" /* * Store label file and tablespace map during non-exclusive backups. */ static StringInfo label_file; static StringInfo tblspc_map_file; /* * pg_start_backup: set up for taking an on-line backup dump * * Essentially what this does is to create a backup label file in $PGDATA, * where it will be archived as part of the backup dump. The label file * contains the user-supplied label string (typically this would be used * to tell where the backup dump will be stored) and the starting time and * starting WAL location for the dump. * * Permission checking for this function is managed through the normal * GRANT system. */ Datum pg_start_backup(PG_FUNCTION_ARGS) { text *backupid = PG_GETARG_TEXT_PP(0); bool fast = PG_GETARG_BOOL(1); bool exclusive = PG_GETARG_BOOL(2); char *backupidstr; XLogRecPtr startpoint; SessionBackupState status = get_backup_status(); backupidstr = text_to_cstring(backupid); if (status == SESSION_BACKUP_NON_EXCLUSIVE) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("a backup is already in progress in this session"))); if (exclusive) { startpoint = do_pg_start_backup(backupidstr, fast, NULL, NULL, NULL, NULL, false, true); } else { MemoryContext oldcontext; /* * Label file and tablespace map file need to be long-lived, since * they are read in pg_stop_backup. */ oldcontext = MemoryContextSwitchTo(TopMemoryContext); label_file = makeStringInfo(); tblspc_map_file = makeStringInfo(); MemoryContextSwitchTo(oldcontext); register_persistent_abort_backup_handler(); startpoint = do_pg_start_backup(backupidstr, fast, NULL, label_file, NULL, tblspc_map_file, false, true); } PG_RETURN_LSN(startpoint); } /* * pg_stop_backup: finish taking an on-line backup dump * * We write an end-of-backup WAL record, and remove the backup label file * created by pg_start_backup, creating a backup history file in pg_wal * instead (whence it will immediately be archived). The backup history file * contains the same info found in the label file, plus the backup-end time * and WAL location. Before 9.0, the backup-end time was read from the backup * history file at the beginning of archive recovery, but we now use the WAL * record for that and the file is for informational and debug purposes only. * * Note: different from CancelBackup which just cancels online backup mode. * * Note: this version is only called to stop an exclusive backup. The function * pg_stop_backup_v2 (overloaded as pg_stop_backup in SQL) is called to * stop non-exclusive backups. * * Permission checking for this function is managed through the normal * GRANT system. */ Datum pg_stop_backup(PG_FUNCTION_ARGS) { XLogRecPtr stoppoint; SessionBackupState status = get_backup_status(); if (status == SESSION_BACKUP_NON_EXCLUSIVE) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("non-exclusive backup in progress"), errhint("Did you mean to use pg_stop_backup('f')?"))); /* * Exclusive backups were typically started in a different connection, so * don't try to verify that status of backup is set to * SESSION_BACKUP_EXCLUSIVE in this function. Actual verification that an * exclusive backup is in fact running is handled inside * do_pg_stop_backup. */ stoppoint = do_pg_stop_backup(NULL, true, NULL); PG_RETURN_LSN(stoppoint); } /* * pg_stop_backup_v2: finish taking exclusive or nonexclusive on-line backup. * * Works the same as pg_stop_backup, except for non-exclusive backups it returns * the backup label and tablespace map files as text fields in as part of the * resultset. * * The first parameter (variable 'exclusive') allows the user to tell us if * this is an exclusive or a non-exclusive backup. * * The second parameter (variable 'waitforarchive'), which is optional, * allows the user to choose if they want to wait for the WAL to be archived * or if we should just return as soon as the WAL record is written. * * Permission checking for this function is managed through the normal * GRANT system. */ Datum pg_stop_backup_v2(PG_FUNCTION_ARGS) { ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; TupleDesc tupdesc; Tuplestorestate *tupstore; MemoryContext per_query_ctx; MemoryContext oldcontext; Datum values[3]; bool nulls[3]; bool exclusive = PG_GETARG_BOOL(0); bool waitforarchive = PG_GETARG_BOOL(1); XLogRecPtr stoppoint; SessionBackupState status = get_backup_status(); /* check to see if caller supports us returning a tuplestore */ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("set-valued function called in context that cannot accept a set"))); if (!(rsinfo->allowedModes & SFRM_Materialize)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("materialize mode required, but it is not allowed in this context"))); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; oldcontext = MemoryContextSwitchTo(per_query_ctx); tupstore = tuplestore_begin_heap(true, false, work_mem); rsinfo->returnMode = SFRM_Materialize; rsinfo->setResult = tupstore; rsinfo->setDesc = tupdesc; MemoryContextSwitchTo(oldcontext); MemSet(values, 0, sizeof(values)); MemSet(nulls, 0, sizeof(nulls)); if (exclusive) { if (status == SESSION_BACKUP_NON_EXCLUSIVE) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("non-exclusive backup in progress"), errhint("Did you mean to use pg_stop_backup('f')?"))); /* * Stop the exclusive backup, and since we're in an exclusive backup * return NULL for both backup_label and tablespace_map. */ stoppoint = do_pg_stop_backup(NULL, waitforarchive, NULL); nulls[1] = true; nulls[2] = true; } else { if (status != SESSION_BACKUP_NON_EXCLUSIVE) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("non-exclusive backup is not in progress"), errhint("Did you mean to use pg_stop_backup('t')?"))); /* * Stop the non-exclusive backup. Return a copy of the backup label * and tablespace map so they can be written to disk by the caller. */ stoppoint = do_pg_stop_backup(label_file->data, waitforarchive, NULL); values[1] = CStringGetTextDatum(label_file->data); values[2] = CStringGetTextDatum(tblspc_map_file->data); /* Free structures allocated in TopMemoryContext */ pfree(label_file->data); pfree(label_file); label_file = NULL; pfree(tblspc_map_file->data); pfree(tblspc_map_file); tblspc_map_file = NULL; } /* Stoppoint is included on both exclusive and nonexclusive backups */ values[0] = LSNGetDatum(stoppoint); tuplestore_putvalues(tupstore, tupdesc, values, nulls); tuplestore_donestoring(tupstore); return (Datum) 0; } /* * pg_switch_wal: switch to next xlog file * * Permission checking for this function is managed through the normal * GRANT system. */ Datum pg_switch_wal(PG_FUNCTION_ARGS) { XLogRecPtr switchpoint; if (RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"), errhint("WAL control functions cannot be executed during recovery."))); switchpoint = RequestXLogSwitch(false); /* * As a convenience, return the WAL location of the switch record */ PG_RETURN_LSN(switchpoint); } /* * pg_create_restore_point: a named point for restore * * Permission checking for this function is managed through the normal * GRANT system. */ Datum pg_create_restore_point(PG_FUNCTION_ARGS) { text *restore_name = PG_GETARG_TEXT_PP(0); char *restore_name_str; XLogRecPtr restorepoint; if (RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"), errhint("WAL control functions cannot be executed during recovery."))); if (!XLogIsNeeded()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("WAL level not sufficient for creating a restore point"), errhint("wal_level must be set to \"replica\" or \"logical\" at server start."))); restore_name_str = text_to_cstring(restore_name); if (strlen(restore_name_str) >= MAXFNAMELEN) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("value too long for restore point (maximum %d characters)", MAXFNAMELEN - 1))); restorepoint = XLogRestorePoint(restore_name_str); /* * As a convenience, return the WAL location of the restore point record */ PG_RETURN_LSN(restorepoint); } /* * Report the current WAL write location (same format as pg_start_backup etc) * * This is useful for determining how much of WAL is visible to an external * archiving process. Note that the data before this point is written out * to the kernel, but is not necessarily synced to disk. */ Datum pg_current_wal_lsn(PG_FUNCTION_ARGS) { XLogRecPtr current_recptr; if (RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"), errhint("WAL control functions cannot be executed during recovery."))); current_recptr = GetXLogWriteRecPtr(); PG_RETURN_LSN(current_recptr); } /* * Report the current WAL insert location (same format as pg_start_backup etc) * * This function is mostly for debugging purposes. */ Datum pg_current_wal_insert_lsn(PG_FUNCTION_ARGS) { XLogRecPtr current_recptr; if (RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"), errhint("WAL control functions cannot be executed during recovery."))); current_recptr = GetXLogInsertRecPtr(); PG_RETURN_LSN(current_recptr); } /* * Report the current WAL flush location (same format as pg_start_backup etc) * * This function is mostly for debugging purposes. */ Datum pg_current_wal_flush_lsn(PG_FUNCTION_ARGS) { XLogRecPtr current_recptr; if (RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"), errhint("WAL control functions cannot be executed during recovery."))); current_recptr = GetFlushRecPtr(); PG_RETURN_LSN(current_recptr); } /* * Report the last WAL receive location (same format as pg_start_backup etc) * * This is useful for determining how much of WAL is guaranteed to be received * and synced to disk by walreceiver. */ Datum pg_last_wal_receive_lsn(PG_FUNCTION_ARGS) { XLogRecPtr recptr; recptr = GetWalRcvFlushRecPtr(NULL, NULL); if (recptr == 0) PG_RETURN_NULL(); PG_RETURN_LSN(recptr); } /* * Report the last WAL replay location (same format as pg_start_backup etc) * * This is useful for determining how much of WAL is visible to read-only * connections during recovery. */ Datum pg_last_wal_replay_lsn(PG_FUNCTION_ARGS) { XLogRecPtr recptr; recptr = GetXLogReplayRecPtr(NULL); if (recptr == 0) PG_RETURN_NULL(); PG_RETURN_LSN(recptr); } /* * Compute an xlog file name and decimal byte offset given a WAL location, * such as is returned by pg_stop_backup() or pg_switch_wal(). * * Note that a location exactly at a segment boundary is taken to be in * the previous segment. This is usually the right thing, since the * expected usage is to determine which xlog file(s) are ready to archive. */ Datum pg_walfile_name_offset(PG_FUNCTION_ARGS) { XLogSegNo xlogsegno; uint32 xrecoff; XLogRecPtr locationpoint = PG_GETARG_LSN(0); char xlogfilename[MAXFNAMELEN]; Datum values[2]; bool isnull[2]; TupleDesc resultTupleDesc; HeapTuple resultHeapTuple; Datum result; if (RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"), errhint("%s cannot be executed during recovery.", "pg_walfile_name_offset()"))); /* * Construct a tuple descriptor for the result row. This must match this * function's pg_proc entry! */ resultTupleDesc = CreateTemplateTupleDesc(2); TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name", TEXTOID, -1, 0); TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset", INT4OID, -1, 0); resultTupleDesc = BlessTupleDesc(resultTupleDesc); /* * xlogfilename */ XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size); XLogFileName(xlogfilename, ThisTimeLineID, xlogsegno, wal_segment_size); values[0] = CStringGetTextDatum(xlogfilename); isnull[0] = false; /* * offset */ xrecoff = XLogSegmentOffset(locationpoint, wal_segment_size); values[1] = UInt32GetDatum(xrecoff); isnull[1] = false; /* * Tuple jam: Having first prepared your Datums, then squash together */ resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull); result = HeapTupleGetDatum(resultHeapTuple); PG_RETURN_DATUM(result); } /* * Compute an xlog file name given a WAL location, * such as is returned by pg_stop_backup() or pg_switch_wal(). */ Datum pg_walfile_name(PG_FUNCTION_ARGS) { XLogSegNo xlogsegno; XLogRecPtr locationpoint = PG_GETARG_LSN(0); char xlogfilename[MAXFNAMELEN]; if (RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"), errhint("%s cannot be executed during recovery.", "pg_walfile_name()"))); XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size); XLogFileName(xlogfilename, ThisTimeLineID, xlogsegno, wal_segment_size); PG_RETURN_TEXT_P(cstring_to_text(xlogfilename)); } /* * pg_wal_replay_pause - pause recovery now * * Permission checking for this function is managed through the normal * GRANT system. */ Datum pg_wal_replay_pause(PG_FUNCTION_ARGS) { if (!RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is not in progress"), errhint("Recovery control functions can only be executed during recovery."))); if (PromoteIsTriggered()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("standby promotion is ongoing"), errhint("%s cannot be executed after promotion is triggered.", "pg_wal_replay_pause()"))); SetRecoveryPause(true); PG_RETURN_VOID(); } /* * pg_wal_replay_resume - resume recovery now * * Permission checking for this function is managed through the normal * GRANT system. */ Datum pg_wal_replay_resume(PG_FUNCTION_ARGS) { if (!RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is not in progress"), errhint("Recovery control functions can only be executed during recovery."))); if (PromoteIsTriggered()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("standby promotion is ongoing"), errhint("%s cannot be executed after promotion is triggered.", "pg_wal_replay_resume()"))); SetRecoveryPause(false); PG_RETURN_VOID(); } /* * pg_is_wal_replay_paused */ Datum pg_is_wal_replay_paused(PG_FUNCTION_ARGS) { if (!RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is not in progress"), errhint("Recovery control functions can only be executed during recovery."))); PG_RETURN_BOOL(RecoveryIsPaused()); } /* * Returns timestamp of latest processed commit/abort record. * * When the server has been started normally without recovery the function * returns NULL. */ Datum pg_last_xact_replay_timestamp(PG_FUNCTION_ARGS) { TimestampTz xtime; xtime = GetLatestXTime(); if (xtime == 0) PG_RETURN_NULL(); PG_RETURN_TIMESTAMPTZ(xtime); } /* * Returns bool with current recovery mode, a global state. */ Datum pg_is_in_recovery(PG_FUNCTION_ARGS) { PG_RETURN_BOOL(RecoveryInProgress()); } /* * Compute the difference in bytes between two WAL locations. */ Datum pg_wal_lsn_diff(PG_FUNCTION_ARGS) { Datum result; result = DirectFunctionCall2(pg_lsn_mi, PG_GETARG_DATUM(0), PG_GETARG_DATUM(1)); PG_RETURN_NUMERIC(result); } /* * Returns bool with current on-line backup mode, a global state. */ Datum pg_is_in_backup(PG_FUNCTION_ARGS) { PG_RETURN_BOOL(BackupInProgress()); } /* * Returns start time of an online exclusive backup. * * When there's no exclusive backup in progress, the function * returns NULL. */ Datum pg_backup_start_time(PG_FUNCTION_ARGS) { Datum xtime; FILE *lfp; char fline[MAXPGPATH]; char backup_start_time[30]; /* * See if label file is present */ lfp = AllocateFile(BACKUP_LABEL_FILE, "r"); if (lfp == NULL) { if (errno != ENOENT) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", BACKUP_LABEL_FILE))); PG_RETURN_NULL(); } /* * Parse the file to find the START TIME line. */ backup_start_time[0] = '\0'; while (fgets(fline, sizeof(fline), lfp) != NULL) { if (sscanf(fline, "START TIME: %25[^\n]\n", backup_start_time) == 1) break; } /* Check for a read error. */ if (ferror(lfp)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", BACKUP_LABEL_FILE))); /* Close the backup label file. */ if (FreeFile(lfp)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", BACKUP_LABEL_FILE))); if (strlen(backup_start_time) == 0) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); /* * Convert the time string read from file to TimestampTz form. */ xtime = DirectFunctionCall3(timestamptz_in, CStringGetDatum(backup_start_time), ObjectIdGetDatum(InvalidOid), Int32GetDatum(-1)); PG_RETURN_DATUM(xtime); } /* * Promotes a standby server. * * A result of "true" means that promotion has been completed if "wait" is * "true", or initiated if "wait" is false. */ Datum pg_promote(PG_FUNCTION_ARGS) { bool wait = PG_GETARG_BOOL(0); int wait_seconds = PG_GETARG_INT32(1); FILE *promote_file; int i; if (!RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is not in progress"), errhint("Recovery control functions can only be executed during recovery."))); if (wait_seconds <= 0) ereport(ERROR, (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), errmsg("\"wait_seconds\" must not be negative or zero"))); /* create the promote signal file */ promote_file = AllocateFile(PROMOTE_SIGNAL_FILE, "w"); if (!promote_file) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", PROMOTE_SIGNAL_FILE))); if (FreeFile(promote_file)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not write file \"%s\": %m", PROMOTE_SIGNAL_FILE))); /* signal the postmaster */ if (kill(PostmasterPid, SIGUSR1) != 0) { ereport(WARNING, (errmsg("failed to send signal to postmaster: %m"))); (void) unlink(PROMOTE_SIGNAL_FILE); PG_RETURN_BOOL(false); } /* return immediately if waiting was not requested */ if (!wait) PG_RETURN_BOOL(true); /* wait for the amount of time wanted until promotion */ #define WAITS_PER_SECOND 10 for (i = 0; i < WAITS_PER_SECOND * wait_seconds; i++) { int rc; ResetLatch(MyLatch); if (!RecoveryInProgress()) PG_RETURN_BOOL(true); CHECK_FOR_INTERRUPTS(); rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, 1000L / WAITS_PER_SECOND, WAIT_EVENT_PROMOTE); /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (rc & WL_POSTMASTER_DEATH) PG_RETURN_BOOL(false); } ereport(WARNING, (errmsg("server did not promote within %d seconds", wait_seconds))); PG_RETURN_BOOL(false); }