diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-16 19:46:48 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-16 19:46:48 +0000 |
commit | 311bcfc6b3acdd6fd152798c7f287ddf74fa2a98 (patch) | |
tree | 0ec307299b1dada3701e42f4ca6eda57d708261e /src/bin/pg_checksums/pg_checksums.c | |
parent | Initial commit. (diff) | |
download | postgresql-15-311bcfc6b3acdd6fd152798c7f287ddf74fa2a98.tar.xz postgresql-15-311bcfc6b3acdd6fd152798c7f287ddf74fa2a98.zip |
Adding upstream version 15.4. (refs: upstream/15.4, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/bin/pg_checksums/pg_checksums.c')
-rw-r--r-- | src/bin/pg_checksums/pg_checksums.c | 655 |
1 files changed, 655 insertions, 0 deletions
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c new file mode 100644 index 0000000..21dfe1b --- /dev/null +++ b/src/bin/pg_checksums/pg_checksums.c @@ -0,0 +1,655 @@ +/*------------------------------------------------------------------------- + * + * pg_checksums.c + * Checks, enables or disables page level checksums for an offline + * cluster + * + * Copyright (c) 2010-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/bin/pg_checksums/pg_checksums.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres_fe.h" + +#include <dirent.h> +#include <limits.h> +#include <time.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "access/xlog_internal.h" +#include "common/controldata_utils.h" +#include "common/file_perm.h" +#include "common/file_utils.h" +#include "common/logging.h" +#include "fe_utils/option_utils.h" +#include "getopt_long.h" +#include "pg_getopt.h" +#include "storage/bufpage.h" +#include "storage/checksum.h" +#include "storage/checksum_impl.h" + + +static int64 files_scanned = 0; +static int64 files_written = 0; +static int64 blocks_scanned = 0; +static int64 blocks_written = 0; +static int64 badblocks = 0; +static ControlFileData *ControlFile; + +static char *only_filenode = NULL; +static bool do_sync = true; +static bool verbose = false; +static bool showprogress = false; + +typedef enum +{ + PG_MODE_CHECK, + PG_MODE_DISABLE, + PG_MODE_ENABLE +} PgChecksumMode; + +/* + * Filename components. + * + * XXX: fd.h is not declared here as frontend side code is not able to + * interact with the backend-side definitions for the various fsync + * wrappers. + */ +#define PG_TEMP_FILES_DIR "pgsql_tmp" +#define PG_TEMP_FILE_PREFIX "pgsql_tmp" + +static PgChecksumMode mode = PG_MODE_CHECK; + +static const char *progname; + +/* + * Progress status information. 
+ */ +int64 total_size = 0; +int64 current_size = 0; +static pg_time_t last_progress_report = 0; + +static void +usage(void) +{ + printf(_("%s enables, disables, or verifies data checksums in a PostgreSQL database cluster.\n\n"), progname); + printf(_("Usage:\n")); + printf(_(" %s [OPTION]... [DATADIR]\n"), progname); + printf(_("\nOptions:\n")); + printf(_(" [-D, --pgdata=]DATADIR data directory\n")); + printf(_(" -c, --check check data checksums (default)\n")); + printf(_(" -d, --disable disable data checksums\n")); + printf(_(" -e, --enable enable data checksums\n")); + printf(_(" -f, --filenode=FILENODE check only relation with specified filenode\n")); + printf(_(" -N, --no-sync do not wait for changes to be written safely to disk\n")); + printf(_(" -P, --progress show progress information\n")); + printf(_(" -v, --verbose output verbose messages\n")); + printf(_(" -V, --version output version information, then exit\n")); + printf(_(" -?, --help show this help, then exit\n")); + printf(_("\nIf no data directory (DATADIR) is specified, " + "the environment variable PGDATA\nis used.\n\n")); + printf(_("Report bugs to <%s>.\n"), PACKAGE_BUGREPORT); + printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL); +} + +/* + * Definition of one element part of an exclusion list, used for files + * to exclude from checksum validation. "name" is the name of the file + * or path to check for exclusion. If "match_prefix" is true, any items + * matching the name as prefix are excluded. + */ +struct exclude_list_item +{ + const char *name; + bool match_prefix; +}; + +/* + * List of files excluded from checksum validation. + * + * Note: this list should be kept in sync with what basebackup.c includes. 
+ */ +static const struct exclude_list_item skip[] = { + {"pg_control", false}, + {"pg_filenode.map", false}, + {"pg_internal.init", true}, + {"PG_VERSION", false}, +#ifdef EXEC_BACKEND + {"config_exec_params", true}, +#endif + {NULL, false} +}; + +/* + * Report current progress status. Parts borrowed from + * src/bin/pg_basebackup/pg_basebackup.c. + */ +static void +progress_report(bool finished) +{ + int percent; + pg_time_t now; + + Assert(showprogress); + + now = time(NULL); + if (now == last_progress_report && !finished) + return; /* Max once per second */ + + /* Save current time */ + last_progress_report = now; + + /* Adjust total size if current_size is larger */ + if (current_size > total_size) + total_size = current_size; + + /* Calculate current percentage of size done */ + percent = total_size ? (int) ((current_size) * 100 / total_size) : 0; + + fprintf(stderr, _("%lld/%lld MB (%d%%) computed"), + (long long) (current_size / (1024 * 1024)), + (long long) (total_size / (1024 * 1024)), + percent); + + /* + * Stay on the same line if reporting to a terminal and we're not done + * yet. + */ + fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr); +} + +static bool +skipfile(const char *fn) +{ + int excludeIdx; + + for (excludeIdx = 0; skip[excludeIdx].name != NULL; excludeIdx++) + { + int cmplen = strlen(skip[excludeIdx].name); + + if (!skip[excludeIdx].match_prefix) + cmplen++; + if (strncmp(skip[excludeIdx].name, fn, cmplen) == 0) + return true; + } + + return false; +} + +static void +scan_file(const char *fn, int segmentno) +{ + PGAlignedBlock buf; + PageHeader header = (PageHeader) buf.data; + int f; + BlockNumber blockno; + int flags; + int64 blocks_written_in_file = 0; + + Assert(mode == PG_MODE_ENABLE || + mode == PG_MODE_CHECK); + + flags = (mode == PG_MODE_ENABLE) ? 
O_RDWR : O_RDONLY; + f = open(fn, PG_BINARY | flags, 0); + + if (f < 0) + pg_fatal("could not open file \"%s\": %m", fn); + + files_scanned++; + + for (blockno = 0;; blockno++) + { + uint16 csum; + int r = read(f, buf.data, BLCKSZ); + + if (r == 0) + break; + if (r != BLCKSZ) + { + if (r < 0) + pg_fatal("could not read block %u in file \"%s\": %m", + blockno, fn); + else + pg_fatal("could not read block %u in file \"%s\": read %d of %d", + blockno, fn, r, BLCKSZ); + } + blocks_scanned++; + + /* + * Since the file size is counted as total_size for progress status + * information, the sizes of all pages including new ones in the file + * should be counted as current_size. Otherwise the progress reporting + * calculated using those counters may not reach 100%. + */ + current_size += r; + + /* New pages have no checksum yet */ + if (PageIsNew(header)) + continue; + + csum = pg_checksum_page(buf.data, blockno + segmentno * RELSEG_SIZE); + if (mode == PG_MODE_CHECK) + { + if (csum != header->pd_checksum) + { + if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION) + pg_log_error("checksum verification failed in file \"%s\", block %u: calculated checksum %X but block contains %X", + fn, blockno, csum, header->pd_checksum); + badblocks++; + } + } + else if (mode == PG_MODE_ENABLE) + { + int w; + + /* + * Do not rewrite if the checksum is already set to the expected + * value. 
+ */ + if (header->pd_checksum == csum) + continue; + + blocks_written_in_file++; + + /* Set checksum in page header */ + header->pd_checksum = csum; + + /* Seek back to beginning of block */ + if (lseek(f, -BLCKSZ, SEEK_CUR) < 0) + pg_fatal("seek failed for block %u in file \"%s\": %m", blockno, fn); + + /* Write block with checksum */ + w = write(f, buf.data, BLCKSZ); + if (w != BLCKSZ) + { + if (w < 0) + pg_fatal("could not write block %u in file \"%s\": %m", + blockno, fn); + else + pg_fatal("could not write block %u in file \"%s\": wrote %d of %d", + blockno, fn, w, BLCKSZ); + } + } + + if (showprogress) + progress_report(false); + } + + if (verbose) + { + if (mode == PG_MODE_CHECK) + pg_log_info("checksums verified in file \"%s\"", fn); + if (mode == PG_MODE_ENABLE) + pg_log_info("checksums enabled in file \"%s\"", fn); + } + + /* Update write counters if any write activity has happened */ + if (blocks_written_in_file > 0) + { + files_written++; + blocks_written += blocks_written_in_file; + } + + close(f); +} + +/* + * Scan the given directory for items which can be checksummed and + * operate on each one of them. If "sizeonly" is true, the size of + * all the items which have checksums is computed and returned back + * to the caller without operating on the files. This is used to compile + * the total size of the data directory for progress reports. 
+ */ +static int64 +scan_directory(const char *basedir, const char *subdir, bool sizeonly) +{ + int64 dirsize = 0; + char path[MAXPGPATH]; + DIR *dir; + struct dirent *de; + + snprintf(path, sizeof(path), "%s/%s", basedir, subdir); + dir = opendir(path); + if (!dir) + pg_fatal("could not open directory \"%s\": %m", path); + while ((de = readdir(dir)) != NULL) + { + char fn[MAXPGPATH]; + struct stat st; + + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + + /* Skip temporary files */ + if (strncmp(de->d_name, + PG_TEMP_FILE_PREFIX, + strlen(PG_TEMP_FILE_PREFIX)) == 0) + continue; + + /* Skip temporary folders */ + if (strncmp(de->d_name, + PG_TEMP_FILES_DIR, + strlen(PG_TEMP_FILES_DIR)) == 0) + continue; + + snprintf(fn, sizeof(fn), "%s/%s", path, de->d_name); + if (lstat(fn, &st) < 0) + pg_fatal("could not stat file \"%s\": %m", fn); + if (S_ISREG(st.st_mode)) + { + char fnonly[MAXPGPATH]; + char *forkpath, + *segmentpath; + int segmentno = 0; + + if (skipfile(de->d_name)) + continue; + + /* + * Cut off at the segment boundary (".") to get the segment number + * in order to mix it into the checksum. Then also cut off at the + * fork boundary, to get the filenode the file belongs to for + * filtering. + */ + strlcpy(fnonly, de->d_name, sizeof(fnonly)); + segmentpath = strchr(fnonly, '.'); + if (segmentpath != NULL) + { + *segmentpath++ = '\0'; + segmentno = atoi(segmentpath); + if (segmentno == 0) + pg_fatal("invalid segment number %d in file name \"%s\"", + segmentno, fn); + } + + forkpath = strchr(fnonly, '_'); + if (forkpath != NULL) + *forkpath++ = '\0'; + + if (only_filenode && strcmp(only_filenode, fnonly) != 0) + /* filenode not to be included */ + continue; + + dirsize += st.st_size; + + /* + * No need to work on the file when calculating only the size of + * the items in the data folder. 
+ */ + if (!sizeonly) + scan_file(fn, segmentno); + } +#ifndef WIN32 + else if (S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode)) +#else + else if (S_ISDIR(st.st_mode) || pgwin32_is_junction(fn)) +#endif + { + /* + * If going through the entries of pg_tblspc, we assume to operate + * on tablespace locations where only TABLESPACE_VERSION_DIRECTORY + * is valid, resolving the linked locations and dive into them + * directly. + */ + if (strncmp("pg_tblspc", subdir, strlen("pg_tblspc")) == 0) + { + char tblspc_path[MAXPGPATH]; + struct stat tblspc_st; + + /* + * Resolve tablespace location path and check whether + * TABLESPACE_VERSION_DIRECTORY exists. Not finding a valid + * location is unexpected, since there should be no orphaned + * links and no links pointing to something else than a + * directory. + */ + snprintf(tblspc_path, sizeof(tblspc_path), "%s/%s/%s", + path, de->d_name, TABLESPACE_VERSION_DIRECTORY); + + if (lstat(tblspc_path, &tblspc_st) < 0) + pg_fatal("could not stat file \"%s\": %m", + tblspc_path); + + /* + * Move backwards once as the scan needs to happen for the + * contents of TABLESPACE_VERSION_DIRECTORY. 
+ */ + snprintf(tblspc_path, sizeof(tblspc_path), "%s/%s", + path, de->d_name); + + /* Looks like a valid tablespace location */ + dirsize += scan_directory(tblspc_path, + TABLESPACE_VERSION_DIRECTORY, + sizeonly); + } + else + { + dirsize += scan_directory(path, de->d_name, sizeonly); + } + } + } + closedir(dir); + return dirsize; +} + +int +main(int argc, char *argv[]) +{ + static struct option long_options[] = { + {"check", no_argument, NULL, 'c'}, + {"pgdata", required_argument, NULL, 'D'}, + {"disable", no_argument, NULL, 'd'}, + {"enable", no_argument, NULL, 'e'}, + {"filenode", required_argument, NULL, 'f'}, + {"no-sync", no_argument, NULL, 'N'}, + {"progress", no_argument, NULL, 'P'}, + {"verbose", no_argument, NULL, 'v'}, + {NULL, 0, NULL, 0} + }; + + char *DataDir = NULL; + int c; + int option_index; + bool crc_ok; + + pg_logging_init(argv[0]); + set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_checksums")); + progname = get_progname(argv[0]); + + if (argc > 1) + { + if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) + { + usage(); + exit(0); + } + if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) + { + puts("pg_checksums (PostgreSQL) " PG_VERSION); + exit(0); + } + } + + while ((c = getopt_long(argc, argv, "cD:deNPf:v", long_options, &option_index)) != -1) + { + switch (c) + { + case 'c': + mode = PG_MODE_CHECK; + break; + case 'd': + mode = PG_MODE_DISABLE; + break; + case 'e': + mode = PG_MODE_ENABLE; + break; + case 'f': + if (!option_parse_int(optarg, "-f/--filenode", 0, + INT_MAX, + NULL)) + exit(1); + only_filenode = pstrdup(optarg); + break; + case 'N': + do_sync = false; + break; + case 'v': + verbose = true; + break; + case 'D': + DataDir = optarg; + break; + case 'P': + showprogress = true; + break; + default: + /* getopt_long already emitted a complaint */ + pg_log_error_hint("Try \"%s --help\" for more information.", progname); + exit(1); + } + } + + if (DataDir == NULL) + { + if (optind < argc) + DataDir = 
argv[optind++]; + else + DataDir = getenv("PGDATA"); + + /* If no DataDir was specified, and none could be found, error out */ + if (DataDir == NULL) + { + pg_log_error("no data directory specified"); + pg_log_error_hint("Try \"%s --help\" for more information.", progname); + exit(1); + } + } + + /* Complain if any arguments remain */ + if (optind < argc) + { + pg_log_error("too many command-line arguments (first is \"%s\")", + argv[optind]); + pg_log_error_hint("Try \"%s --help\" for more information.", progname); + exit(1); + } + + /* filenode checking only works in --check mode */ + if (mode != PG_MODE_CHECK && only_filenode) + { + pg_log_error("option -f/--filenode can only be used with --check"); + pg_log_error_hint("Try \"%s --help\" for more information.", progname); + exit(1); + } + + /* Read the control file and check compatibility */ + ControlFile = get_controlfile(DataDir, &crc_ok); + if (!crc_ok) + pg_fatal("pg_control CRC value is incorrect"); + + if (ControlFile->pg_control_version != PG_CONTROL_VERSION) + pg_fatal("cluster is not compatible with this version of pg_checksums"); + + if (ControlFile->blcksz != BLCKSZ) + { + pg_log_error("database cluster is not compatible"); + pg_log_error_detail("The database cluster was initialized with block size %u, but pg_checksums was compiled with block size %u.", + ControlFile->blcksz, BLCKSZ); + exit(1); + } + + /* + * Check if cluster is running. A clean shutdown is required to avoid + * random checksum failures caused by torn pages. Note that this doesn't + * guard against someone starting the cluster concurrently. 
+ */ + if (ControlFile->state != DB_SHUTDOWNED && + ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY) + pg_fatal("cluster must be shut down"); + + if (ControlFile->data_checksum_version == 0 && + mode == PG_MODE_CHECK) + pg_fatal("data checksums are not enabled in cluster"); + + if (ControlFile->data_checksum_version == 0 && + mode == PG_MODE_DISABLE) + pg_fatal("data checksums are already disabled in cluster"); + + if (ControlFile->data_checksum_version > 0 && + mode == PG_MODE_ENABLE) + pg_fatal("data checksums are already enabled in cluster"); + + /* Operate on all files if checking or enabling checksums */ + if (mode == PG_MODE_CHECK || mode == PG_MODE_ENABLE) + { + /* + * If progress status information is requested, we need to scan the + * directory tree twice: once to know how much total data needs to be + * processed and once to do the real work. + */ + if (showprogress) + { + total_size = scan_directory(DataDir, "global", true); + total_size += scan_directory(DataDir, "base", true); + total_size += scan_directory(DataDir, "pg_tblspc", true); + } + + (void) scan_directory(DataDir, "global", false); + (void) scan_directory(DataDir, "base", false); + (void) scan_directory(DataDir, "pg_tblspc", false); + + if (showprogress) + progress_report(true); + + printf(_("Checksum operation completed\n")); + printf(_("Files scanned: %lld\n"), (long long) files_scanned); + printf(_("Blocks scanned: %lld\n"), (long long) blocks_scanned); + if (mode == PG_MODE_CHECK) + { + printf(_("Bad checksums: %lld\n"), (long long) badblocks); + printf(_("Data checksum version: %u\n"), ControlFile->data_checksum_version); + + if (badblocks > 0) + exit(1); + } + else if (mode == PG_MODE_ENABLE) + { + printf(_("Files written: %lld\n"), (long long) files_written); + printf(_("Blocks written: %lld\n"), (long long) blocks_written); + } + } + + /* + * Finally make the data durable on disk if enabling or disabling + * checksums. 
Flush first the data directory for safety, and then update + * the control file to keep the switch consistent. + */ + if (mode == PG_MODE_ENABLE || mode == PG_MODE_DISABLE) + { + ControlFile->data_checksum_version = + (mode == PG_MODE_ENABLE) ? PG_DATA_CHECKSUM_VERSION : 0; + + if (do_sync) + { + pg_log_info("syncing data directory"); + fsync_pgdata(DataDir, PG_VERSION_NUM); + } + + pg_log_info("updating control file"); + update_controlfile(DataDir, ControlFile, do_sync); + + if (verbose) + printf(_("Data checksum version: %u\n"), ControlFile->data_checksum_version); + if (mode == PG_MODE_ENABLE) + printf(_("Checksums enabled in cluster\n")); + else + printf(_("Checksums disabled in cluster\n")); + } + + return 0; +} |