summaryrefslogtreecommitdiffstats
path: root/src/bin/pg_test_fsync/pg_test_fsync.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/bin/pg_test_fsync/pg_test_fsync.c')
-rw-r--r--src/bin/pg_test_fsync/pg_test_fsync.c652
1 files changed, 652 insertions, 0 deletions
diff --git a/src/bin/pg_test_fsync/pg_test_fsync.c b/src/bin/pg_test_fsync/pg_test_fsync.c
new file mode 100644
index 0000000..f7bc199
--- /dev/null
+++ b/src/bin/pg_test_fsync/pg_test_fsync.c
@@ -0,0 +1,652 @@
+/*
+ * pg_test_fsync.c
+ * tests all supported fsync() methods
+ */
+
+#include "postgres_fe.h"
+
+#include <limits.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <fcntl.h>
+#include <time.h>
+#include <unistd.h>
+#include <signal.h>
+
+#include "access/xlogdefs.h"
+#include "common/logging.h"
+#include "common/pg_prng.h"
+#include "getopt_long.h"
+
+/*
+ * put the temp files in the local directory
+ * unless the user specifies otherwise
+ */
+#define FSYNC_FILENAME "./pg_test_fsync.out"
+
+#define XLOG_BLCKSZ_K (XLOG_BLCKSZ / 1024)
+
+#define LABEL_FORMAT " %-30s"
+#define NA_FORMAT "%21s\n"
+/* translator: maintain alignment with NA_FORMAT */
+#define OPS_FORMAT gettext_noop("%13.3f ops/sec %6.0f usecs/op\n")
+#define USECS_SEC 1000000
+
+/* These are macros to avoid timing the function call overhead. */
+#ifndef WIN32
+#define START_TIMER \
+do { \
+ alarm_triggered = false; \
+ alarm(secs_per_test); \
+ gettimeofday(&start_t, NULL); \
+} while (0)
+#else
+/* WIN32 doesn't support alarm, so we create a thread and sleep there */
+#define START_TIMER \
+do { \
+ alarm_triggered = false; \
+ if (CreateThread(NULL, 0, process_alarm, NULL, 0, NULL) == \
+ INVALID_HANDLE_VALUE) \
+ pg_fatal("could not create thread for alarm"); \
+ gettimeofday(&start_t, NULL); \
+} while (0)
+#endif
+
+#define STOP_TIMER \
+do { \
+ gettimeofday(&stop_t, NULL); \
+ print_elapse(start_t, stop_t, ops); \
+} while (0)
+
+
+static const char *progname;
+
+static unsigned int secs_per_test = 5;
+static int needs_unlink = 0;
+static char full_buf[DEFAULT_XLOG_SEG_SIZE],
+ *buf,
+ *filename = FSYNC_FILENAME;
+static struct timeval start_t,
+ stop_t;
+static bool alarm_triggered = false;
+
+
+static void handle_args(int argc, char *argv[]);
+static void prepare_buf(void);
+static void test_open(void);
+static void test_non_sync(void);
+static void test_sync(int writes_per_op);
+static void test_open_syncs(void);
+static void test_open_sync(const char *msg, int writes_size);
+static void test_file_descriptor_sync(void);
+
+#ifndef WIN32
+static void process_alarm(int sig);
+#else
+static DWORD WINAPI process_alarm(LPVOID param);
+#endif
+static void signal_cleanup(int sig);
+
+#ifdef HAVE_FSYNC_WRITETHROUGH
+static int pg_fsync_writethrough(int fd);
+#endif
+static void print_elapse(struct timeval start_t, struct timeval stop_t, int ops);
+
+#define die(msg) pg_fatal("%s: %m", _(msg))
+
+
+int
+main(int argc, char *argv[])
+{
+ pg_logging_init(argv[0]);
+ set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_test_fsync"));
+ progname = get_progname(argv[0]);
+
+ handle_args(argc, argv);
+
+ /* Prevent leaving behind the test file */
+ pqsignal(SIGINT, signal_cleanup);
+ pqsignal(SIGTERM, signal_cleanup);
+#ifndef WIN32
+ pqsignal(SIGALRM, process_alarm);
+#endif
+#ifdef SIGHUP
+ /* Not defined on win32 */
+ pqsignal(SIGHUP, signal_cleanup);
+#endif
+
+ pg_prng_seed(&pg_global_prng_state, (uint64) time(NULL));
+
+ prepare_buf();
+
+ test_open();
+
+ /* Test using 1 XLOG_BLCKSZ write */
+ test_sync(1);
+
+ /* Test using 2 XLOG_BLCKSZ writes */
+ test_sync(2);
+
+ test_open_syncs();
+
+ test_file_descriptor_sync();
+
+ test_non_sync();
+
+ unlink(filename);
+
+ return 0;
+}
+
+static void
+handle_args(int argc, char *argv[])
+{
+ static struct option long_options[] = {
+ {"filename", required_argument, NULL, 'f'},
+ {"secs-per-test", required_argument, NULL, 's'},
+ {NULL, 0, NULL, 0}
+ };
+
+ int option; /* Command line option */
+ int optindex = 0; /* used by getopt_long */
+ unsigned long optval; /* used for option parsing */
+ char *endptr;
+
+ if (argc > 1)
+ {
+ if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
+ {
+ printf(_("Usage: %s [-f FILENAME] [-s SECS-PER-TEST]\n"), progname);
+ exit(0);
+ }
+ if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
+ {
+ puts("pg_test_fsync (PostgreSQL) " PG_VERSION);
+ exit(0);
+ }
+ }
+
+ while ((option = getopt_long(argc, argv, "f:s:",
+ long_options, &optindex)) != -1)
+ {
+ switch (option)
+ {
+ case 'f':
+ filename = pg_strdup(optarg);
+ break;
+
+ case 's':
+ errno = 0;
+ optval = strtoul(optarg, &endptr, 10);
+
+ if (endptr == optarg || *endptr != '\0' ||
+ errno != 0 || optval != (unsigned int) optval)
+ {
+ pg_log_error("invalid argument for option %s", "--secs-per-test");
+ pg_log_error_hint("Try \"%s --help\" for more information.", progname);
+ exit(1);
+ }
+
+ secs_per_test = (unsigned int) optval;
+ if (secs_per_test == 0)
+ pg_fatal("%s must be in range %u..%u",
+ "--secs-per-test", 1, UINT_MAX);
+ break;
+
+ default:
+ /* getopt_long already emitted a complaint */
+ pg_log_error_hint("Try \"%s --help\" for more information.", progname);
+ exit(1);
+ }
+ }
+
+ if (argc > optind)
+ {
+ pg_log_error("too many command-line arguments (first is \"%s\")",
+ argv[optind]);
+ pg_log_error_hint("Try \"%s --help\" for more information.", progname);
+ exit(1);
+ }
+
+ printf(ngettext("%u second per test\n",
+ "%u seconds per test\n",
+ secs_per_test),
+ secs_per_test);
+#if defined(O_DIRECT)
+ printf(_("O_DIRECT supported on this platform for open_datasync and open_sync.\n"));
+#elif defined(F_NOCACHE)
+ printf(_("F_NOCACHE supported on this platform for open_datasync and open_sync.\n"));
+#else
+ printf(_("Direct I/O is not supported on this platform.\n"));
+#endif
+}
+
+static void
+prepare_buf(void)
+{
+ int ops;
+
+ /* write random data into buffer */
+ for (ops = 0; ops < DEFAULT_XLOG_SEG_SIZE; ops++)
+ full_buf[ops] = (char) pg_prng_int32(&pg_global_prng_state);
+
+ buf = (char *) TYPEALIGN(XLOG_BLCKSZ, full_buf);
+}
+
+static void
+test_open(void)
+{
+ int tmpfile;
+
+ /*
+ * test if we can open the target file
+ */
+ if ((tmpfile = open(filename, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
+ die("could not open output file");
+ needs_unlink = 1;
+ if (write(tmpfile, full_buf, DEFAULT_XLOG_SEG_SIZE) !=
+ DEFAULT_XLOG_SEG_SIZE)
+ die("write failed");
+
+ /* fsync now so that dirty buffers don't skew later tests */
+ if (fsync(tmpfile) != 0)
+ die("fsync failed");
+
+ close(tmpfile);
+}
+
+static int
+open_direct(const char *path, int flags, mode_t mode)
+{
+ int fd;
+
+#ifdef O_DIRECT
+ flags |= O_DIRECT;
+#endif
+
+ fd = open(path, flags, mode);
+
+#if !defined(O_DIRECT) && defined(F_NOCACHE)
+ if (fd >= 0 && fcntl(fd, F_NOCACHE, 1) < 0)
+ {
+ int save_errno = errno;
+
+ close(fd);
+ errno = save_errno;
+ return -1;
+ }
+#endif
+
+ return fd;
+}
+
+static void
+test_sync(int writes_per_op)
+{
+ int tmpfile,
+ ops,
+ writes;
+ bool fs_warning = false;
+
+ if (writes_per_op == 1)
+ printf(_("\nCompare file sync methods using one %dkB write:\n"), XLOG_BLCKSZ_K);
+ else
+ printf(_("\nCompare file sync methods using two %dkB writes:\n"), XLOG_BLCKSZ_K);
+ printf(_("(in wal_sync_method preference order, except fdatasync is Linux's default)\n"));
+
+ /*
+ * Test open_datasync if available
+ */
+ printf(LABEL_FORMAT, "open_datasync");
+ fflush(stdout);
+
+#ifdef OPEN_DATASYNC_FLAG
+ if ((tmpfile = open_direct(filename, O_RDWR | O_DSYNC | PG_BINARY, 0)) == -1)
+ {
+ printf(NA_FORMAT, _("n/a*"));
+ fs_warning = true;
+ }
+ else
+ {
+ START_TIMER;
+ for (ops = 0; alarm_triggered == false; ops++)
+ {
+ for (writes = 0; writes < writes_per_op; writes++)
+ if (pg_pwrite(tmpfile,
+ buf,
+ XLOG_BLCKSZ,
+ writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
+ die("write failed");
+ }
+ STOP_TIMER;
+ close(tmpfile);
+ }
+#else
+ printf(NA_FORMAT, _("n/a"));
+#endif
+
+/*
+ * Test fdatasync if available
+ */
+ printf(LABEL_FORMAT, "fdatasync");
+ fflush(stdout);
+
+#ifdef HAVE_FDATASYNC
+ if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
+ die("could not open output file");
+ START_TIMER;
+ for (ops = 0; alarm_triggered == false; ops++)
+ {
+ for (writes = 0; writes < writes_per_op; writes++)
+ if (pg_pwrite(tmpfile,
+ buf,
+ XLOG_BLCKSZ,
+ writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
+ die("write failed");
+ fdatasync(tmpfile);
+ }
+ STOP_TIMER;
+ close(tmpfile);
+#else
+ printf(NA_FORMAT, _("n/a"));
+#endif
+
+/*
+ * Test fsync
+ */
+ printf(LABEL_FORMAT, "fsync");
+ fflush(stdout);
+
+ if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
+ die("could not open output file");
+ START_TIMER;
+ for (ops = 0; alarm_triggered == false; ops++)
+ {
+ for (writes = 0; writes < writes_per_op; writes++)
+ if (pg_pwrite(tmpfile,
+ buf,
+ XLOG_BLCKSZ,
+ writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
+ die("write failed");
+ if (fsync(tmpfile) != 0)
+ die("fsync failed");
+ }
+ STOP_TIMER;
+ close(tmpfile);
+
+/*
+ * If fsync_writethrough is available, test as well
+ */
+ printf(LABEL_FORMAT, "fsync_writethrough");
+ fflush(stdout);
+
+#ifdef HAVE_FSYNC_WRITETHROUGH
+ if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
+ die("could not open output file");
+ START_TIMER;
+ for (ops = 0; alarm_triggered == false; ops++)
+ {
+ for (writes = 0; writes < writes_per_op; writes++)
+ if (pg_pwrite(tmpfile,
+ buf,
+ XLOG_BLCKSZ,
+ writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
+ die("write failed");
+ if (pg_fsync_writethrough(tmpfile) != 0)
+ die("fsync failed");
+ }
+ STOP_TIMER;
+ close(tmpfile);
+#else
+ printf(NA_FORMAT, _("n/a"));
+#endif
+
+/*
+ * Test open_sync if available
+ */
+ printf(LABEL_FORMAT, "open_sync");
+ fflush(stdout);
+
+#ifdef OPEN_SYNC_FLAG
+ if ((tmpfile = open_direct(filename, O_RDWR | OPEN_SYNC_FLAG | PG_BINARY, 0)) == -1)
+ {
+ printf(NA_FORMAT, _("n/a*"));
+ fs_warning = true;
+ }
+ else
+ {
+ START_TIMER;
+ for (ops = 0; alarm_triggered == false; ops++)
+ {
+ for (writes = 0; writes < writes_per_op; writes++)
+ if (pg_pwrite(tmpfile,
+ buf,
+ XLOG_BLCKSZ,
+ writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
+
+ /*
+ * This can generate write failures if the filesystem has
+ * a large block size, e.g. 4k, and there is no support
+ * for O_DIRECT writes smaller than the file system block
+ * size, e.g. XFS.
+ */
+ die("write failed");
+ }
+ STOP_TIMER;
+ close(tmpfile);
+ }
+#else
+ printf(NA_FORMAT, _("n/a"));
+#endif
+
+ if (fs_warning)
+ {
+ printf(_("* This file system and its mount options do not support direct\n"
+ " I/O, e.g. ext4 in journaled mode.\n"));
+ }
+}
+
+static void
+test_open_syncs(void)
+{
+ printf(_("\nCompare open_sync with different write sizes:\n"));
+ printf(_("(This is designed to compare the cost of writing 16kB in different write\n"
+ "open_sync sizes.)\n"));
+
+ test_open_sync(_(" 1 * 16kB open_sync write"), 16);
+ test_open_sync(_(" 2 * 8kB open_sync writes"), 8);
+ test_open_sync(_(" 4 * 4kB open_sync writes"), 4);
+ test_open_sync(_(" 8 * 2kB open_sync writes"), 2);
+ test_open_sync(_("16 * 1kB open_sync writes"), 1);
+}
+
+/*
+ * Test open_sync with different size files
+ */
+static void
+test_open_sync(const char *msg, int writes_size)
+{
+#ifdef OPEN_SYNC_FLAG
+ int tmpfile,
+ ops,
+ writes;
+#endif
+
+ printf(LABEL_FORMAT, msg);
+ fflush(stdout);
+
+#ifdef OPEN_SYNC_FLAG
+ if ((tmpfile = open_direct(filename, O_RDWR | OPEN_SYNC_FLAG | PG_BINARY, 0)) == -1)
+ printf(NA_FORMAT, _("n/a*"));
+ else
+ {
+ START_TIMER;
+ for (ops = 0; alarm_triggered == false; ops++)
+ {
+ for (writes = 0; writes < 16 / writes_size; writes++)
+ if (pg_pwrite(tmpfile,
+ buf,
+ writes_size * 1024,
+ writes * writes_size * 1024) !=
+ writes_size * 1024)
+ die("write failed");
+ }
+ STOP_TIMER;
+ close(tmpfile);
+ }
+#else
+ printf(NA_FORMAT, _("n/a"));
+#endif
+}
+
+static void
+test_file_descriptor_sync(void)
+{
+ int tmpfile,
+ ops;
+
+ /*
+ * Test whether fsync can sync data written on a different descriptor for
+ * the same file. This checks the efficiency of multi-process fsyncs
+ * against the same file. Possibly this should be done with writethrough
+ * on platforms which support it.
+ */
+ printf(_("\nTest if fsync on non-write file descriptor is honored:\n"));
+ printf(_("(If the times are similar, fsync() can sync data written on a different\n"
+ "descriptor.)\n"));
+
+ /*
+ * first write, fsync and close, which is the normal behavior without
+ * multiple descriptors
+ */
+ printf(LABEL_FORMAT, "write, fsync, close");
+ fflush(stdout);
+
+ START_TIMER;
+ for (ops = 0; alarm_triggered == false; ops++)
+ {
+ if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
+ die("could not open output file");
+ if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
+ die("write failed");
+ if (fsync(tmpfile) != 0)
+ die("fsync failed");
+ close(tmpfile);
+
+ /*
+ * open and close the file again to be consistent with the following
+ * test
+ */
+ if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
+ die("could not open output file");
+ close(tmpfile);
+ }
+ STOP_TIMER;
+
+ /*
+ * Now open, write, close, open again and fsync This simulates processes
+ * fsyncing each other's writes.
+ */
+ printf(LABEL_FORMAT, "write, close, fsync");
+ fflush(stdout);
+
+ START_TIMER;
+ for (ops = 0; alarm_triggered == false; ops++)
+ {
+ if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
+ die("could not open output file");
+ if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
+ die("write failed");
+ close(tmpfile);
+ /* reopen file */
+ if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
+ die("could not open output file");
+ if (fsync(tmpfile) != 0)
+ die("fsync failed");
+ close(tmpfile);
+ }
+ STOP_TIMER;
+}
+
+static void
+test_non_sync(void)
+{
+ int tmpfile,
+ ops;
+
+ /*
+ * Test a simple write without fsync
+ */
+ printf(_("\nNon-sync'ed %dkB writes:\n"), XLOG_BLCKSZ_K);
+ printf(LABEL_FORMAT, "write");
+ fflush(stdout);
+
+ if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
+ die("could not open output file");
+ START_TIMER;
+ for (ops = 0; alarm_triggered == false; ops++)
+ {
+ if (pg_pwrite(tmpfile, buf, XLOG_BLCKSZ, 0) != XLOG_BLCKSZ)
+ die("write failed");
+ }
+ STOP_TIMER;
+ close(tmpfile);
+}
+
+static void
+signal_cleanup(int signum)
+{
+ /* Delete the file if it exists. Ignore errors */
+ if (needs_unlink)
+ unlink(filename);
+ /* Finish incomplete line on stdout */
+ puts("");
+ exit(signum);
+}
+
+#ifdef HAVE_FSYNC_WRITETHROUGH
+
+static int
+pg_fsync_writethrough(int fd)
+{
+#ifdef WIN32
+ return _commit(fd);
+#elif defined(F_FULLFSYNC)
+ return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
+#else
+ errno = ENOSYS;
+ return -1;
+#endif
+}
+#endif
+
+/*
+ * print out the writes per second for tests
+ */
+static void
+print_elapse(struct timeval start_t, struct timeval stop_t, int ops)
+{
+ double total_time = (stop_t.tv_sec - start_t.tv_sec) +
+ (stop_t.tv_usec - start_t.tv_usec) * 0.000001;
+ double per_second = ops / total_time;
+ double avg_op_time_us = (total_time / ops) * USECS_SEC;
+
+ printf(_(OPS_FORMAT), per_second, avg_op_time_us);
+}
+
+#ifndef WIN32
+static void
+process_alarm(int sig)
+{
+ alarm_triggered = true;
+}
+#else
+static DWORD WINAPI
+process_alarm(LPVOID param)
+{
+ /* WIN32 doesn't support alarm, so we create a thread and sleep here */
+ Sleep(secs_per_test * 1000);
+ alarm_triggered = true;
+ ExitThread(0);
+}
+#endif