/*
 *	pg_test_fsync.c
 *		tests all supported fsync() methods
 */

#include "postgres_fe.h"

#include <limits.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <fcntl.h>
#include <time.h>
#include <unistd.h>
#include <signal.h>

#include "access/xlogdefs.h"
#include "common/logging.h"
#include "common/pg_prng.h"
#include "getopt_long.h"

/*
 * put the temp files in the local directory
 * unless the user specifies otherwise
 */
#define FSYNC_FILENAME	"./pg_test_fsync.out"

/* WAL block size expressed in kB, used only in report headings */
#define XLOG_BLCKSZ_K	(XLOG_BLCKSZ / 1024)

/* Column layouts shared by every result line */
#define LABEL_FORMAT		" %-30s"
#define NA_FORMAT			"%21s\n"
/* translator: maintain alignment with NA_FORMAT */
#define OPS_FORMAT			gettext_noop("%13.3f ops/sec %6.0f usecs/op\n")
#define USECS_SEC			1000000

/* These are macros to avoid timing the function call overhead. */
#ifndef WIN32
/*
 * Clear the "time is up" flag, arm a SIGALRM for secs_per_test seconds and
 * record the starting timestamp in start_t.
 */
#define START_TIMER \
do { \
	alarm_triggered = false; \
	alarm(secs_per_test); \
	gettimeofday(&start_t, NULL); \
} while (0)
#else
/*
 * WIN32 doesn't support alarm, so we create a thread and sleep there.
 * CreateThread() reports failure by returning NULL.
 */
#define START_TIMER \
do { \
	alarm_triggered = false; \
	if (CreateThread(NULL, 0, process_alarm, NULL, 0, NULL) == NULL) \
		pg_fatal("could not create thread for alarm"); \
	gettimeofday(&start_t, NULL); \
} while (0)
#endif

/*
 * Record the ending timestamp in stop_t and print the result line.  Expects
 * a variable named "ops" holding the operation count to be in scope.
 */
#define STOP_TIMER	\
do { \
	gettimeofday(&stop_t, NULL); \
	print_elapse(start_t, stop_t, ops); \
} while (0)


static const char *progname;

/* duration of each individual test, settable with -s/--secs-per-test */
static unsigned int secs_per_test = 5;

/*
 * Set once the test file has been created; read by signal_cleanup(), hence
 * volatile sig_atomic_t.
 */
static volatile sig_atomic_t needs_unlink = 0;

/*
 * full_buf is the raw data buffer; buf points at an XLOG_BLCKSZ-aligned
 * address inside it (see prepare_buf()).  filename is the test file path.
 */
static char full_buf[DEFAULT_XLOG_SEG_SIZE],
		   *buf,
		   *filename = FSYNC_FILENAME;
static struct timeval start_t,
			stop_t;

/*
 * Set by process_alarm() when the current test's time is up and polled by
 * every test loop, hence volatile sig_atomic_t.
 */
static volatile sig_atomic_t alarm_triggered = false;

static void handle_args(int argc, char *argv[]);
static void prepare_buf(void);
static void test_open(void);
static void test_non_sync(void);
static void test_sync(int writes_per_op);
static void test_open_syncs(void);
static void test_open_sync(const char *msg, int writes_size);
static void test_file_descriptor_sync(void);

#ifndef WIN32
static void process_alarm(int sig);
#else
static DWORD WINAPI process_alarm(LPVOID param);
#endif
static void signal_cleanup(int sig);
#ifdef HAVE_FSYNC_WRITETHROUGH static int pg_fsync_writethrough(int fd); #endif static void print_elapse(struct timeval start_t, struct timeval stop_t, int ops); #define die(msg) pg_fatal("%s: %m", _(msg)) int main(int argc, char *argv[]) { pg_logging_init(argv[0]); set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_test_fsync")); progname = get_progname(argv[0]); handle_args(argc, argv); /* Prevent leaving behind the test file */ pqsignal(SIGINT, signal_cleanup); pqsignal(SIGTERM, signal_cleanup); #ifndef WIN32 pqsignal(SIGALRM, process_alarm); #endif #ifdef SIGHUP /* Not defined on win32 */ pqsignal(SIGHUP, signal_cleanup); #endif pg_prng_seed(&pg_global_prng_state, (uint64) time(NULL)); prepare_buf(); test_open(); /* Test using 1 XLOG_BLCKSZ write */ test_sync(1); /* Test using 2 XLOG_BLCKSZ writes */ test_sync(2); test_open_syncs(); test_file_descriptor_sync(); test_non_sync(); unlink(filename); return 0; } static void handle_args(int argc, char *argv[]) { static struct option long_options[] = { {"filename", required_argument, NULL, 'f'}, {"secs-per-test", required_argument, NULL, 's'}, {NULL, 0, NULL, 0} }; int option; /* Command line option */ int optindex = 0; /* used by getopt_long */ unsigned long optval; /* used for option parsing */ char *endptr; if (argc > 1) { if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) { printf(_("Usage: %s [-f FILENAME] [-s SECS-PER-TEST]\n"), progname); exit(0); } if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) { puts("pg_test_fsync (PostgreSQL) " PG_VERSION); exit(0); } } while ((option = getopt_long(argc, argv, "f:s:", long_options, &optindex)) != -1) { switch (option) { case 'f': filename = pg_strdup(optarg); break; case 's': errno = 0; optval = strtoul(optarg, &endptr, 10); if (endptr == optarg || *endptr != '\0' || errno != 0 || optval != (unsigned int) optval) { pg_log_error("invalid argument for option %s", "--secs-per-test"); pg_log_error_hint("Try \"%s --help\" for more 
information.", progname); exit(1); } secs_per_test = (unsigned int) optval; if (secs_per_test == 0) pg_fatal("%s must be in range %u..%u", "--secs-per-test", 1, UINT_MAX); break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname); exit(1); } } if (argc > optind) { pg_log_error("too many command-line arguments (first is \"%s\")", argv[optind]); pg_log_error_hint("Try \"%s --help\" for more information.", progname); exit(1); } printf(ngettext("%u second per test\n", "%u seconds per test\n", secs_per_test), secs_per_test); #if defined(O_DIRECT) printf(_("O_DIRECT supported on this platform for open_datasync and open_sync.\n")); #elif defined(F_NOCACHE) printf(_("F_NOCACHE supported on this platform for open_datasync and open_sync.\n")); #else printf(_("Direct I/O is not supported on this platform.\n")); #endif } static void prepare_buf(void) { int ops; /* write random data into buffer */ for (ops = 0; ops < DEFAULT_XLOG_SEG_SIZE; ops++) full_buf[ops] = (char) pg_prng_int32(&pg_global_prng_state); buf = (char *) TYPEALIGN(XLOG_BLCKSZ, full_buf); } static void test_open(void) { int tmpfile; /* * test if we can open the target file */ if ((tmpfile = open(filename, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1) die("could not open output file"); needs_unlink = 1; if (write(tmpfile, full_buf, DEFAULT_XLOG_SEG_SIZE) != DEFAULT_XLOG_SEG_SIZE) die("write failed"); /* fsync now so that dirty buffers don't skew later tests */ if (fsync(tmpfile) != 0) die("fsync failed"); close(tmpfile); } static int open_direct(const char *path, int flags, mode_t mode) { int fd; #ifdef O_DIRECT flags |= O_DIRECT; #endif fd = open(path, flags, mode); #if !defined(O_DIRECT) && defined(F_NOCACHE) if (fd >= 0 && fcntl(fd, F_NOCACHE, 1) < 0) { int save_errno = errno; close(fd); errno = save_errno; return -1; } #endif return fd; } static void test_sync(int writes_per_op) { int tmpfile, ops, writes; bool fs_warning = 
false; if (writes_per_op == 1) printf(_("\nCompare file sync methods using one %dkB write:\n"), XLOG_BLCKSZ_K); else printf(_("\nCompare file sync methods using two %dkB writes:\n"), XLOG_BLCKSZ_K); printf(_("(in wal_sync_method preference order, except fdatasync is Linux's default)\n")); /* * Test open_datasync if available */ printf(LABEL_FORMAT, "open_datasync"); fflush(stdout); #ifdef OPEN_DATASYNC_FLAG if ((tmpfile = open_direct(filename, O_RDWR | O_DSYNC | PG_BINARY, 0)) == -1) { printf(NA_FORMAT, _("n/a*")); fs_warning = true; } else { START_TIMER; for (ops = 0; alarm_triggered == false; ops++) { for (writes = 0; writes < writes_per_op; writes++) if (pg_pwrite(tmpfile, buf, XLOG_BLCKSZ, writes * XLOG_BLCKSZ) != XLOG_BLCKSZ) die("write failed"); } STOP_TIMER; close(tmpfile); } #else printf(NA_FORMAT, _("n/a")); #endif /* * Test fdatasync if available */ printf(LABEL_FORMAT, "fdatasync"); fflush(stdout); #ifdef HAVE_FDATASYNC if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1) die("could not open output file"); START_TIMER; for (ops = 0; alarm_triggered == false; ops++) { for (writes = 0; writes < writes_per_op; writes++) if (pg_pwrite(tmpfile, buf, XLOG_BLCKSZ, writes * XLOG_BLCKSZ) != XLOG_BLCKSZ) die("write failed"); fdatasync(tmpfile); } STOP_TIMER; close(tmpfile); #else printf(NA_FORMAT, _("n/a")); #endif /* * Test fsync */ printf(LABEL_FORMAT, "fsync"); fflush(stdout); if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1) die("could not open output file"); START_TIMER; for (ops = 0; alarm_triggered == false; ops++) { for (writes = 0; writes < writes_per_op; writes++) if (pg_pwrite(tmpfile, buf, XLOG_BLCKSZ, writes * XLOG_BLCKSZ) != XLOG_BLCKSZ) die("write failed"); if (fsync(tmpfile) != 0) die("fsync failed"); } STOP_TIMER; close(tmpfile); /* * If fsync_writethrough is available, test as well */ printf(LABEL_FORMAT, "fsync_writethrough"); fflush(stdout); #ifdef HAVE_FSYNC_WRITETHROUGH if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) 
== -1) die("could not open output file"); START_TIMER; for (ops = 0; alarm_triggered == false; ops++) { for (writes = 0; writes < writes_per_op; writes++) if (pg_pwrite(tmpfile, buf, XLOG_BLCKSZ, writes * XLOG_BLCKSZ) != XLOG_BLCKSZ) die("write failed"); if (pg_fsync_writethrough(tmpfile) != 0) die("fsync failed"); } STOP_TIMER; close(tmpfile); #else printf(NA_FORMAT, _("n/a")); #endif /* * Test open_sync if available */ printf(LABEL_FORMAT, "open_sync"); fflush(stdout); #ifdef OPEN_SYNC_FLAG if ((tmpfile = open_direct(filename, O_RDWR | OPEN_SYNC_FLAG | PG_BINARY, 0)) == -1) { printf(NA_FORMAT, _("n/a*")); fs_warning = true; } else { START_TIMER; for (ops = 0; alarm_triggered == false; ops++) { for (writes = 0; writes < writes_per_op; writes++) if (pg_pwrite(tmpfile, buf, XLOG_BLCKSZ, writes * XLOG_BLCKSZ) != XLOG_BLCKSZ) /* * This can generate write failures if the filesystem has * a large block size, e.g. 4k, and there is no support * for O_DIRECT writes smaller than the file system block * size, e.g. XFS. */ die("write failed"); } STOP_TIMER; close(tmpfile); } #else printf(NA_FORMAT, _("n/a")); #endif if (fs_warning) { printf(_("* This file system and its mount options do not support direct\n" " I/O, e.g. 
ext4 in journaled mode.\n")); } } static void test_open_syncs(void) { printf(_("\nCompare open_sync with different write sizes:\n")); printf(_("(This is designed to compare the cost of writing 16kB in different write\n" "open_sync sizes.)\n")); test_open_sync(_(" 1 * 16kB open_sync write"), 16); test_open_sync(_(" 2 * 8kB open_sync writes"), 8); test_open_sync(_(" 4 * 4kB open_sync writes"), 4); test_open_sync(_(" 8 * 2kB open_sync writes"), 2); test_open_sync(_("16 * 1kB open_sync writes"), 1); } /* * Test open_sync with different size files */ static void test_open_sync(const char *msg, int writes_size) { #ifdef OPEN_SYNC_FLAG int tmpfile, ops, writes; #endif printf(LABEL_FORMAT, msg); fflush(stdout); #ifdef OPEN_SYNC_FLAG if ((tmpfile = open_direct(filename, O_RDWR | OPEN_SYNC_FLAG | PG_BINARY, 0)) == -1) printf(NA_FORMAT, _("n/a*")); else { START_TIMER; for (ops = 0; alarm_triggered == false; ops++) { for (writes = 0; writes < 16 / writes_size; writes++) if (pg_pwrite(tmpfile, buf, writes_size * 1024, writes * writes_size * 1024) != writes_size * 1024) die("write failed"); } STOP_TIMER; close(tmpfile); } #else printf(NA_FORMAT, _("n/a")); #endif } static void test_file_descriptor_sync(void) { int tmpfile, ops; /* * Test whether fsync can sync data written on a different descriptor for * the same file. This checks the efficiency of multi-process fsyncs * against the same file. Possibly this should be done with writethrough * on platforms which support it. 
*/ printf(_("\nTest if fsync on non-write file descriptor is honored:\n")); printf(_("(If the times are similar, fsync() can sync data written on a different\n" "descriptor.)\n")); /* * first write, fsync and close, which is the normal behavior without * multiple descriptors */ printf(LABEL_FORMAT, "write, fsync, close"); fflush(stdout); START_TIMER; for (ops = 0; alarm_triggered == false; ops++) { if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1) die("could not open output file"); if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ) die("write failed"); if (fsync(tmpfile) != 0) die("fsync failed"); close(tmpfile); /* * open and close the file again to be consistent with the following * test */ if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1) die("could not open output file"); close(tmpfile); } STOP_TIMER; /* * Now open, write, close, open again and fsync This simulates processes * fsyncing each other's writes. */ printf(LABEL_FORMAT, "write, close, fsync"); fflush(stdout); START_TIMER; for (ops = 0; alarm_triggered == false; ops++) { if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1) die("could not open output file"); if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ) die("write failed"); close(tmpfile); /* reopen file */ if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1) die("could not open output file"); if (fsync(tmpfile) != 0) die("fsync failed"); close(tmpfile); } STOP_TIMER; } static void test_non_sync(void) { int tmpfile, ops; /* * Test a simple write without fsync */ printf(_("\nNon-sync'ed %dkB writes:\n"), XLOG_BLCKSZ_K); printf(LABEL_FORMAT, "write"); fflush(stdout); if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1) die("could not open output file"); START_TIMER; for (ops = 0; alarm_triggered == false; ops++) { if (pg_pwrite(tmpfile, buf, XLOG_BLCKSZ, 0) != XLOG_BLCKSZ) die("write failed"); } STOP_TIMER; close(tmpfile); } static void signal_cleanup(int signum) { /* Delete the file if it exists. 
Ignore errors */ if (needs_unlink) unlink(filename); /* Finish incomplete line on stdout */ puts(""); exit(signum); } #ifdef HAVE_FSYNC_WRITETHROUGH static int pg_fsync_writethrough(int fd) { #ifdef WIN32 return _commit(fd); #elif defined(F_FULLFSYNC) return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0; #else errno = ENOSYS; return -1; #endif } #endif /* * print out the writes per second for tests */ static void print_elapse(struct timeval start_t, struct timeval stop_t, int ops) { double total_time = (stop_t.tv_sec - start_t.tv_sec) + (stop_t.tv_usec - start_t.tv_usec) * 0.000001; double per_second = ops / total_time; double avg_op_time_us = (total_time / ops) * USECS_SEC; printf(_(OPS_FORMAT), per_second, avg_op_time_us); } #ifndef WIN32 static void process_alarm(int sig) { alarm_triggered = true; } #else static DWORD WINAPI process_alarm(LPVOID param) { /* WIN32 doesn't support alarm, so we create a thread and sleep here */ Sleep(secs_per_test * 1000); alarm_triggered = true; ExitThread(0); } #endif