diff options
Diffstat (limited to 'src/pmdk/src/core/os_posix.c')
-rw-r--r-- | src/pmdk/src/core/os_posix.c | 353 |
1 files changed, 353 insertions, 0 deletions
diff --git a/src/pmdk/src/core/os_posix.c b/src/pmdk/src/core/os_posix.c new file mode 100644 index 000000000..c953a4046 --- /dev/null +++ b/src/pmdk/src/core/os_posix.c @@ -0,0 +1,353 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* Copyright 2017-2020, Intel Corporation */ + +/* + * os_posix.c -- abstraction layer for basic Posix functions + */ + +#define _GNU_SOURCE + +#include <fcntl.h> +#include <stdarg.h> +#include <sys/file.h> +#ifdef __FreeBSD__ +#include <sys/mount.h> +#endif +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/uio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> +#include <errno.h> +#include "util.h" +#include "out.h" +#include "os.h" + +/* + * os_open -- open abstraction layer + */ +int +os_open(const char *pathname, int flags, ...) +{ + int mode_required = (flags & O_CREAT) == O_CREAT; + +#ifdef O_TMPFILE + mode_required |= (flags & O_TMPFILE) == O_TMPFILE; +#endif + + if (mode_required) { + va_list arg; + va_start(arg, flags); + /* Clang requires int due to auto-promotion */ + int mode = va_arg(arg, int); + va_end(arg); + return open(pathname, flags, (mode_t)mode); + } else { + return open(pathname, flags); + } +} + +/* + * os_fsync -- fsync abstraction layer + */ +int +os_fsync(int fd) +{ + return fsync(fd); +} + +/* + * os_fsync_dir -- fsync the directory + */ +int +os_fsync_dir(const char *dir_name) +{ + int fd = os_open(dir_name, O_RDONLY | O_DIRECTORY); + if (fd < 0) + return -1; + + int ret = os_fsync(fd); + + os_close(fd); + + return ret; +} + +/* + * os_stat -- stat abstraction layer + */ +int +os_stat(const char *pathname, os_stat_t *buf) +{ + return stat(pathname, buf); +} + +/* + * os_unlink -- unlink abstraction layer + */ +int +os_unlink(const char *pathname) +{ + return unlink(pathname); +} + +/* + * os_access -- access abstraction layer + */ +int +os_access(const char *pathname, int mode) +{ + return access(pathname, mode); +} + +/* + * os_fopen -- fopen abstraction layer + */ +FILE * +os_fopen(const char *pathname, const char *mode) +{ + return fopen(pathname, mode); +} + +/* + * os_fdopen -- fdopen abstraction layer + */ +FILE * +os_fdopen(int fd, const char *mode) +{ + return fdopen(fd, mode); +} + +/* + * os_chmod -- chmod abstraction layer + */ +int +os_chmod(const char *pathname, mode_t mode) +{ + return chmod(pathname, mode); +} + +/* + * os_mkstemp -- mkstemp abstraction layer + */ +int +os_mkstemp(char *temp) +{ + return mkstemp(temp); +} + +/* + * os_posix_fallocate -- posix_fallocate abstraction layer + */ +int +os_posix_fallocate(int fd, os_off_t offset, off_t len) +{ + +#ifdef __FreeBSD__ + struct stat fbuf; + struct statfs fsbuf; +/* + * XXX Workaround for https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=223287 + * + * FreeBSD implements posix_fallocate with a simple block allocation/zero + * loop. If the requested size is unreasonably large, this can result in + * an uninterruptable system call that will suck up all the space in the + * file system and could take hours to fail. To avoid this, make a crude + * check to see if the requested allocation is larger than the available + * space in the file system (minus any blocks already allocated to the + * file), and if so, immediately return ENOSPC. We do the check only if + * the offset is 0; otherwise, trying to figure out how many additional + * blocks are required is too complicated. + * + * This workaround is here mostly to fail "absurdly" large requests for + * testing purposes; however, it is coded to allow normal (albeit slow) + * operation if the space can actually be allocated. Because of the way + * PMDK uses posix_fallocate, supporting Linux-style fallocate in + * FreeBSD should be considered. + */ + if (offset == 0) { + if (fstatfs(fd, &fsbuf) == -1 || fstat(fd, &fbuf) == -1) + return errno; + + size_t reqd_blocks = + ((size_t)len + (fsbuf.f_bsize - 1)) / fsbuf.f_bsize; + if (fbuf.st_blocks > 0) { + if (reqd_blocks >= (size_t)fbuf.st_blocks) + reqd_blocks -= (size_t)fbuf.st_blocks; + else + reqd_blocks = 0; + } + if (reqd_blocks > (size_t)fsbuf.f_bavail) + return ENOSPC; + } +#endif + +/* + * First, try to alloc the whole thing in one go. This allows ENOSPC to + * fail immediately -- allocating piece by piece would fill the storage + * just to abort halfway. + */ + int err = posix_fallocate(fd, offset, len); + if (err != ENOMEM && err != EINTR) + return err; + +/* + * Workaround for a bug in tmpfs where it fails large but reasonable + * requests that exceed available DRAM but fit within swap space. And + * even if a request fits within DRAM, tmpfs will evict other tasks + * just to reserve space. + * + * We also want to survive random unrelated signals. Profilers spam + * the program with SIGVTALRM/SIGPROF, anything run from a terminal can + * receive SIGNWINCH, etc. As fallocate is a long-running syscall, + * let's restart it, but in a way that avoids infinite loops. + * + * Thus: + * * limit a single syscall to 1GB + * * ignore sporadic signals + * * on repeated failures, start reducing syscall size + * * ... but not below 1MB + */ + os_off_t chunk = 1LL << 30; /* 1GB */ + int tries = 0; + + while (len) { + if (chunk > len) + chunk = len; + + int err = posix_fallocate(fd, offset, chunk); + if (!err) { + offset += chunk; + len -= chunk; + tries = 0; + } else if (err != ENOMEM && err != EINTR) { + return err; + } else if (++tries == 5) { + tries = 0; + chunk /= 2; + + /* + * Within memory pressure or a signal storm, small + * allocs are more likely to get through, but once we + * get this small, something is badly wrong. + */ + if (chunk < 1LL << 20) /* 1MB */ + return err; + } + } + + return 0; +} + +/* + * os_ftruncate -- ftruncate abstraction layer + */ +int +os_ftruncate(int fd, os_off_t length) +{ + return ftruncate(fd, length); +} + +/* + * os_flock -- flock abstraction layer + */ +int +os_flock(int fd, int operation) +{ + int opt = 0; + if (operation & OS_LOCK_EX) + opt |= LOCK_EX; + if (operation & OS_LOCK_SH) + opt |= LOCK_SH; + if (operation & OS_LOCK_UN) + opt |= LOCK_UN; + if (operation & OS_LOCK_NB) + opt |= LOCK_NB; + + return flock(fd, opt); +} + +/* + * os_writev -- writev abstraction layer + */ +ssize_t +os_writev(int fd, const struct iovec *iov, int iovcnt) +{ + return writev(fd, iov, iovcnt); +} + +/* + * os_clock_gettime -- clock_gettime abstraction layer + */ +int +os_clock_gettime(int id, struct timespec *ts) +{ + return clock_gettime(id, ts); +} + +/* + * os_rand_r -- rand_r abstraction layer + */ +unsigned +os_rand_r(unsigned *seedp) +{ + return (unsigned)rand_r(seedp); +} + +/* + * os_unsetenv -- unsetenv abstraction layer + */ +int +os_unsetenv(const char *name) +{ + return unsetenv(name); +} + +/* + * os_setenv -- setenv abstraction layer + */ +int +os_setenv(const char *name, const char *value, int overwrite) +{ + return setenv(name, value, overwrite); +} + +/* + * secure_getenv -- provide GNU secure_getenv for FreeBSD + */ +#if defined(__FreeBSD__) +static char * +secure_getenv(const char *name) +{ + if (issetugid() != 0) + return NULL; + + return getenv(name); +} +#endif + +/* + * os_getenv -- getenv abstraction layer + */ +char * +os_getenv(const char *name) +{ + return secure_getenv(name); +} + +/* + * os_strsignal -- strsignal abstraction layer + */ +const char * +os_strsignal(int sig) +{ + return strsignal(sig); +} + +int +os_execv(const char *path, char *const argv[]) +{ + return execv(path, argv); +} |