diff options
Diffstat (limited to 'src/pmdk/src/libpmem2/map_posix.c')
-rw-r--r-- | src/pmdk/src/libpmem2/map_posix.c | 609 |
1 file changed, 609 insertions, 0 deletions
diff --git a/src/pmdk/src/libpmem2/map_posix.c b/src/pmdk/src/libpmem2/map_posix.c new file mode 100644 index 000000000..0b042aa68 --- /dev/null +++ b/src/pmdk/src/libpmem2/map_posix.c @@ -0,0 +1,609 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* Copyright 2019-2020, Intel Corporation */ + +/* + * map_posix.c -- pmem2_map (POSIX) + */ + +#include <errno.h> +#include <stdbool.h> +#include <string.h> +#include <sys/mman.h> + +#include "libpmem2.h" + +#include "alloc.h" +#include "auto_flush.h" +#include "config.h" +#include "file.h" +#include "map.h" +#include "out.h" +#include "persist.h" +#include "pmem2_utils.h" +#include "source.h" +#include "sys_util.h" +#include "valgrind_internal.h" + +#ifndef MAP_SYNC +#define MAP_SYNC 0x80000 +#endif + +#ifndef MAP_SHARED_VALIDATE +#define MAP_SHARED_VALIDATE 0x03 +#endif + +#define MEGABYTE ((uintptr_t)1 << 20) +#define GIGABYTE ((uintptr_t)1 << 30) + +/* indicates the cases in which the error cannot occur */ +#define GRAN_IMPOSSIBLE "impossible" +#ifdef __linux__ + /* requested CACHE_LINE, available PAGE */ +#define REQ_CL_AVAIL_PG \ + "requested granularity not available because fd doesn't point to DAX-enabled file " \ + "or kernel doesn't support MAP_SYNC flag (Linux >= 4.15)" + +/* requested BYTE, available PAGE */ +#define REQ_BY_AVAIL_PG REQ_CL_AVAIL_PG + +/* requested BYTE, available CACHE_LINE */ +#define REQ_BY_AVAIL_CL \ + "requested granularity not available because the platform doesn't support eADR" + +static const char *granularity_err_msg[3][3] = { +/* requested granularity / available granularity */ +/* -------------------------------------------------------------------- */ +/* BYTE CACHE_LINE PAGE */ +/* -------------------------------------------------------------------- */ +/* BYTE */ {GRAN_IMPOSSIBLE, REQ_BY_AVAIL_CL, REQ_BY_AVAIL_PG}, +/* CL */ {GRAN_IMPOSSIBLE, GRAN_IMPOSSIBLE, REQ_CL_AVAIL_PG}, +/* PAGE */ {GRAN_IMPOSSIBLE, GRAN_IMPOSSIBLE, GRAN_IMPOSSIBLE}}; +#else +/* requested CACHE_LINE, 
available PAGE */ +#define REQ_CL_AVAIL_PG \ + "the operating system doesn't provide a method of detecting granularity" + +/* requested BYTE, available PAGE */ +#define REQ_BY_AVAIL_PG \ + "the operating system doesn't provide a method of detecting whether the platform supports eADR" + +static const char *granularity_err_msg[3][3] = { +/* requested granularity / available granularity */ +/* -------------------------------------------------------------------- */ +/* BYTE CACHE_LINE PAGE */ +/* -------------------------------------------------------------------- */ +/* BYTE */ {GRAN_IMPOSSIBLE, GRAN_IMPOSSIBLE, REQ_BY_AVAIL_PG}, +/* CL */ {GRAN_IMPOSSIBLE, GRAN_IMPOSSIBLE, REQ_CL_AVAIL_PG}, +/* PAGE */ {GRAN_IMPOSSIBLE, GRAN_IMPOSSIBLE, GRAN_IMPOSSIBLE}}; +#endif + +/* + * get_map_alignment -- (internal) choose the desired mapping alignment + * + * The smallest supported alignment is 2 megabytes because of the object + * alignment requirements. Changing this value to 4 kilobytes constitutes a + * layout change. + * + * Use 1GB page alignment only if the mapping length is at least + * twice as big as the page size. + */ +static inline size_t +get_map_alignment(size_t len, size_t req_align) +{ + size_t align = 2 * MEGABYTE; + if (req_align) + align = req_align; + else if (len >= 2 * GIGABYTE) + align = GIGABYTE; + + return align; +} + +/* + * map_reserve -- (internal) reserve an address for mmap() + * + * ALSR in 64-bit Linux kernel uses 28-bit of randomness for mmap + * (bit positions 12-39), which means the base mapping address is randomized + * within [0..1024GB] range, with 4KB granularity. Assuming additional + * 1GB alignment, it results in 1024 possible locations. + */ +static int +map_reserve(size_t len, size_t alignment, void **reserv, size_t *reslen, + const struct pmem2_config *cfg) +{ + ASSERTne(reserv, NULL); + + size_t dlength = len + alignment; /* dummy length */ + + /* + * Create dummy mapping to find an unused region of given size. 
+ * Request for increased size for later address alignment. + * Use MAP_PRIVATE with read-only access to simulate + * zero cost for overcommit accounting. Note: MAP_NORESERVE + * flag is ignored if overcommit is disabled (mode 2). + */ + char *daddr = mmap(NULL, dlength, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (daddr == MAP_FAILED) { + if (errno == EEXIST) { + ERR("!mmap MAP_FIXED_NOREPLACE"); + return PMEM2_E_MAPPING_EXISTS; + } + ERR("!mmap MAP_ANONYMOUS"); + return PMEM2_E_ERRNO; + } + + LOG(4, "system choice %p", daddr); + *reserv = (void *)roundup((uintptr_t)daddr, alignment); + /* + * since the last part of the reservation from (reserv + reslen == end) + * will be unmapped, the 'end' address has to be page-aligned. + * 'reserv' is already page-aligned (or even aligned to multiple of page + * size) so it is enough to page-align the 'reslen' value. + */ + *reslen = roundup(len, Pagesize); + LOG(4, "hint %p", *reserv); + + /* + * The placeholder mapping is divided into few parts: + * + * 1 2 3 4 5 + * |......|uuuuuuuuu|rrr|.................| + * + * Addresses: + * 1 == daddr + * 2 == reserv + * 3 == reserv + len + * 4 == reserv + reslen == end (has to be page-aligned) + * 5 == daddr + dlength + * + * Key: + * - '.' is an unused part of the placeholder + * - 'u' is where the actual mapping lies + * - 'r' is what reserved as padding + */ + + /* unmap the placeholder before the actual mapping */ + const size_t before = (uintptr_t)(*reserv) - (uintptr_t)daddr; + if (before) { + if (munmap(daddr, before)) { + ERR("!munmap"); + return PMEM2_E_ERRNO; + } + } + + /* unmap the placeholder after the actual mapping */ + const size_t after = dlength - *reslen - before; + void *end = (void *)((uintptr_t)(*reserv) + (uintptr_t)*reslen); + if (after) + if (munmap(end, after)) { + ERR("!munmap"); + return PMEM2_E_ERRNO; + } + + return 0; +} + +/* + * file_map -- (internal) memory map given file into memory + * If (flags & MAP_PRIVATE) it uses just mmap. 
Otherwise, it tries to mmap with + * (flags | MAP_SHARED_VALIDATE | MAP_SYNC) which allows flushing from the + * user-space. If MAP_SYNC fails and the user did not specify it by himself it + * falls back to the mmap with user-provided flags. + */ +static int +file_map(void *reserv, size_t len, int proto, int flags, + int fd, off_t offset, bool *map_sync, void **base) +{ + LOG(15, "reserve %p len %zu proto %x flags %x fd %d offset %ld " + "map_sync %p", reserv, len, proto, flags, fd, offset, + map_sync); + + ASSERTne(map_sync, NULL); + ASSERTne(base, NULL); + + /* + * MAP_PRIVATE and MAP_SHARED are mutually exclusive, therefore mmap + * with MAP_PRIVATE is executed separately. + */ + if (flags & MAP_PRIVATE) { + *base = mmap(reserv, len, proto, flags, fd, offset); + if (*base == MAP_FAILED) { + ERR("!mmap"); + return PMEM2_E_ERRNO; + } + LOG(4, "mmap with MAP_PRIVATE succeeded"); + *map_sync = false; + return 0; + } + + /* try to mmap with MAP_SYNC flag */ + const int sync_flags = MAP_SHARED_VALIDATE | MAP_SYNC; + *base = mmap(reserv, len, proto, flags | sync_flags, fd, offset); + if (*base != MAP_FAILED) { + LOG(4, "mmap with MAP_SYNC succeeded"); + *map_sync = true; + return 0; + } + + /* try to mmap with MAP_SHARED flag (without MAP_SYNC) */ + if (errno == EINVAL || errno == ENOTSUP) { + LOG(4, "mmap with MAP_SYNC not supported"); + *base = mmap(reserv, len, proto, flags | MAP_SHARED, fd, + offset); + if (*base != MAP_FAILED) { + *map_sync = false; + return 0; + } + } + + ERR("!mmap"); + return PMEM2_E_ERRNO; +} + +/* + * unmap -- (internal) unmap a memory range + */ +static int +unmap(void *addr, size_t len) +{ + int retval = munmap(addr, len); + if (retval < 0) { + ERR("!munmap"); + return PMEM2_E_ERRNO; + } + + return 0; +} + +/* + * vm_reservation_mend -- replaces the given mapping with anonymous + * reservation, mending the reservation area + */ +static int +vm_reservation_mend(struct pmem2_vm_reservation *rsv, void *addr, size_t size) +{ + void *rsv_addr = 
pmem2_vm_reservation_get_address(rsv); + size_t rsv_size = pmem2_vm_reservation_get_size(rsv); + + ASSERT((char *)addr >= (char *)rsv_addr && + (char *)addr + size <= (char *)rsv_addr + rsv_size); + + char *daddr = mmap(addr, size, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (daddr == MAP_FAILED) { + ERR("!mmap MAP_ANONYMOUS"); + return PMEM2_E_ERRNO; + } + + return 0; +} + +/* + * pmem2_map_new -- map memory according to provided config + */ +int +pmem2_map_new(struct pmem2_map **map_ptr, const struct pmem2_config *cfg, + const struct pmem2_source *src) +{ + LOG(3, "cfg %p src %p map_ptr %p", cfg, src, map_ptr); + PMEM2_ERR_CLR(); + + int ret = 0; + struct pmem2_map *map; + size_t file_len; + *map_ptr = NULL; + + if (cfg->requested_max_granularity == PMEM2_GRANULARITY_INVALID) { + ERR( + "please define the max granularity requested for the mapping"); + + return PMEM2_E_GRANULARITY_NOT_SET; + } + + size_t src_alignment; + ret = pmem2_source_alignment(src, &src_alignment); + if (ret) + return ret; + + /* get file size */ + ret = pmem2_source_size(src, &file_len); + if (ret) + return ret; + + /* get offset */ + size_t effective_offset; + ret = pmem2_validate_offset(cfg, &effective_offset, src_alignment); + if (ret) + return ret; + ASSERTeq(effective_offset, cfg->offset); + + if (src->type == PMEM2_SOURCE_ANON) + effective_offset = 0; + + os_off_t off = (os_off_t)effective_offset; + + /* map input and output variables */ + bool map_sync = false; + /* + * MAP_SHARED - is required to mmap directly the underlying hardware + * MAP_FIXED - is required to mmap at exact address pointed by hint + */ + int flags = MAP_FIXED; + void *addr; + + /* "translate" pmem2 protection flags into linux flags */ + int proto = 0; + if (cfg->protection_flag == PMEM2_PROT_NONE) + proto = PROT_NONE; + if (cfg->protection_flag & PMEM2_PROT_EXEC) + proto |= PROT_EXEC; + if (cfg->protection_flag & PMEM2_PROT_READ) + proto |= PROT_READ; + if (cfg->protection_flag & 
PMEM2_PROT_WRITE) + proto |= PROT_WRITE; + + if (src->type == PMEM2_SOURCE_FD) { + if (src->value.ftype == PMEM2_FTYPE_DIR) { + ERR("the directory is not a supported file type"); + return PMEM2_E_INVALID_FILE_TYPE; + } + + ASSERT(src->value.ftype == PMEM2_FTYPE_REG || + src->value.ftype == PMEM2_FTYPE_DEVDAX); + + if (cfg->sharing == PMEM2_PRIVATE && + src->value.ftype == PMEM2_FTYPE_DEVDAX) { + ERR( + "device DAX does not support mapping with MAP_PRIVATE"); + return PMEM2_E_SRC_DEVDAX_PRIVATE; + } + } + + size_t content_length, reserved_length = 0; + ret = pmem2_config_validate_length(cfg, file_len, src_alignment); + if (ret) + return ret; + + /* without user-provided length, map to the end of the file */ + if (cfg->length) + content_length = cfg->length; + else + content_length = file_len - effective_offset; + + size_t alignment = get_map_alignment(content_length, + src_alignment); + + void *reserv_region = NULL; + void *rsv = cfg->reserv; + if (rsv) { + void *rsv_addr = pmem2_vm_reservation_get_address(rsv); + size_t rsv_size = pmem2_vm_reservation_get_size(rsv); + size_t rsv_offset = cfg->reserv_offset; + + reserved_length = roundup(content_length, Pagesize); + + if (rsv_offset % Mmap_align) { + ret = PMEM2_E_OFFSET_UNALIGNED; + ERR( + "virtual memory reservation offset %zu is not a multiple of %llu", + rsv_offset, Mmap_align); + return ret; + } + + if (rsv_offset + reserved_length > rsv_size) { + ret = PMEM2_E_LENGTH_OUT_OF_RANGE; + ERR( + "Reservation %p has not enough space for the intended content", + rsv); + return ret; + } + + reserv_region = (char *)rsv_addr + rsv_offset; + if ((size_t)reserv_region % alignment) { + ret = PMEM2_E_ADDRESS_UNALIGNED; + ERR( + "base mapping address %p (virtual memory reservation address + offset)" \ + " is not a multiple of %zu required by device DAX", + reserv_region, alignment); + return ret; + } + + /* check if the region in the reservation is occupied */ + if (vm_reservation_map_find_acquire(rsv, rsv_offset, + 
reserved_length)) { + ret = PMEM2_E_MAPPING_EXISTS; + ERR( + "region of the reservation %p at the offset %zu and " + "length %zu is at least partly occupied by other mapping", + rsv, rsv_offset, reserved_length); + goto err_reservation_release; + } + } else { + /* find a hint for the mapping */ + ret = map_reserve(content_length, alignment, &reserv_region, + &reserved_length, cfg); + if (ret != 0) { + if (ret == PMEM2_E_MAPPING_EXISTS) + LOG(1, + "given mapping region is already occupied"); + else + LOG(1, + "cannot find a contiguous region of given size"); + return ret; + } + } + + ASSERTne(reserv_region, NULL); + + if (cfg->sharing == PMEM2_PRIVATE) { + flags |= MAP_PRIVATE; + } + + int map_fd = INVALID_FD; + if (src->type == PMEM2_SOURCE_FD) { + map_fd = src->value.fd; + } else if (src->type == PMEM2_SOURCE_ANON) { + flags |= MAP_ANONYMOUS; + } else { + ASSERT(0); + } + + ret = file_map(reserv_region, content_length, proto, flags, map_fd, off, + &map_sync, &addr); + if (ret) { + /* + * unmap the reservation mapping only + * if it wasn't provided by the config + */ + if (!rsv) + munmap(reserv_region, reserved_length); + + if (ret == -EACCES) + ret = PMEM2_E_NO_ACCESS; + else if (ret == -ENOTSUP) + ret = PMEM2_E_NOSUPP; + else if (ret == -EEXIST) + ret = PMEM2_E_MAPPING_EXISTS; + goto err_reservation_release; + } + + LOG(3, "mapped at %p", addr); + + bool eADR = (pmem2_auto_flush() == 1); + enum pmem2_granularity available_min_granularity = + src->type == PMEM2_SOURCE_ANON ? 
PMEM2_GRANULARITY_BYTE : + get_min_granularity(eADR, map_sync, cfg->sharing); + + if (available_min_granularity > cfg->requested_max_granularity) { + const char *err = granularity_err_msg + [cfg->requested_max_granularity] + [available_min_granularity]; + if (strcmp(err, GRAN_IMPOSSIBLE) == 0) + FATAL( + "unhandled granularity error: available_min_granularity: %d" \ + "requested_max_granularity: %d", + available_min_granularity, + cfg->requested_max_granularity); + ERR("%s", err); + ret = PMEM2_E_GRANULARITY_NOT_SUPPORTED; + goto err_undo_mapping; + } + + /* prepare pmem2_map structure */ + map = (struct pmem2_map *)pmem2_malloc(sizeof(*map), &ret); + if (!map) + goto err_undo_mapping; + + map->addr = addr; + map->reserved_length = reserved_length; + map->content_length = content_length; + map->effective_granularity = available_min_granularity; + pmem2_set_flush_fns(map); + pmem2_set_mem_fns(map); + map->reserv = rsv; + map->source = *src; + map->source.value.fd = INVALID_FD; /* fd should not be used after map */ + + ret = pmem2_register_mapping(map); + if (ret) { + goto err_free_map_struct; + } + + if (rsv) { + ret = vm_reservation_map_register_release(rsv, map); + if (ret) + goto err_unregister_map; + } + + *map_ptr = map; + + if (src->type == PMEM2_SOURCE_FD) { + VALGRIND_REGISTER_PMEM_MAPPING(map->addr, map->content_length); + VALGRIND_REGISTER_PMEM_FILE(src->value.fd, + map->addr, map->content_length, 0); + } + + return 0; + +err_unregister_map: + pmem2_unregister_mapping(map); +err_free_map_struct: + Free(map); +err_undo_mapping: + /* + * if the reservation was given by pmem2_config, instead of unmapping, + * we will need to mend the reservation + */ + if (rsv) + vm_reservation_mend(rsv, addr, reserved_length); + else + unmap(addr, reserved_length); +err_reservation_release: + if (rsv) + vm_reservation_release(rsv); + return ret; +} + +/* + * pmem2_map_delete -- unmap the specified mapping + */ +int +pmem2_map_delete(struct pmem2_map **map_ptr) +{ + LOG(3, 
"map_ptr %p", map_ptr); + PMEM2_ERR_CLR(); + + int ret = 0; + struct pmem2_map *map = *map_ptr; + size_t map_len = map->content_length; + void *map_addr = map->addr; + struct pmem2_vm_reservation *rsv = map->reserv; + + ret = pmem2_unregister_mapping(map); + if (ret) + return ret; + + /* + * when reserved_length==0 mapping is created by pmem2_map_from_existing + * such mappings are provided by the users and shouldn't be unmapped + * by pmem2. + */ + if (map->reserved_length) { + VALGRIND_REMOVE_PMEM_MAPPING(map_addr, map_len); + + if (rsv) { + void *rsv_addr = pmem2_vm_reservation_get_address(rsv); + size_t rsv_offset = (size_t)map_addr - (size_t)rsv_addr; + if (!vm_reservation_map_find_acquire(rsv, rsv_offset, + map_len)) { + ret = PMEM2_E_MAPPING_NOT_FOUND; + goto err_reservation_release; + } + + ret = vm_reservation_mend(rsv, map_addr, map_len); + if (ret) + goto err_reservation_release; + + ret = vm_reservation_map_unregister_release(rsv, map); + if (ret) + goto err_register_map; + } else { + ret = unmap(map_addr, map_len); + if (ret) + goto err_register_map; + } + } + + Free(map); + *map_ptr = NULL; + + return 0; + +err_reservation_release: + vm_reservation_release(rsv); +err_register_map: + VALGRIND_REGISTER_PMEM_MAPPING(map_addr, map_len); + pmem2_register_mapping(map); + return ret; +} |