Diffstat (limited to 'src/pmdk/src/libpmemblk')
-rw-r--r--   src/pmdk/src/libpmemblk/Makefile                     |   22
-rw-r--r--   src/pmdk/src/libpmemblk/blk.c                        |  948
-rw-r--r--   src/pmdk/src/libpmemblk/blk.h                        |  102
-rw-r--r--   src/pmdk/src/libpmemblk/btt.c                        | 2051
-rw-r--r--   src/pmdk/src/libpmemblk/btt.h                        |   59
-rw-r--r--   src/pmdk/src/libpmemblk/btt_layout.h                 |  107
-rw-r--r--   src/pmdk/src/libpmemblk/libpmemblk.c                 |  200
-rw-r--r--   src/pmdk/src/libpmemblk/libpmemblk.def               |   36
-rw-r--r--   src/pmdk/src/libpmemblk/libpmemblk.link.in           |   28
-rw-r--r--   src/pmdk/src/libpmemblk/libpmemblk.rc                |   12
-rw-r--r--   src/pmdk/src/libpmemblk/libpmemblk.vcxproj           |  133
-rw-r--r--   src/pmdk/src/libpmemblk/libpmemblk.vcxproj.filters   |  217
-rw-r--r--   src/pmdk/src/libpmemblk/libpmemblk_main.c            |   32
13 files changed, 3947 insertions(+), 0 deletions(-)
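For orientation before the patch body: the library added here exposes a small block-pool API in which every one-block write is powerfail-atomic. A minimal caller might look like the following sketch (pool path, block size, and permissions are illustrative, not taken from this patch):

    #include <assert.h>
    #include <string.h>
    #include <libpmemblk.h>

    int
    main(void)
    {
    	/* create a pool of 4 KiB blocks; path and size are examples */
    	PMEMblkpool *pbp = pmemblk_create("/pmem/example.blk", 4096,
    			PMEMBLK_MIN_POOL, 0666);
    	if (pbp == NULL)
    		return 1;

    	char buf[4096];
    	memset(buf, 0xab, sizeof(buf));

    	/* each one-block write is made atomic by the BTT layer below */
    	if (pmemblk_write(pbp, buf, 0) == 0)
    		assert(pmemblk_read(pbp, buf, 0) == 0);

    	pmemblk_close(pbp);
    	return 0;
    }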
diff --git a/src/pmdk/src/libpmemblk/Makefile b/src/pmdk/src/libpmemblk/Makefile new file mode 100644 index 000000000..8f5d99ecd --- /dev/null +++ b/src/pmdk/src/libpmemblk/Makefile @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2014-2020, Intel Corporation + +# +# src/libpmemblk/Makefile -- Makefile for libpmemblk +# + +LIBRARY_NAME = pmemblk +LIBRARY_SO_VERSION = 1 +LIBRARY_VERSION = 0.0 + +include ../core/pmemcore.inc +include ../common/pmemcommon.inc +SOURCE +=\ + blk.c\ + btt.c\ + libpmemblk.c + +include ../Makefile.inc + +CFLAGS += $(LIBNDCTL_CFLAGS) +LIBS += -pthread -lpmem $(LIBNDCTL_LIBS) diff --git a/src/pmdk/src/libpmemblk/blk.c b/src/pmdk/src/libpmemblk/blk.c new file mode 100644 index 000000000..58f9c9fff --- /dev/null +++ b/src/pmdk/src/libpmemblk/blk.c @@ -0,0 +1,948 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* Copyright 2014-2020, Intel Corporation */ + +/* + * blk.c -- block memory pool entry points for libpmem + */ + +#include <inttypes.h> +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <sys/param.h> +#include <unistd.h> +#include <errno.h> +#include <time.h> +#include <stdint.h> +#include <endian.h> +#include <stdbool.h> + +#include "libpmem.h" +#include "libpmemblk.h" + +#include "mmap.h" +#include "set.h" +#include "out.h" +#include "btt.h" +#include "blk.h" +#include "util.h" +#include "sys_util.h" +#include "util_pmem.h" +#include "valgrind_internal.h" + +static const struct pool_attr Blk_create_attr = { + BLK_HDR_SIG, + BLK_FORMAT_MAJOR, + BLK_FORMAT_FEAT_DEFAULT, + {0}, {0}, {0}, {0}, {0} +}; + +static const struct pool_attr Blk_open_attr = { + BLK_HDR_SIG, + BLK_FORMAT_MAJOR, + BLK_FORMAT_FEAT_CHECK, + {0}, {0}, {0}, {0}, {0} +}; + +/* + * lane_enter -- (internal) acquire a unique lane number + */ +static void +lane_enter(PMEMblkpool *pbp, unsigned *lane) +{ + unsigned mylane; + + mylane = util_fetch_and_add32(&pbp->next_lane, 1) % pbp->nlane; + + /* lane selected, grab the per-lane lock */ + util_mutex_lock(&pbp->locks[mylane]); + + *lane = mylane; +} + +/* + * lane_exit -- (internal) drop lane lock + */ +static void +lane_exit(PMEMblkpool *pbp, unsigned mylane) +{ + util_mutex_unlock(&pbp->locks[mylane]); +} + +/* + * nsread -- (internal) read data from the namespace encapsulating the BTT + * + * This routine is provided to btt_init() to allow the btt module to + * do I/O on the memory pool containing the BTT layout. + */ +static int +nsread(void *ns, unsigned lane, void *buf, size_t count, uint64_t off) +{ + struct pmemblk *pbp = (struct pmemblk *)ns; + + LOG(13, "pbp %p lane %u count %zu off %" PRIu64, pbp, lane, count, off); + + if (off + count > pbp->datasize) { + ERR("offset + count (%zu) past end of data area (%zu)", + (size_t)off + count, pbp->datasize); + errno = EINVAL; + return -1; + } + + memcpy(buf, (char *)pbp->data + off, count); + + return 0; +} + +/* + * nswrite -- (internal) write data to the namespace encapsulating the BTT + * + * This routine is provided to btt_init() to allow the btt module to + * do I/O on the memory pool containing the BTT layout. 
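The lane_enter()/lane_exit() pair above is a reusable idiom: an atomic fetch-and-add spreads threads round-robin across a fixed set of serialized lanes, and a per-lane mutex covers the case where more threads than lanes are running. A standalone sketch of the same idiom (names hypothetical, not from this patch):

    #include <pthread.h>

    struct lanes {
    	unsigned next;			/* bumped atomically on enter */
    	unsigned nlane;			/* number of lanes */
    	pthread_mutex_t *locks;		/* one mutex per lane */
    };

    static unsigned
    lanes_enter(struct lanes *l)
    {
    	unsigned mylane = __atomic_fetch_add(&l->next, 1,
    			__ATOMIC_RELAXED) % l->nlane;
    	pthread_mutex_lock(&l->locks[mylane]);
    	return mylane;
    }

    static void
    lanes_exit(struct lanes *l, unsigned mylane)
    {
    	pthread_mutex_unlock(&l->locks[mylane]);
    }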
+ */ +static int +nswrite(void *ns, unsigned lane, const void *buf, size_t count, + uint64_t off) +{ + struct pmemblk *pbp = (struct pmemblk *)ns; + + LOG(13, "pbp %p lane %u count %zu off %" PRIu64, pbp, lane, count, off); + + if (off + count > pbp->datasize) { + ERR("offset + count (%zu) past end of data area (%zu)", + (size_t)off + count, pbp->datasize); + errno = EINVAL; + return -1; + } + + void *dest = (char *)pbp->data + off; + +#ifdef DEBUG + /* grab debug write lock */ + util_mutex_lock(&pbp->write_lock); +#endif + + /* unprotect the memory (debug version only) */ + RANGE_RW(dest, count, pbp->is_dev_dax); + + if (pbp->is_pmem) + pmem_memcpy_nodrain(dest, buf, count); + else + memcpy(dest, buf, count); + + /* protect the memory again (debug version only) */ + RANGE_RO(dest, count, pbp->is_dev_dax); + +#ifdef DEBUG + /* release debug write lock */ + util_mutex_unlock(&pbp->write_lock); +#endif + + if (pbp->is_pmem) + pmem_drain(); + else + pmem_msync(dest, count); + + return 0; +} + +/* + * nsmap -- (internal) allow direct access to a range of a namespace + * + * The caller requests a range to be "mapped" but the return value + * may indicate a smaller amount (in which case the caller is expected + * to call back later for another mapping). + * + * This routine is provided to btt_init() to allow the btt module to + * do I/O on the memory pool containing the BTT layout. + */ +static ssize_t +nsmap(void *ns, unsigned lane, void **addrp, size_t len, uint64_t off) +{ + struct pmemblk *pbp = (struct pmemblk *)ns; + + LOG(12, "pbp %p lane %u len %zu off %" PRIu64, pbp, lane, len, off); + + ASSERT(((ssize_t)len) >= 0); + + if (off + len >= pbp->datasize) { + ERR("offset + len (%zu) past end of data area (%zu)", + (size_t)off + len, pbp->datasize - 1); + errno = EINVAL; + return -1; + } + + /* + * Since the entire file is memory-mapped, this callback + * can always provide the entire length requested. + */ + *addrp = (char *)pbp->data + off; + + LOG(12, "returning addr %p", *addrp); + + return (ssize_t)len; +} + +/* + * nssync -- (internal) flush changes made to a namespace range + * + * This is used in conjunction with the addresses handed out by + * nsmap() above. There's no need to sync things written via + * nswrite() since those changes are flushed each time nswrite() + * is called. + * + * This routine is provided to btt_init() to allow the btt module to + * do I/O on the memory pool containing the BTT layout. + */ +static void +nssync(void *ns, unsigned lane, void *addr, size_t len) +{ + struct pmemblk *pbp = (struct pmemblk *)ns; + + LOG(12, "pbp %p lane %u addr %p len %zu", pbp, lane, addr, len); + + if (pbp->is_pmem) + pmem_persist(addr, len); + else + pmem_msync(addr, len); +} + +/* + * nszero -- (internal) zero data in the namespace encapsulating the BTT + * + * This routine is provided to btt_init() to allow the btt module to + * zero the memory pool containing the BTT layout. 
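nswrite() and nssync() above split durability the way libpmem expects: flush stores immediately for true pmem mappings, fall back to msync otherwise. Condensed into one helper (a sketch, assuming a mapping whose is_pmem status came from pmem_map_file()):

    #include <string.h>
    #include <libpmem.h>

    /* store count bytes durably at dest, for pmem and non-pmem mappings */
    static void
    store_durably(void *dest, const void *src, size_t count, int is_pmem)
    {
    	if (is_pmem) {
    		pmem_memcpy_nodrain(dest, src, count);	/* flush, no drain */
    		pmem_drain();				/* order the flushes */
    	} else {
    		memcpy(dest, src, count);
    		pmem_msync(dest, count);	/* page-granularity msync */
    	}
    }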
+ */ +static int +nszero(void *ns, unsigned lane, size_t count, uint64_t off) +{ + struct pmemblk *pbp = (struct pmemblk *)ns; + + LOG(13, "pbp %p lane %u count %zu off %" PRIu64, pbp, lane, count, off); + + if (off + count > pbp->datasize) { + ERR("offset + count (%zu) past end of data area (%zu)", + (size_t)off + count, pbp->datasize); + errno = EINVAL; + return -1; + } + + void *dest = (char *)pbp->data + off; + + /* unprotect the memory (debug version only) */ + RANGE_RW(dest, count, pbp->is_dev_dax); + + pmem_memset_persist(dest, 0, count); + + /* protect the memory again (debug version only) */ + RANGE_RO(dest, count, pbp->is_dev_dax); + + return 0; +} + +/* callbacks for btt_init() */ +static struct ns_callback ns_cb = { + .nsread = nsread, + .nswrite = nswrite, + .nszero = nszero, + .nsmap = nsmap, + .nssync = nssync, + .ns_is_zeroed = 0 +}; + +/* + * blk_descr_create -- (internal) create block memory pool descriptor + */ +static void +blk_descr_create(PMEMblkpool *pbp, uint32_t bsize, int zeroed) +{ + LOG(3, "pbp %p bsize %u zeroed %d", pbp, bsize, zeroed); + + /* create the required metadata */ + pbp->bsize = htole32(bsize); + util_persist(pbp->is_pmem, &pbp->bsize, sizeof(bsize)); + + pbp->is_zeroed = zeroed; + util_persist(pbp->is_pmem, &pbp->is_zeroed, sizeof(pbp->is_zeroed)); +} + +/* + * blk_descr_check -- (internal) validate block memory pool descriptor + */ +static int +blk_descr_check(PMEMblkpool *pbp, size_t *bsize) +{ + LOG(3, "pbp %p bsize %zu", pbp, *bsize); + + size_t hdr_bsize = le32toh(pbp->bsize); + if (*bsize && *bsize != hdr_bsize) { + ERR("wrong bsize (%zu), pool created with bsize %zu", + *bsize, hdr_bsize); + errno = EINVAL; + return -1; + } + *bsize = hdr_bsize; + LOG(3, "using block size from header: %zu", *bsize); + + return 0; +} + +/* + * blk_runtime_init -- (internal) initialize block memory pool runtime data + */ +static int +blk_runtime_init(PMEMblkpool *pbp, size_t bsize, int rdonly) +{ + LOG(3, "pbp %p bsize %zu rdonly %d", + pbp, bsize, rdonly); + + /* remove volatile part of header */ + VALGRIND_REMOVE_PMEM_MAPPING(&pbp->addr, + sizeof(struct pmemblk) - + sizeof(struct pool_hdr) - + sizeof(pbp->bsize) - + sizeof(pbp->is_zeroed)); + + /* + * Use some of the memory pool area for run-time info. This + * run-time state is never loaded from the file, it is always + * created here, so no need to worry about byte-order. 
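blk_runtime_init(), whose body follows, hands the ns_cb table above to btt_init(). An out-of-tree consumer of the BTT module would wire up its own namespace the same way; everything prefixed my_ below is hypothetical:

    static const struct ns_callback my_ns_cb = {
    	.nsread = my_nsread,	/* read bytes at a namespace offset */
    	.nswrite = my_nswrite,	/* durable write at an offset */
    	.nszero = my_nszero,	/* durable zeroing of a range */
    	.nsmap = my_nsmap,	/* direct access to a range */
    	.nssync = my_nssync,	/* flush an nsmap'd range */
    	.ns_is_zeroed = 0,	/* media not known to be zeroed */
    };

    struct btt *bttp = btt_init(my_rawsize, my_lbasize, my_parent_uuid,
    		my_maxlane, my_ns, &my_ns_cb);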
+ */ + pbp->rdonly = rdonly; + pbp->data = (char *)pbp->addr + + roundup(sizeof(*pbp), BLK_FORMAT_DATA_ALIGN); + ASSERT(((char *)pbp->addr + pbp->size) >= (char *)pbp->data); + pbp->datasize = (size_t) + (((char *)pbp->addr + pbp->size) - (char *)pbp->data); + + LOG(4, "data area %p data size %zu bsize %zu", + pbp->data, pbp->datasize, bsize); + + long ncpus = sysconf(_SC_NPROCESSORS_ONLN); + if (ncpus < 1) + ncpus = 1; + + ns_cb.ns_is_zeroed = pbp->is_zeroed; + + /* things free by "goto err" if not NULL */ + struct btt *bttp = NULL; + os_mutex_t *locks = NULL; + + bttp = btt_init(pbp->datasize, (uint32_t)bsize, pbp->hdr.poolset_uuid, + (unsigned)ncpus * 2, pbp, &ns_cb); + + if (bttp == NULL) + goto err; /* btt_init set errno, called LOG */ + + pbp->bttp = bttp; + + pbp->nlane = btt_nlane(pbp->bttp); + pbp->next_lane = 0; + if ((locks = Malloc(pbp->nlane * sizeof(*locks))) == NULL) { + ERR("!Malloc for lane locks"); + goto err; + } + + for (unsigned i = 0; i < pbp->nlane; i++) + util_mutex_init(&locks[i]); + + pbp->locks = locks; + +#ifdef DEBUG + /* initialize debug lock */ + util_mutex_init(&pbp->write_lock); +#endif + + /* + * If possible, turn off all permissions on the pool header page. + * + * The prototype PMFS doesn't allow this when large pages are in + * use. It is not considered an error if this fails. + */ + RANGE_NONE(pbp->addr, sizeof(struct pool_hdr), pbp->is_dev_dax); + + /* the data area should be kept read-only for debug version */ + RANGE_RO(pbp->data, pbp->datasize, pbp->is_dev_dax); + + return 0; + +err: + LOG(4, "error clean up"); + int oerrno = errno; + if (bttp) + btt_fini(bttp); + errno = oerrno; + return -1; +} + +/* + * pmemblk_createU -- create a block memory pool + */ +#ifndef _WIN32 +static inline +#endif +PMEMblkpool * +pmemblk_createU(const char *path, size_t bsize, size_t poolsize, mode_t mode) +{ + LOG(3, "path %s bsize %zu poolsize %zu mode %o", + path, bsize, poolsize, mode); + + /* check if bsize is valid */ + if (bsize == 0) { + ERR("Invalid block size %zu", bsize); + errno = EINVAL; + return NULL; + } + + if (bsize > UINT32_MAX) { + ERR("Invalid block size %zu", bsize); + errno = EINVAL; + return NULL; + } + + struct pool_set *set; + struct pool_attr adj_pool_attr = Blk_create_attr; + + /* force set SDS feature */ + if (SDS_at_create) + adj_pool_attr.features.incompat |= POOL_FEAT_SDS; + else + adj_pool_attr.features.incompat &= ~POOL_FEAT_SDS; + + if (util_pool_create(&set, path, poolsize, PMEMBLK_MIN_POOL, + PMEMBLK_MIN_PART, &adj_pool_attr, NULL, + REPLICAS_DISABLED) != 0) { + LOG(2, "cannot create pool or pool set"); + return NULL; + } + + ASSERT(set->nreplicas > 0); + + struct pool_replica *rep = set->replica[0]; + PMEMblkpool *pbp = rep->part[0].addr; + + VALGRIND_REMOVE_PMEM_MAPPING(&pbp->addr, + sizeof(struct pmemblk) - + ((uintptr_t)&pbp->addr - (uintptr_t)&pbp->hdr)); + + pbp->addr = pbp; + pbp->size = rep->repsize; + pbp->set = set; + pbp->is_pmem = rep->is_pmem; + pbp->is_dev_dax = rep->part[0].is_dev_dax; + + /* is_dev_dax implies is_pmem */ + ASSERT(!pbp->is_dev_dax || pbp->is_pmem); + + /* create pool descriptor */ + blk_descr_create(pbp, (uint32_t)bsize, set->zeroed); + + /* initialize runtime parts */ + if (blk_runtime_init(pbp, bsize, 0) != 0) { + ERR("pool initialization failed"); + goto err; + } + + if (util_poolset_chmod(set, mode)) + goto err; + + util_poolset_fdclose(set); + + LOG(3, "pbp %p", pbp); + return pbp; + +err: + LOG(4, "error clean up"); + int oerrno = errno; + util_poolset_close(set, DELETE_CREATED_PARTS); + errno = 
oerrno; + return NULL; +} + +#ifndef _WIN32 +/* + * pmemblk_create -- create a block memory pool + */ +PMEMblkpool * +pmemblk_create(const char *path, size_t bsize, size_t poolsize, mode_t mode) +{ + return pmemblk_createU(path, bsize, poolsize, mode); +} +#else +/* + * pmemblk_createW -- create a block memory pool + */ +PMEMblkpool * +pmemblk_createW(const wchar_t *path, size_t bsize, size_t poolsize, + mode_t mode) +{ + char *upath = util_toUTF8(path); + if (upath == NULL) + return NULL; + + PMEMblkpool *ret = pmemblk_createU(upath, bsize, poolsize, mode); + + util_free_UTF8(upath); + return ret; +} +#endif + +/* + * blk_open_common -- (internal) open a block memory pool + * + * This routine does all the work, but takes a cow flag so internal + * calls can map a read-only pool if required. + * + * Passing in bsize == 0 means a valid pool header must exist (which + * will supply the block size). + */ +static PMEMblkpool * +blk_open_common(const char *path, size_t bsize, unsigned flags) +{ + LOG(3, "path %s bsize %zu flags 0x%x", path, bsize, flags); + + struct pool_set *set; + + if (util_pool_open(&set, path, PMEMBLK_MIN_PART, &Blk_open_attr, + NULL, NULL, flags) != 0) { + LOG(2, "cannot open pool or pool set"); + return NULL; + } + + ASSERT(set->nreplicas > 0); + + struct pool_replica *rep = set->replica[0]; + PMEMblkpool *pbp = rep->part[0].addr; + + VALGRIND_REMOVE_PMEM_MAPPING(&pbp->addr, + sizeof(struct pmemblk) - + ((uintptr_t)&pbp->addr - (uintptr_t)&pbp->hdr)); + + pbp->addr = pbp; + pbp->size = rep->repsize; + pbp->set = set; + pbp->is_pmem = rep->is_pmem; + pbp->is_dev_dax = rep->part[0].is_dev_dax; + + /* is_dev_dax implies is_pmem */ + ASSERT(!pbp->is_dev_dax || pbp->is_pmem); + + if (set->nreplicas > 1) { + errno = ENOTSUP; + ERR("!replicas not supported"); + goto err; + } + + /* validate pool descriptor */ + if (blk_descr_check(pbp, &bsize) != 0) { + LOG(2, "descriptor check failed"); + goto err; + } + + /* initialize runtime parts */ + if (blk_runtime_init(pbp, bsize, set->rdonly) != 0) { + ERR("pool initialization failed"); + goto err; + } + + util_poolset_fdclose(set); + + LOG(3, "pbp %p", pbp); + return pbp; + +err: + LOG(4, "error clean up"); + int oerrno = errno; + util_poolset_close(set, DO_NOT_DELETE_PARTS); + errno = oerrno; + return NULL; +} + +/* + * pmemblk_openU -- open a block memory pool + */ +#ifndef _WIN32 +static inline +#endif +PMEMblkpool * +pmemblk_openU(const char *path, size_t bsize) +{ + LOG(3, "path %s bsize %zu", path, bsize); + + return blk_open_common(path, bsize, COW_at_open ? 
POOL_OPEN_COW : 0); +} + +#ifndef _WIN32 +/* + * pmemblk_open -- open a block memory pool + */ +PMEMblkpool * +pmemblk_open(const char *path, size_t bsize) +{ + return pmemblk_openU(path, bsize); +} +#else +/* + * pmemblk_openW -- open a block memory pool + */ +PMEMblkpool * +pmemblk_openW(const wchar_t *path, size_t bsize) +{ + char *upath = util_toUTF8(path); + if (upath == NULL) + return NULL; + + PMEMblkpool *ret = pmemblk_openU(upath, bsize); + + util_free_UTF8(upath); + return ret; +} +#endif + +/* + * pmemblk_close -- close a block memory pool + */ +void +pmemblk_close(PMEMblkpool *pbp) +{ + LOG(3, "pbp %p", pbp); + + btt_fini(pbp->bttp); + if (pbp->locks) { + for (unsigned i = 0; i < pbp->nlane; i++) + util_mutex_destroy(&pbp->locks[i]); + Free((void *)pbp->locks); + } + +#ifdef DEBUG + /* destroy debug lock */ + util_mutex_destroy(&pbp->write_lock); +#endif + + util_poolset_close(pbp->set, DO_NOT_DELETE_PARTS); +} + +/* + * pmemblk_bsize -- return size of block for specified pool + */ +size_t +pmemblk_bsize(PMEMblkpool *pbp) +{ + LOG(3, "pbp %p", pbp); + + return le32toh(pbp->bsize); +} + +/* + * pmemblk_nblock -- return number of usable blocks in a block memory pool + */ +size_t +pmemblk_nblock(PMEMblkpool *pbp) +{ + LOG(3, "pbp %p", pbp); + + return btt_nlba(pbp->bttp); +} + +/* + * pmemblk_read -- read a block in a block memory pool + */ +int +pmemblk_read(PMEMblkpool *pbp, void *buf, long long blockno) +{ + LOG(3, "pbp %p buf %p blockno %lld", pbp, buf, blockno); + + if (blockno < 0) { + ERR("negative block number"); + errno = EINVAL; + return -1; + } + + unsigned lane; + + lane_enter(pbp, &lane); + + int err = btt_read(pbp->bttp, lane, (uint64_t)blockno, buf); + + lane_exit(pbp, lane); + + return err; +} + +/* + * pmemblk_write -- write a block (atomically) in a block memory pool + */ +int +pmemblk_write(PMEMblkpool *pbp, const void *buf, long long blockno) +{ + LOG(3, "pbp %p buf %p blockno %lld", pbp, buf, blockno); + + if (pbp->rdonly) { + ERR("EROFS (pool is read-only)"); + errno = EROFS; + return -1; + } + + if (blockno < 0) { + ERR("negative block number"); + errno = EINVAL; + return -1; + } + + unsigned lane; + + lane_enter(pbp, &lane); + + int err = btt_write(pbp->bttp, lane, (uint64_t)blockno, buf); + + lane_exit(pbp, lane); + + return err; +} + +/* + * pmemblk_set_zero -- zero a block in a block memory pool + */ +int +pmemblk_set_zero(PMEMblkpool *pbp, long long blockno) +{ + LOG(3, "pbp %p blockno %lld", pbp, blockno); + + if (pbp->rdonly) { + ERR("EROFS (pool is read-only)"); + errno = EROFS; + return -1; + } + + if (blockno < 0) { + ERR("negative block number"); + errno = EINVAL; + return -1; + } + + unsigned lane; + + lane_enter(pbp, &lane); + + int err = btt_set_zero(pbp->bttp, lane, (uint64_t)blockno); + + lane_exit(pbp, lane); + + return err; +} + +/* + * pmemblk_set_error -- set the error state on a block in a block memory pool + */ +int +pmemblk_set_error(PMEMblkpool *pbp, long long blockno) +{ + LOG(3, "pbp %p blockno %lld", pbp, blockno); + + if (pbp->rdonly) { + ERR("EROFS (pool is read-only)"); + errno = EROFS; + return -1; + } + + if (blockno < 0) { + ERR("negative block number"); + errno = EINVAL; + return -1; + } + + unsigned lane; + + lane_enter(pbp, &lane); + + int err = btt_set_error(pbp->bttp, lane, (uint64_t)blockno); + + lane_exit(pbp, lane); + + return err; +} + +/* + * pmemblk_checkU -- block memory pool consistency check + */ +#ifndef _WIN32 +static inline +#endif +int +pmemblk_checkU(const char *path, size_t bsize) +{ + LOG(3, "path \"%s\" 
bsize %zu", path, bsize); + + /* map the pool read-only */ + PMEMblkpool *pbp = blk_open_common(path, bsize, POOL_OPEN_COW); + if (pbp == NULL) + return -1; /* errno set by blk_open_common() */ + + int retval = btt_check(pbp->bttp); + int oerrno = errno; + pmemblk_close(pbp); + errno = oerrno; + + return retval; +} + +#ifndef _WIN32 +/* + * pmemblk_check -- block memory pool consistency check + */ +int +pmemblk_check(const char *path, size_t bsize) +{ + return pmemblk_checkU(path, bsize); +} +#else +/* + * pmemblk_checkW -- block memory pool consistency check + */ +int +pmemblk_checkW(const wchar_t *path, size_t bsize) +{ + char *upath = util_toUTF8(path); + if (upath == NULL) + return -1; + + int ret = pmemblk_checkU(upath, bsize); + + util_free_UTF8(upath); + return ret; +} +#endif + +/* + * pmemblk_ctl_getU -- programmatically executes a read ctl query + */ +#ifndef _WIN32 +static inline +#endif +int +pmemblk_ctl_getU(PMEMblkpool *pbp, const char *name, void *arg) +{ + LOG(3, "pbp %p name %s arg %p", pbp, name, arg); + return ctl_query(pbp == NULL ? NULL : pbp->ctl, pbp, + CTL_QUERY_PROGRAMMATIC, name, CTL_QUERY_READ, arg); +} + +/* + * pmemblk_ctl_setU -- programmatically executes a write ctl query + */ +#ifndef _WIN32 +static inline +#endif +int +pmemblk_ctl_setU(PMEMblkpool *pbp, const char *name, void *arg) +{ + LOG(3, "pbp %p name %s arg %p", pbp, name, arg); + return ctl_query(pbp == NULL ? NULL : pbp->ctl, pbp, + CTL_QUERY_PROGRAMMATIC, name, CTL_QUERY_WRITE, arg); +} + +/* + * pmemblk_ctl_execU -- programmatically executes a runnable ctl query + */ +#ifndef _WIN32 +static inline +#endif +int +pmemblk_ctl_execU(PMEMblkpool *pbp, const char *name, void *arg) +{ + LOG(3, "pbp %p name %s arg %p", pbp, name, arg); + return ctl_query(pbp == NULL ? 
NULL : pbp->ctl, pbp, + CTL_QUERY_PROGRAMMATIC, name, CTL_QUERY_RUNNABLE, arg); +} + +#ifndef _WIN32 +/* + * pmemblk_ctl_get -- programmatically executes a read ctl query + */ +int +pmemblk_ctl_get(PMEMblkpool *pbp, const char *name, void *arg) +{ + return pmemblk_ctl_getU(pbp, name, arg); +} + +/* + * pmemblk_ctl_set -- programmatically executes a write ctl query + */ +int +pmemblk_ctl_set(PMEMblkpool *pbp, const char *name, void *arg) +{ + return pmemblk_ctl_setU(pbp, name, arg); +} + +/* + * pmemblk_ctl_exec -- programmatically executes a runnable ctl query + */ +int +pmemblk_ctl_exec(PMEMblkpool *pbp, const char *name, void *arg) +{ + return pmemblk_ctl_execU(pbp, name, arg); +} +#else +/* + * pmemblk_ctl_getW -- programmatically executes a read ctl query + */ +int +pmemblk_ctl_getW(PMEMblkpool *pbp, const wchar_t *name, void *arg) +{ + char *uname = util_toUTF8(name); + if (uname == NULL) + return -1; + + int ret = pmemblk_ctl_getU(pbp, uname, arg); + util_free_UTF8(uname); + + return ret; +} + +/* + * pmemblk_ctl_setW -- programmatically executes a write ctl query + */ +int +pmemblk_ctl_setW(PMEMblkpool *pbp, const wchar_t *name, void *arg) +{ + char *uname = util_toUTF8(name); + if (uname == NULL) + return -1; + + int ret = pmemblk_ctl_setU(pbp, uname, arg); + util_free_UTF8(uname); + + return ret; +} + +/* + * pmemblk_ctl_execW -- programmatically executes a runnable ctl query + */ +int +pmemblk_ctl_execW(PMEMblkpool *pbp, const wchar_t *name, void *arg) +{ + char *uname = util_toUTF8(name); + if (uname == NULL) + return -1; + + int ret = pmemblk_ctl_execU(pbp, uname, arg); + util_free_UTF8(uname); + + return ret; +} +#endif + +#if FAULT_INJECTION +void +pmemblk_inject_fault_at(enum pmem_allocation_type type, int nth, + const char *at) +{ + core_inject_fault_at(type, nth, at); +} + +int +pmemblk_fault_injection_enabled(void) +{ + return core_fault_injection_enabled(); +} +#endif diff --git a/src/pmdk/src/libpmemblk/blk.h b/src/pmdk/src/libpmemblk/blk.h new file mode 100644 index 000000000..095331b8b --- /dev/null +++ b/src/pmdk/src/libpmemblk/blk.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright 2014-2020, Intel Corporation */ + +/* + * blk.h -- internal definitions for libpmem blk module + */ + +#ifndef BLK_H +#define BLK_H 1 + +#include <stddef.h> + +#include "ctl.h" +#include "os_thread.h" +#include "pool_hdr.h" +#include "page_size.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#include "alloc.h" +#include "fault_injection.h" + +#define PMEMBLK_LOG_PREFIX "libpmemblk" +#define PMEMBLK_LOG_LEVEL_VAR "PMEMBLK_LOG_LEVEL" +#define PMEMBLK_LOG_FILE_VAR "PMEMBLK_LOG_FILE" + +/* attributes of the blk memory pool format for the pool header */ +#define BLK_HDR_SIG "PMEMBLK" /* must be 8 bytes including '\0' */ +#define BLK_FORMAT_MAJOR 1 + +#define BLK_FORMAT_FEAT_DEFAULT \ + {POOL_FEAT_COMPAT_DEFAULT, POOL_FEAT_INCOMPAT_DEFAULT, 0x0000} + +#define BLK_FORMAT_FEAT_CHECK \ + {POOL_FEAT_COMPAT_VALID, POOL_FEAT_INCOMPAT_VALID, 0x0000} + +static const features_t blk_format_feat_default = BLK_FORMAT_FEAT_DEFAULT; + +struct pmemblk { + struct pool_hdr hdr; /* memory pool header */ + + /* root info for on-media format... */ + uint32_t bsize; /* block size */ + + /* flag indicating if the pool was zero-initialized */ + int is_zeroed; + + /* some run-time state, allocated out of memory pool... 
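The pmemblk_ctl_get/_set/_exec wrappers defined above all funnel into the generic PMDK ctl tree. A typical use is flipping a global knob before any pool is opened; the "prefault.at_open" name below is assumed from the pmemblk_ctl_get(3) namespace rather than taken from this diff:

    #include <libpmemblk.h>

    int enable = 1;
    /* NULL pool handle addresses the global ctl tree */
    int ret = pmemblk_ctl_set(NULL, "prefault.at_open", &enable);
    /* ret != 0 means failure, with errno set by the ctl machinery */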
*/ + void *addr; /* mapped region */ + size_t size; /* size of mapped region */ + int is_pmem; /* true if pool is PMEM */ + int rdonly; /* true if pool is opened read-only */ + void *data; /* post-header data area */ + size_t datasize; /* size of data area */ + size_t nlba; /* number of LBAs in pool */ + struct btt *bttp; /* btt handle */ + unsigned nlane; /* number of lanes */ + unsigned next_lane; /* used to rotate through lanes */ + os_mutex_t *locks; /* one per lane */ + int is_dev_dax; /* true if mapped on device dax */ + struct ctl *ctl; /* top level node of the ctl tree structure */ + + struct pool_set *set; /* pool set info */ + +#ifdef DEBUG + /* held during read/write mprotected sections */ + os_mutex_t write_lock; +#endif +}; + +/* data area starts at this alignment after the struct pmemblk above */ +#define BLK_FORMAT_DATA_ALIGN ((uintptr_t)PMEM_PAGESIZE) + +#if FAULT_INJECTION +void +pmemblk_inject_fault_at(enum pmem_allocation_type type, int nth, + const char *at); + +int +pmemblk_fault_injection_enabled(void); +#else +static inline void +pmemblk_inject_fault_at(enum pmem_allocation_type type, int nth, + const char *at) +{ + abort(); +} + +static inline int +pmemblk_fault_injection_enabled(void) +{ + return 0; +} +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/pmdk/src/libpmemblk/btt.c b/src/pmdk/src/libpmemblk/btt.c new file mode 100644 index 000000000..f59389f21 --- /dev/null +++ b/src/pmdk/src/libpmemblk/btt.c @@ -0,0 +1,2051 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* Copyright 2014-2019, Intel Corporation */ + +/* + * btt.c -- block translation table providing atomic block updates + * + * This is a user-space implementation of the BTT mechanism providing + * single block powerfail write atomicity, as described by: + * The NVDIMM Namespace Specification + * + * To use this module, the caller must provide five routines for + * accessing the namespace containing the data (in this context, + * "namespace" refers to the storage containing the BTT layout, such + * as a file). All namespace I/O is done by these callbacks: + * + * nsread Read count bytes from namespace at offset off + * nswrite Write count bytes to namespace at offset off + * nszero Zero count bytes in namespace at offset off + * nsmap Return direct access to a range of a namespace + * nssync Flush changes made to an nsmap'd range + * + * Data written by the nswrite callback is flushed out to the media + * (made durable) when the call returns. Data written directly via + * the nsmap callback must be flushed explicitly using nssync. + * + * The caller passes these callbacks, along with information such as + * namespace size and UUID to btt_init() and gets back an opaque handle + * which is then used with the rest of the entry points. + * + * Here is a brief list of the entry points to this module: + * + * btt_nlane Returns number of concurrent threads allowed + * + * btt_nlba Returns the usable size, as a count of LBAs + * + * btt_read Reads a single block at a given LBA + * + * btt_write Writes a single block (atomically) at a given LBA + * + * btt_set_zero Sets a block to read back as zeros + * + * btt_set_error Sets a block to return error on read + * + * btt_check Checks the BTT metadata for consistency + * + * btt_fini Frees run-time state, done using namespace + * + * If the caller is multi-threaded, it must only allow btt_nlane() threads + * to enter this module at a time, each assigned a unique "lane" number + * between 0 and btt_nlane() - 1. 
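Putting the entry points listed above together, the module's only in-tree consumer (blk.c, earlier in this diff) drives it roughly like this sketch, with error handling elided:

    struct btt *bttp = btt_init(rawsize, lbasize, parent_uuid,
    		maxlane, ns, &ns_cb);

    unsigned nlane = btt_nlane(bttp);	/* concurrency limit */
    size_t nlba = btt_nlba(bttp);	/* usable external blocks */

    /* caller guarantees: at most nlane threads, each holding a
     * unique lane in [0, nlane) for the duration of the call */
    btt_write(bttp, lane, lba, buf);
    btt_read(bttp, lane, lba, buf);

    btt_fini(bttp);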
+ * + * There are a number of static routines defined in this module. Here's + * a brief overview of the most important routines: + * + * read_layout Checks for valid BTT layout and builds run-time state. + * A number of helper functions are used by read_layout + * to handle various parts of the metadata: + * read_info + * read_arenas + * read_arena + * read_flogs + * read_flog_pair + * + * write_layout Generates a new BTT layout when one doesn't exist. + * Once a new layout is written, write_layout uses + * the same helper functions above to construct the + * run-time state. + * + * invalid_lba Range check done by each entry point that takes + * an LBA. + * + * lba_to_arena_lba + * Find the arena and LBA in that arena for a given + * external LBA. This is the heart of the arena + * range matching logic. + * + * flog_update Update the BTT free list/log combined data structure + * (known as the "flog"). This is the heart of the + * logic that makes writes powerfail atomic. + * + * map_lock These routines provide atomic access to the BTT map + * map_unlock data structure in an area. + * map_abort + * + * map_entry_setf Common code for btt_set_zero() and btt_set_error(). + * + * zero_block Generate a block of all zeros (instead of actually + * doing a read), when the metadata indicates the + * block should read as zeros. + * + * build_rtt These routines construct the run-time tracking + * build_map_locks data structures used during I/O. + */ + +#include <inttypes.h> +#include <stdio.h> +#include <sys/param.h> +#include <unistd.h> +#include <errno.h> +#include <string.h> +#include <stdint.h> +#include <endian.h> + +#include "out.h" +#include "uuid.h" +#include "btt.h" +#include "btt_layout.h" +#include "sys_util.h" +#include "util.h" +#include "alloc.h" + +/* + * The opaque btt handle containing state tracked by this module + * for the btt namespace. This is created by btt_init(), handed to + * all the other btt_* entry points, and deleted by btt_fini(). + */ +struct btt { + unsigned nlane; /* number of concurrent threads allowed per btt */ + + /* + * The laidout flag indicates whether the namespace contains valid BTT + * metadata. It is initialized by read_layout() and if no valid layout + * is found, all reads return zeros and the first write will write the + * BTT layout. The layout_write_mutex protects the laidout flag so + * only one write threads ends up writing the initial metadata by + * calling write_layout(). + */ + os_mutex_t layout_write_mutex; + int laidout; + + /* + * UUID of the BTT + */ + uint8_t uuid[BTTINFO_UUID_LEN]; + + /* + * UUID of the containing namespace, used to validate BTT metadata. + */ + uint8_t parent_uuid[BTTINFO_UUID_LEN]; + + /* + * Parameters controlling/describing the BTT layout. + */ + uint64_t rawsize; /* size of containing namespace */ + uint32_t lbasize; /* external LBA size */ + uint32_t nfree; /* available flog entries */ + uint64_t nlba; /* total number of external LBAs */ + unsigned narena; /* number of arenas */ + + /* run-time state kept for each arena */ + struct arena { + uint32_t flags; /* arena flags (btt_info) */ + uint32_t external_nlba; /* LBAs that live in this arena */ + uint32_t internal_lbasize; + uint32_t internal_nlba; + + /* + * The following offsets are relative to the beginning of + * the encapsulating namespace. This is different from + * how these offsets are stored on-media, where they are + * relative to the start of the arena. The offset are + * converted by read_layout() to make them more convenient + * for run-time use. 
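lba_to_arena_lba(), named in the overview above (its body falls outside this excerpt), resolves an external LBA by walking the arenas and discounting each arena's external_nlba. A sketch of that range matching, under the stated assumption about its behavior:

    /* map an external LBA to (arena, pre-map LBA); lba assumed valid */
    static void
    find_arena(struct btt *bttp, uint64_t lba,
    		struct arena **arenapp, uint32_t *premap_lbap)
    {
    	struct arena *arenap = bttp->arenas;

    	for (unsigned i = 0; i < bttp->narena; i++, arenap++) {
    		if (lba < arenap->external_nlba)
    			break;			/* lba lives here */
    		lba -= arenap->external_nlba;
    	}

    	*arenapp = arenap;
    	*premap_lbap = (uint32_t)lba;
    }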
+ */ + uint64_t startoff; /* offset to start of arena */ + uint64_t dataoff; /* offset to arena data area */ + uint64_t mapoff; /* offset to area map */ + uint64_t flogoff; /* offset to area flog */ + uint64_t nextoff; /* offset to next arena */ + + /* + * Run-time flog state. Indexed by lane. + * + * The write path uses the flog to find the free block + * it writes to before atomically making it the new + * active block for an external LBA. + * + * The read path doesn't use the flog at all. + */ + struct flog_runtime { + struct btt_flog flog; /* current info */ + uint64_t entries[2]; /* offsets for flog pair */ + int next; /* next write (0 or 1) */ + } *flogs; + + /* + * Read tracking table. Indexed by lane. + * + * Before using a free block found in the flog, the write path + * scans the rtt to see if there are any outstanding reads on + * that block (reads that started before the block was freed by + * a concurrent write). Unused slots in the rtt are indicated + * by setting the error bit, BTT_MAP_ENTRY_ERROR, so that the + * entry won't match any post-map LBA when checked. + */ + uint32_t volatile *rtt; + + /* + * Map locking. Indexed by pre-map LBA modulo nlane. + */ + os_mutex_t *map_locks; + + /* + * Arena info block locking. + */ + os_mutex_t info_lock; + } *arenas; + + /* + * Callbacks for doing I/O to namespace. These are provided by + * the code calling the BTT module, which passes them in to + * btt_init(). All namespace I/O is done using these. + * + * The opaque namespace handle "ns" was provided by the code calling + * the BTT module and is passed to each callback to identify the + * namespace being accessed. + */ + void *ns; + const struct ns_callback *ns_cbp; +}; + +/* + * Signature for arena info blocks. Total size is 16 bytes, including + * the '\0' added to the string by the declaration (the last two bytes + * of the string are '\0'). + */ +static const char Sig[] = BTTINFO_SIG; + +/* + * Zeroed out flog entry, used when initializing the flog. + */ +static const struct btt_flog Zflog; + +/* + * Lookup table and macro for looking up sequence numbers. These are + * the 2-bit numbers that cycle between 01, 10, and 11. + * + * To advance a sequence number to the next number, use something like: + * seq = NSEQ(seq); + */ +static const unsigned Nseq[] = { 0, 2, 3, 1 }; +#define NSEQ(seq) (Nseq[(seq) & 3]) + +/* + * get_map_lock_num -- (internal) Calculate offset into map_locks[] + * + * map_locks[] contains nfree locks which are used to protect the map + * from concurrent access to the same cache line. The index into + * map_locks[] is calculated by looking at the byte offset into the map + * (premap_lba * BTT_MAP_ENTRY_SIZE), figuring out how many cache lines + * that is into the map that is (dividing by BTT_MAP_LOCK_ALIGN), and + * then selecting one of nfree locks (the modulo at the end). + * + * The extra cast is to keep gcc from generating a false positive + * 64-32 bit conversion error when -fsanitize is set. 
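One full trip around the sequence-number cycle encoded by Nseq[] above (0 never recurs, so an all-zeros flog entry is always recognizable as never written):

    unsigned seq = 1;
    seq = NSEQ(seq);	/* 1 -> 2 */
    seq = NSEQ(seq);	/* 2 -> 3 */
    seq = NSEQ(seq);	/* 3 -> 1, wrapping without producing 0 */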
+ */ +static inline uint32_t +get_map_lock_num(uint32_t premap_lba, uint32_t nfree) +{ + return (uint32_t)(premap_lba * BTT_MAP_ENTRY_SIZE / BTT_MAP_LOCK_ALIGN) + % nfree; +} + +/* + * invalid_lba -- (internal) set errno and return true if lba is invalid + * + * This function is used at the top of the entry points where an external + * LBA is provided, like this: + * + * if (invalid_lba(bttp, lba)) + * return -1; + */ +static int +invalid_lba(struct btt *bttp, uint64_t lba) +{ + LOG(3, "bttp %p lba %" PRIu64, bttp, lba); + + if (lba >= bttp->nlba) { + ERR("lba out of range (nlba %" PRIu64 ")", bttp->nlba); + errno = EINVAL; + return 1; + } + + return 0; +} + +/* + * read_info -- (internal) convert btt_info to host byte order & validate + * + * Returns true if info block is valid, and all the integer fields are + * converted to host byte order. If the info block is not valid, this + * routine returns false and the info block passed in is left in an + * unknown state. + */ +static int +read_info(struct btt *bttp, struct btt_info *infop) +{ + LOG(3, "infop %p", infop); + + if (memcmp(infop->sig, Sig, BTTINFO_SIG_LEN)) { + LOG(3, "signature invalid"); + return 0; + } + + if (memcmp(infop->parent_uuid, bttp->parent_uuid, BTTINFO_UUID_LEN)) { + LOG(3, "parent UUID mismatch"); + return 0; + } + + /* to be valid, the fields must checksum correctly */ + if (!util_checksum(infop, sizeof(*infop), &infop->checksum, 0, 0)) { + LOG(3, "invalid checksum"); + return 0; + } + + /* to be valid, info block must have a major version of at least 1 */ + if ((infop->major = le16toh(infop->major)) == 0) { + LOG(3, "invalid major version (0)"); + return 0; + } + + infop->flags = le32toh(infop->flags); + infop->minor = le16toh(infop->minor); + infop->external_lbasize = le32toh(infop->external_lbasize); + infop->external_nlba = le32toh(infop->external_nlba); + infop->internal_lbasize = le32toh(infop->internal_lbasize); + infop->internal_nlba = le32toh(infop->internal_nlba); + infop->nfree = le32toh(infop->nfree); + infop->infosize = le32toh(infop->infosize); + infop->nextoff = le64toh(infop->nextoff); + infop->dataoff = le64toh(infop->dataoff); + infop->mapoff = le64toh(infop->mapoff); + infop->flogoff = le64toh(infop->flogoff); + infop->infooff = le64toh(infop->infooff); + + return 1; +} + +/* + * map_entry_is_zero -- (internal) checks if map_entry is in zero state + */ +static inline int +map_entry_is_zero(uint32_t map_entry) +{ + return (map_entry & ~BTT_MAP_ENTRY_LBA_MASK) == BTT_MAP_ENTRY_ZERO; +} + +/* + * map_entry_is_error -- (internal) checks if map_entry is in error state + */ +static inline int +map_entry_is_error(uint32_t map_entry) +{ + return (map_entry & ~BTT_MAP_ENTRY_LBA_MASK) == BTT_MAP_ENTRY_ERROR; +} + +/* + * map_entry_is_initial -- checks if map_entry is in initial state + */ +int +map_entry_is_initial(uint32_t map_entry) +{ + return (map_entry & ~BTT_MAP_ENTRY_LBA_MASK) == 0; +} + +/* + * map_entry_is_zero_or_initial -- (internal) checks if map_entry is in initial + * or zero state + */ +static inline int +map_entry_is_zero_or_initial(uint32_t map_entry) +{ + uint32_t entry_flags = map_entry & ~BTT_MAP_ENTRY_LBA_MASK; + return entry_flags == 0 || entry_flags == BTT_MAP_ENTRY_ZERO; +} + +/* + * btt_flog_get_valid -- return valid and current flog entry + */ +struct btt_flog * +btt_flog_get_valid(struct btt_flog *flog_pair, int *next) +{ + /* + * Interesting cases: + * - no valid seq numbers: layout consistency error + * - one valid seq number: that's the current entry + * - two valid seq 
numbers: higher number is current entry + * - identical seq numbers: layout consistency error + */ + if (flog_pair[0].seq == flog_pair[1].seq) { + return NULL; + } else if (flog_pair[0].seq == 0) { + /* singleton valid flog at flog_pair[1] */ + *next = 0; + return &flog_pair[1]; + } else if (flog_pair[1].seq == 0) { + /* singleton valid flog at flog_pair[0] */ + *next = 1; + return &flog_pair[0]; + } else if (NSEQ(flog_pair[0].seq) == flog_pair[1].seq) { + /* flog_pair[1] has the later sequence number */ + *next = 0; + return &flog_pair[1]; + } else { + /* flog_pair[0] has the later sequence number */ + *next = 1; + return &flog_pair[0]; + } +} + +/* + * read_flog_pair -- (internal) load up a single flog pair + * + * Zero is returned on success, otherwise -1/errno. + */ +static int +read_flog_pair(struct btt *bttp, unsigned lane, struct arena *arenap, + uint64_t flog_off, struct flog_runtime *flog_runtimep, uint32_t flognum) +{ + LOG(5, "bttp %p lane %u arenap %p flog_off %" PRIu64 " runtimep %p " + "flognum %u", bttp, lane, arenap, flog_off, flog_runtimep, + flognum); + + flog_runtimep->entries[0] = flog_off; + flog_runtimep->entries[1] = flog_off + sizeof(struct btt_flog); + + if (lane >= bttp->nfree) { + ERR("invalid lane %u among nfree %d", lane, bttp->nfree); + errno = EINVAL; + return -1; + } + + if (flog_off == 0) { + ERR("invalid flog offset %" PRIu64, flog_off); + errno = EINVAL; + return -1; + } + + struct btt_flog flog_pair[2]; + if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, flog_pair, + sizeof(flog_pair), flog_off) < 0) + return -1; + + btt_flog_convert2h(&flog_pair[0]); + if (invalid_lba(bttp, flog_pair[0].lba)) + return -1; + + btt_flog_convert2h(&flog_pair[1]); + if (invalid_lba(bttp, flog_pair[1].lba)) + return -1; + + LOG(6, "flog_pair[0] flog_off %" PRIu64 " old_map %u new_map %u seq %u", + flog_off, flog_pair[0].old_map, + flog_pair[0].new_map, flog_pair[0].seq); + LOG(6, "flog_pair[1] old_map %u new_map %u seq %u", + flog_pair[1].old_map, flog_pair[1].new_map, + flog_pair[1].seq); + + struct btt_flog *currentp = btt_flog_get_valid(flog_pair, + &flog_runtimep->next); + + if (currentp == NULL) { + ERR("flog layout error: bad seq numbers %d %d", + flog_pair[0].seq, flog_pair[1].seq); + arenap->flags |= BTTINFO_FLAG_ERROR; + return 0; + } + + LOG(6, "run-time flog next is %d", flog_runtimep->next); + + /* copy current flog into run-time flog state */ + flog_runtimep->flog = *currentp; + + LOG(9, "read flog[%u]: lba %u old %u%s%s%s new %u%s%s%s", flognum, + currentp->lba, + currentp->old_map & BTT_MAP_ENTRY_LBA_MASK, + (map_entry_is_error(currentp->old_map)) ? " ERROR" : "", + (map_entry_is_zero(currentp->old_map)) ? " ZERO" : "", + (map_entry_is_initial(currentp->old_map)) ? " INIT" : "", + currentp->new_map & BTT_MAP_ENTRY_LBA_MASK, + (map_entry_is_error(currentp->new_map)) ? " ERROR" : "", + (map_entry_is_zero(currentp->new_map)) ? " ZERO" : "", + (map_entry_is_initial(currentp->new_map)) ? " INIT" : ""); + + /* + * Decide if the current flog info represents a completed + * operation or an incomplete operation. If completed, the + * old_map field will contain the free block to be used for + * the next write. But if the operation didn't complete (indicated + * by the map entry not being updated), then the operation is + * completed now by updating the map entry. + * + * A special case, used by flog entries when first created, is + * when old_map == new_map. This counts as a complete entry + * and doesn't require reading the map to see if recovery is + * required. 
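read_flog_pair()'s body below applies exactly this rule; condensed, with entry standing for the (normalized) current map entry, cur for the current flog, and finish_map_update() a hypothetical shorthand for the recovery write:

    if (cur->old_map == cur->new_map) {
    	/* freshly created flog entry: nothing to recover */
    } else if (entry == cur->new_map) {
    	/* map already points at the new block: write completed */
    } else if (entry == cur->old_map) {
    	/* interrupted between data copy and map update: finish it */
    	finish_map_update(cur->lba, cur->new_map);
    }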
+ */ + if (currentp->old_map == currentp->new_map) { + LOG(9, "flog[%u] entry complete (initial state)", flognum); + return 0; + } + + /* convert pre-map LBA into an offset into the map */ + uint64_t map_entry_off = arenap->mapoff + + BTT_MAP_ENTRY_SIZE * currentp->lba; + + /* read current map entry */ + uint32_t entry; + if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &entry, + sizeof(entry), map_entry_off) < 0) + return -1; + + entry = le32toh(entry); + + /* map entry in initial state */ + if (map_entry_is_initial(entry)) + entry = currentp->lba | BTT_MAP_ENTRY_NORMAL; + + if (currentp->new_map != entry && currentp->old_map == entry) { + /* last update didn't complete */ + LOG(9, "recover flog[%u]: map[%u]: %u", + flognum, currentp->lba, currentp->new_map); + + /* + * Recovery step is to complete the transaction by + * updating the map entry. + */ + entry = htole32(currentp->new_map); + if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &entry, + sizeof(uint32_t), map_entry_off) < 0) + return -1; + } + + return 0; +} + +/* + * flog_update -- (internal) write out an updated flog entry + * + * The flog entries are not checksummed. Instead, increasing sequence + * numbers are used to atomically switch the active flog entry between + * the first and second struct btt_flog in each slot. In order for this + * to work, the sequence number must be updated only after all the other + * fields in the flog are updated. So the writes to the flog are broken + * into two writes, one for the first three fields (lba, old_map, new_map) + * and, only after those fields are known to be written durably, the + * second write for the seq field is done. + * + * Returns 0 on success, otherwise -1/errno. + */ +static int +flog_update(struct btt *bttp, unsigned lane, struct arena *arenap, + uint32_t lba, uint32_t old_map, uint32_t new_map) +{ + LOG(3, "bttp %p lane %u arenap %p lba %u old_map %u new_map %u", + bttp, lane, arenap, lba, old_map, new_map); + + /* construct new flog entry in little-endian byte order */ + struct btt_flog new_flog; + new_flog.lba = lba; + new_flog.old_map = old_map; + new_flog.new_map = new_map; + new_flog.seq = NSEQ(arenap->flogs[lane].flog.seq); + btt_flog_convert2le(&new_flog); + + uint64_t new_flog_off = + arenap->flogs[lane].entries[arenap->flogs[lane].next]; + + /* write out first two fields first */ + if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &new_flog, + sizeof(uint32_t) * 2, new_flog_off) < 0) + return -1; + new_flog_off += sizeof(uint32_t) * 2; + + /* write out new_map and seq field to make it active */ + if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &new_flog.new_map, + sizeof(uint32_t) * 2, new_flog_off) < 0) + return -1; + + /* flog entry written successfully, update run-time state */ + arenap->flogs[lane].next = 1 - arenap->flogs[lane].next; + arenap->flogs[lane].flog.lba = lba; + arenap->flogs[lane].flog.old_map = old_map; + arenap->flogs[lane].flog.new_map = new_map; + arenap->flogs[lane].flog.seq = NSEQ(arenap->flogs[lane].flog.seq); + + LOG(9, "update flog[%u]: lba %u old %u%s%s%s new %u%s%s%s", lane, lba, + old_map & BTT_MAP_ENTRY_LBA_MASK, + (map_entry_is_error(old_map)) ? " ERROR" : "", + (map_entry_is_zero(old_map)) ? " ZERO" : "", + (map_entry_is_initial(old_map)) ? " INIT" : "", + new_map & BTT_MAP_ENTRY_LBA_MASK, + (map_entry_is_error(new_map)) ? " ERROR" : "", + (map_entry_is_zero(new_map)) ? " ZERO" : "", + (map_entry_is_initial(new_map)) ? 
" INIT" : ""); + + return 0; +} + +/* + * arena_setf -- (internal) updates the given flag for the arena info block + */ +static int +arena_setf(struct btt *bttp, struct arena *arenap, unsigned lane, uint32_t setf) +{ + LOG(3, "bttp %p arenap %p lane %u setf 0x%x", bttp, arenap, lane, setf); + + /* update runtime state */ + util_fetch_and_or32(&arenap->flags, setf); + + if (!bttp->laidout) { + /* no layout yet to update */ + return 0; + } + + /* + * Read, modify and write out the info block + * at both the beginning and end of the arena. + */ + uint64_t arena_off = arenap->startoff; + + struct btt_info info; + + /* protect from simultaneous writes to the layout */ + util_mutex_lock(&arenap->info_lock); + + if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &info, + sizeof(info), arena_off) < 0) { + goto err; + } + + uint64_t infooff = le64toh(info.infooff); + + /* update flags */ + info.flags |= htole32(setf); + + /* update checksum */ + util_checksum(&info, sizeof(info), &info.checksum, 1, 0); + + if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info, + sizeof(info), arena_off) < 0) { + goto err; + } + + if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info, + sizeof(info), arena_off + infooff) < 0) { + goto err; + } + + util_mutex_unlock(&arenap->info_lock); + return 0; + +err: + util_mutex_unlock(&arenap->info_lock); + return -1; +} + +/* + * set_arena_error -- (internal) set the error flag for the given arena + */ +static int +set_arena_error(struct btt *bttp, struct arena *arenap, unsigned lane) +{ + LOG(3, "bttp %p arena %p lane %u", bttp, arenap, lane); + + return arena_setf(bttp, arenap, lane, BTTINFO_FLAG_ERROR); +} + +/* + * read_flogs -- (internal) load up all the flog entries for an arena + * + * Zero is returned on success, otherwise -1/errno. + */ +static int +read_flogs(struct btt *bttp, unsigned lane, struct arena *arenap) +{ + if ((arenap->flogs = Zalloc(bttp->nfree * + sizeof(struct flog_runtime))) == NULL) { + ERR("!Malloc for %u flog entries", bttp->nfree); + return -1; + } + + /* + * Load up the flog state. read_flog_pair() will determine if + * any recovery steps are required take them on the in-memory + * data structures it creates. Sets error flag when it + * determines an invalid state. + */ + uint64_t flog_off = arenap->flogoff; + struct flog_runtime *flog_runtimep = arenap->flogs; + for (uint32_t i = 0; i < bttp->nfree; i++) { + if (read_flog_pair(bttp, lane, arenap, flog_off, + flog_runtimep, i) < 0) { + set_arena_error(bttp, arenap, lane); + return -1; + } + + /* prepare for next time around the loop */ + flog_off += roundup(2 * sizeof(struct btt_flog), + BTT_FLOG_PAIR_ALIGN); + flog_runtimep++; + } + + return 0; +} + +/* + * build_rtt -- (internal) construct a read tracking table for an arena + * + * Zero is returned on success, otherwise -1/errno. + * + * The rtt is big enough to hold an entry for each free block (nfree) + * since nlane can't be bigger than nfree. nlane may end up smaller, + * in which case some of the high rtt entries will be unused. + */ +static int +build_rtt(struct btt *bttp, struct arena *arenap) +{ + if ((arenap->rtt = Malloc(bttp->nfree * sizeof(uint32_t))) + == NULL) { + ERR("!Malloc for %d rtt entries", bttp->nfree); + return -1; + } + for (uint32_t lane = 0; lane < bttp->nfree; lane++) + arenap->rtt[lane] = BTT_MAP_ENTRY_ERROR; + util_synchronize(); + + return 0; +} + +/* + * build_map_locks -- (internal) construct map locks + * + * Zero is returned on success, otherwise -1/errno. 
+ */ +static int +build_map_locks(struct btt *bttp, struct arena *arenap) +{ + if ((arenap->map_locks = + Malloc(bttp->nfree * sizeof(*arenap->map_locks))) + == NULL) { + ERR("!Malloc for %d map_lock entries", bttp->nfree); + return -1; + } + for (uint32_t lane = 0; lane < bttp->nfree; lane++) + util_mutex_init(&arenap->map_locks[lane]); + + return 0; +} + +/* + * read_arena -- (internal) load up an arena and build run-time state + * + * Zero is returned on success, otherwise -1/errno. + */ +static int +read_arena(struct btt *bttp, unsigned lane, uint64_t arena_off, + struct arena *arenap) +{ + LOG(3, "bttp %p lane %u arena_off %" PRIu64 " arenap %p", + bttp, lane, arena_off, arenap); + + struct btt_info info; + if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &info, sizeof(info), + arena_off) < 0) + return -1; + + arenap->flags = le32toh(info.flags); + arenap->external_nlba = le32toh(info.external_nlba); + arenap->internal_lbasize = le32toh(info.internal_lbasize); + arenap->internal_nlba = le32toh(info.internal_nlba); + + arenap->startoff = arena_off; + arenap->dataoff = arena_off + le64toh(info.dataoff); + arenap->mapoff = arena_off + le64toh(info.mapoff); + arenap->flogoff = arena_off + le64toh(info.flogoff); + arenap->nextoff = arena_off + le64toh(info.nextoff); + + if (read_flogs(bttp, lane, arenap) < 0) + return -1; + + if (build_rtt(bttp, arenap) < 0) + return -1; + + if (build_map_locks(bttp, arenap) < 0) + return -1; + + /* initialize the per arena info block lock */ + util_mutex_init(&arenap->info_lock); + + return 0; +} + +/* + * util_convert2h_btt_info -- convert btt_info to host byte order + */ +void +btt_info_convert2h(struct btt_info *infop) +{ + infop->flags = le32toh(infop->flags); + infop->major = le16toh(infop->major); + infop->minor = le16toh(infop->minor); + infop->external_lbasize = le32toh(infop->external_lbasize); + infop->external_nlba = le32toh(infop->external_nlba); + infop->internal_lbasize = le32toh(infop->internal_lbasize); + infop->internal_nlba = le32toh(infop->internal_nlba); + infop->nfree = le32toh(infop->nfree); + infop->infosize = le32toh(infop->infosize); + infop->nextoff = le64toh(infop->nextoff); + infop->dataoff = le64toh(infop->dataoff); + infop->mapoff = le64toh(infop->mapoff); + infop->flogoff = le64toh(infop->flogoff); + infop->infooff = le64toh(infop->infooff); +} + +/* + * btt_info_convert2le -- convert btt_info to little-endian byte order + */ +void +btt_info_convert2le(struct btt_info *infop) +{ + infop->flags = le32toh(infop->flags); + infop->major = le16toh(infop->major); + infop->minor = le16toh(infop->minor); + infop->external_lbasize = le32toh(infop->external_lbasize); + infop->external_nlba = le32toh(infop->external_nlba); + infop->internal_lbasize = le32toh(infop->internal_lbasize); + infop->internal_nlba = le32toh(infop->internal_nlba); + infop->nfree = le32toh(infop->nfree); + infop->infosize = le32toh(infop->infosize); + infop->nextoff = le64toh(infop->nextoff); + infop->dataoff = le64toh(infop->dataoff); + infop->mapoff = le64toh(infop->mapoff); + infop->flogoff = le64toh(infop->flogoff); + infop->infooff = le64toh(infop->infooff); +} + +/* + * btt_flog_convert2h -- convert btt_flog to host byte order + */ +void +btt_flog_convert2h(struct btt_flog *flogp) +{ + flogp->lba = le32toh(flogp->lba); + flogp->old_map = le32toh(flogp->old_map); + flogp->new_map = le32toh(flogp->new_map); + flogp->seq = le32toh(flogp->seq); +} + +/* + * btt_flog_convert2le -- convert btt_flog to LE byte order + */ +void +btt_flog_convert2le(struct btt_flog 
*flogp) +{ + flogp->lba = htole32(flogp->lba); + flogp->old_map = htole32(flogp->old_map); + flogp->new_map = htole32(flogp->new_map); + flogp->seq = htole32(flogp->seq); +} + +/* + * read_arenas -- (internal) load up all arenas and build run-time state + * + * On entry, layout must be known to be valid, and the number of arenas + * must be known. Zero is returned on success, otherwise -1/errno. + */ +static int +read_arenas(struct btt *bttp, unsigned lane, unsigned narena) +{ + LOG(3, "bttp %p lane %u narena %d", bttp, lane, narena); + + if ((bttp->arenas = Zalloc(narena * sizeof(*bttp->arenas))) == NULL) { + ERR("!Malloc for %u arenas", narena); + goto err; + } + + uint64_t arena_off = 0; + struct arena *arenap = bttp->arenas; + for (unsigned i = 0; i < narena; i++) { + + if (read_arena(bttp, lane, arena_off, arenap) < 0) + goto err; + + /* prepare for next time around the loop */ + arena_off = arenap->nextoff; + arenap++; + } + + bttp->laidout = 1; + + return 0; + +err: + LOG(4, "error clean up"); + int oerrno = errno; + if (bttp->arenas) { + for (unsigned i = 0; i < bttp->narena; i++) { + if (bttp->arenas[i].flogs) + Free(bttp->arenas[i].flogs); + if (bttp->arenas[i].rtt) + Free((void *)bttp->arenas[i].rtt); + if (bttp->arenas[i].map_locks) + Free((void *)bttp->arenas[i].map_locks); + } + Free(bttp->arenas); + bttp->arenas = NULL; + } + errno = oerrno; + return -1; +} + +/* + * internal_lbasize -- (internal) calculate internal LBA size + */ +static inline uint32_t +internal_lbasize(uint32_t external_lbasize) +{ + uint32_t internal_lbasize = external_lbasize; + if (internal_lbasize < BTT_MIN_LBA_SIZE) + internal_lbasize = BTT_MIN_LBA_SIZE; + internal_lbasize = + roundup(internal_lbasize, BTT_INTERNAL_LBA_ALIGNMENT); + /* check for overflow */ + if (internal_lbasize < BTT_INTERNAL_LBA_ALIGNMENT) { + errno = EINVAL; + ERR("!Invalid lba size after alignment: %u ", internal_lbasize); + return 0; + } + + return internal_lbasize; +} + +/* + * btt_flog_size -- calculate flog data size + */ +uint64_t +btt_flog_size(uint32_t nfree) +{ + uint64_t flog_size = nfree * roundup(2 * sizeof(struct btt_flog), + BTT_FLOG_PAIR_ALIGN); + return roundup(flog_size, BTT_ALIGNMENT); +} + +/* + * btt_map_size -- calculate map data size + */ +uint64_t +btt_map_size(uint32_t external_nlba) +{ + return roundup(external_nlba * BTT_MAP_ENTRY_SIZE, BTT_ALIGNMENT); +} + +/* + * btt_arena_datasize -- whole arena size without BTT Info header, backup and + * flog means size of blocks and map + */ +uint64_t +btt_arena_datasize(uint64_t arena_size, uint32_t nfree) +{ + return arena_size - 2 * sizeof(struct btt_info) - btt_flog_size(nfree); +} + +/* + * btt_info_set_params -- (internal) calculate and set BTT Info + * external_lbasize, internal_lbasize, nfree, infosize, external_nlba and + * internal_nlba + */ +static int +btt_info_set_params(struct btt_info *info, uint32_t external_lbasize, + uint32_t internal_lbasize, uint32_t nfree, uint64_t arena_size) +{ + info->external_lbasize = external_lbasize; + info->internal_lbasize = internal_lbasize; + info->nfree = nfree; + info->infosize = sizeof(*info); + + uint64_t arena_data_size = btt_arena_datasize(arena_size, nfree); + + /* allow for map alignment padding */ + uint64_t internal_nlba = (arena_data_size - BTT_ALIGNMENT) / + (info->internal_lbasize + BTT_MAP_ENTRY_SIZE); + + /* ensure the number of blocks is at least 2*nfree */ + if (internal_nlba < 2 * nfree) { + errno = EINVAL; + ERR("!number of internal blocks: %" PRIu64 + " expected at least %u", + internal_nlba, 2 * 
nfree); + return -1; + } + + ASSERT(internal_nlba <= UINT32_MAX); + uint32_t internal_nlba_u32 = (uint32_t)internal_nlba; + + info->internal_nlba = internal_nlba_u32; + /* external LBA does not include free blocks */ + info->external_nlba = internal_nlba_u32 - info->nfree; + + ASSERT((arena_data_size - btt_map_size(info->external_nlba)) / + internal_lbasize >= internal_nlba); + + return 0; +} + +/* + * btt_info_set_offs -- (internal) calculate and set the BTT Info dataoff, + * nextoff, infooff, flogoff and mapoff. These are all relative to the + * beginning of the arena. + */ +static void +btt_info_set_offs(struct btt_info *info, uint64_t arena_size, + uint64_t space_left) +{ + info->dataoff = info->infosize; + + /* set offset to next valid arena */ + if (space_left >= BTT_MIN_SIZE) + info->nextoff = arena_size; + else + info->nextoff = 0; + + info->infooff = arena_size - sizeof(struct btt_info); + info->flogoff = info->infooff - btt_flog_size(info->nfree); + info->mapoff = info->flogoff - btt_map_size(info->external_nlba); + + ASSERTeq(btt_arena_datasize(arena_size, info->nfree) - + btt_map_size(info->external_nlba), info->mapoff - + info->dataoff); +} + +/* + * btt_info_set -- set BTT Info params and offsets + */ +int +btt_info_set(struct btt_info *info, uint32_t external_lbasize, + uint32_t nfree, uint64_t arena_size, uint64_t space_left) +{ + /* calculate internal LBA size */ + uint32_t internal_lba_size = internal_lbasize(external_lbasize); + if (internal_lba_size == 0) + return -1; + + /* set params and offsets */ + if (btt_info_set_params(info, external_lbasize, + internal_lba_size, nfree, arena_size)) + return -1; + + btt_info_set_offs(info, arena_size, space_left); + + return 0; +} + +/* + * write_layout -- (internal) write out the initial btt metadata layout + * + * Called with write == 1 only once in the life time of a btt namespace, when + * the first write happens. The caller of this routine is responsible for + * locking out multiple threads. This routine doesn't read anything -- by the + * time it is called, it is known there's no layout in the namespace and a new + * layout should be written. + * + * Calling with write == 0 tells this routine to do the calculations for + * bttp->narena and bttp->nlba, but don't write out any metadata. + * + * If successful, sets bttp->layout to 1 and returns 0. Otherwise -1 + * is returned and errno is set, and bttp->layout remains 0 so that + * later attempts to write will try again to create the layout. + */ +static int +write_layout(struct btt *bttp, unsigned lane, int write) +{ + LOG(3, "bttp %p lane %u write %d", bttp, lane, write); + + ASSERT(bttp->rawsize >= BTT_MIN_SIZE); + ASSERT(bttp->nfree); + + /* + * If a new layout is being written, generate the BTT's UUID. + */ + if (write) { + int ret = util_uuid_generate(bttp->uuid); + if (ret < 0) { + LOG(2, "util_uuid_generate failed"); + return -1; + } + } + + /* + * The number of arenas is the number of full arena of + * size BTT_MAX_ARENA that fit into rawsize and then, if + * the remainder is at least BTT_MIN_SIZE in size, then + * that adds one more arena. 
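With the layout constants from btt_layout.h (a 512 GiB BTT_MAX_ARENA and a 16 MiB BTT_MIN_SIZE are assumed here; that header's contents fall outside this excerpt), the computation that follows works out as, for example:

    /* e.g. rawsize = 1 TiB + 64 MiB */
    uint64_t rawsize = (1ULL << 40) + (64ULL << 20);

    unsigned narena = (unsigned)(rawsize / BTT_MAX_ARENA);	/* 2 full */
    if (rawsize % BTT_MAX_ARENA >= BTT_MIN_SIZE)
    	narena++;	/* 64 MiB tail >= BTT_MIN_SIZE, so narena == 3 */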
+ */ + bttp->narena = (unsigned)(bttp->rawsize / BTT_MAX_ARENA); + if (bttp->rawsize % BTT_MAX_ARENA >= BTT_MIN_SIZE) + bttp->narena++; + LOG(4, "narena %u", bttp->narena); + + uint32_t internal_lba_size = internal_lbasize(bttp->lbasize); + if (internal_lba_size == 0) + return -1; + LOG(4, "adjusted internal_lbasize %u", internal_lba_size); + + uint64_t total_nlba = 0; + uint64_t rawsize = bttp->rawsize; + unsigned arena_num = 0; + uint64_t arena_off = 0; + + /* + * for each arena... + */ + while (rawsize >= BTT_MIN_SIZE) { + LOG(4, "layout arena %u", arena_num); + + uint64_t arena_rawsize = rawsize; + if (arena_rawsize > BTT_MAX_ARENA) { + arena_rawsize = BTT_MAX_ARENA; + } + rawsize -= arena_rawsize; + arena_num++; + + struct btt_info info; + memset(&info, '\0', sizeof(info)); + if (btt_info_set_params(&info, bttp->lbasize, + internal_lba_size, bttp->nfree, arena_rawsize)) + return -1; + + LOG(4, "internal_nlba %u external_nlba %u", + info.internal_nlba, info.external_nlba); + + total_nlba += info.external_nlba; + + /* + * The rest of the loop body calculates metadata structures + * and lays it out for this arena. So only continue if + * the write flag is set. + */ + if (!write) + continue; + + btt_info_set_offs(&info, arena_rawsize, rawsize); + + LOG(4, "nextoff 0x%016" PRIx64, info.nextoff); + LOG(4, "dataoff 0x%016" PRIx64, info.dataoff); + LOG(4, "mapoff 0x%016" PRIx64, info.mapoff); + LOG(4, "flogoff 0x%016" PRIx64, info.flogoff); + LOG(4, "infooff 0x%016" PRIx64, info.infooff); + + /* zero map if ns is not zero-initialized */ + if (!bttp->ns_cbp->ns_is_zeroed) { + uint64_t mapsize = btt_map_size(info.external_nlba); + if ((*bttp->ns_cbp->nszero)(bttp->ns, lane, mapsize, + info.mapoff) < 0) + return -1; + } + + /* write out the initial flog */ + uint64_t flog_entry_off = arena_off + info.flogoff; + uint32_t next_free_lba = info.external_nlba; + for (uint32_t i = 0; i < bttp->nfree; i++) { + struct btt_flog flog; + flog.lba = htole32(i); + flog.old_map = flog.new_map = + htole32(next_free_lba | BTT_MAP_ENTRY_ZERO); + flog.seq = htole32(1); + + /* + * Write both btt_flog structs in the pair, writing + * the second one as all zeros. + */ + LOG(6, "flog[%u] entry off %" PRIu64 + " initial %u + zero = %u", + i, flog_entry_off, + next_free_lba, + next_free_lba | BTT_MAP_ENTRY_ZERO); + if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &flog, + sizeof(flog), flog_entry_off) < 0) + return -1; + flog_entry_off += sizeof(flog); + + LOG(6, "flog[%u] entry off %" PRIu64 " zeros", + i, flog_entry_off); + if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &Zflog, + sizeof(Zflog), flog_entry_off) < 0) + return -1; + flog_entry_off += sizeof(flog); + flog_entry_off = roundup(flog_entry_off, + BTT_FLOG_PAIR_ALIGN); + + next_free_lba++; + } + + /* + * Construct the BTT info block and write it out + * at both the beginning and end of the arena. 
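+ *
+ * (Editor's note: the second copy at arena_off + info.infooff acts
+ * as a backup; both copies carry the Fletcher64 checksum computed
+ * below, which is presumably what lets an opener fall back if the
+ * primary header is torn or corrupted.)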
+ */
+ memcpy(info.sig, Sig, BTTINFO_SIG_LEN);
+ memcpy(info.uuid, bttp->uuid, BTTINFO_UUID_LEN);
+ memcpy(info.parent_uuid, bttp->parent_uuid, BTTINFO_UUID_LEN);
+ info.major = BTTINFO_MAJOR_VERSION;
+ info.minor = BTTINFO_MINOR_VERSION;
+ btt_info_convert2le(&info);
+
+ util_checksum(&info, sizeof(info), &info.checksum, 1, 0);
+
+ if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
+ sizeof(info), arena_off) < 0)
+ return -1;
+ if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
+ sizeof(info), arena_off + info.infooff) < 0)
+ return -1;
+
+ arena_off += info.nextoff;
+ }
+
+ ASSERTeq(bttp->narena, arena_num);
+
+ bttp->nlba = total_nlba;
+
+ if (write) {
+ /*
+ * The layout is written now, so load up the arenas.
+ */
+ return read_arenas(bttp, lane, bttp->narena);
+ }
+
+ return 0;
+}
+
+/*
+ * read_layout -- (internal) load up layout info from btt namespace
+ *
+ * Called once when the btt namespace is opened for use.
+ * Sets bttp->laidout to 0 if no valid layout is found, 1 otherwise.
+ *
+ * Any recovery actions required (as indicated by the flog state) are
+ * performed by this routine.
+ *
+ * Any quick checks for layout consistency are performed by this routine
+ * (quick enough to be done each time a BTT area is opened for use, not
+ * like the slow consistency checks done by btt_check()).
+ *
+ * Returns 0 if no errors are encountered accessing the namespace (in this
+ * context, detecting there's no layout is not an error if the nsread function
+ * didn't have any problems doing the reads). Otherwise, -1 is returned
+ * and errno is set.
+ */
+static int
+read_layout(struct btt *bttp, unsigned lane)
+{
+ LOG(3, "bttp %p", bttp);
+
+ ASSERT(bttp->rawsize >= BTT_MIN_SIZE);
+
+ unsigned narena = 0;
+ uint32_t smallest_nfree = UINT32_MAX;
+ uint64_t rawsize = bttp->rawsize;
+ uint64_t total_nlba = 0;
+ uint64_t arena_off = 0;
+
+ bttp->nfree = BTT_DEFAULT_NFREE;
+
+ /*
+ * For each arena, see if there's a valid info block
+ */
+ while (rawsize >= BTT_MIN_SIZE) {
+ narena++;
+
+ struct btt_info info;
+ if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &info,
+ sizeof(info), arena_off) < 0)
+ return -1;
+
+ if (!read_info(bttp, &info)) {
+ /*
+ * Failed to find complete BTT metadata. Just
+ * calculate the narena and nlba values that will
+ * result when write_layout() gets called. This
+ * allows checks against nlba to work correctly
+ * even before the layout is written.
+ */
+ return write_layout(bttp, lane, 0);
+ }
+ if (info.external_lbasize != bttp->lbasize) {
+ /* can't read it assuming the wrong block size */
+ ERR("inconsistent lbasize");
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (info.nfree == 0) {
+ ERR("invalid nfree");
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (info.external_nlba == 0) {
+ ERR("invalid external_nlba");
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (info.nextoff && (info.nextoff != BTT_MAX_ARENA)) {
+ ERR("invalid arena size");
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (info.nfree < smallest_nfree)
+ smallest_nfree = info.nfree;
+
+ total_nlba += info.external_nlba;
+ arena_off += info.nextoff;
+ if (info.nextoff == 0)
+ break;
+ if (info.nextoff > rawsize) {
+ ERR("invalid next arena offset");
+ errno = EINVAL;
+ return -1;
+ }
+ rawsize -= info.nextoff;
+ }
+
+ ASSERT(narena);
+
+ bttp->narena = narena;
+ bttp->nlba = total_nlba;
+
+ /*
+ * All arenas were valid. nfree should be the smallest value found
+ * among different arenas.
+ */
+ if (smallest_nfree < bttp->nfree)
+ bttp->nfree = smallest_nfree;
+
+ /*
+ * Load up arenas.
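+ *
+ * (Editor's note: nfree was clamped above to the smallest value seen
+ * across arenas so that every lane number handed out via btt_nlane()
+ * is a valid flog index in each arena.)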
+ */
+ return read_arenas(bttp, lane, narena);
+}
+
+/*
+ * zero_block -- (internal) satisfy a read with a block of zeros
+ *
+ * Returns 0 on success, otherwise -1/errno.
+ */
+static int
+zero_block(struct btt *bttp, void *buf)
+{
+ LOG(3, "bttp %p", bttp);
+
+ memset(buf, '\0', bttp->lbasize);
+ return 0;
+}
+
+/*
+ * lba_to_arena_lba -- (internal) calculate the arena & pre-map LBA
+ *
+ * This routine takes the external LBA and matches it to the
+ * appropriate arena, adjusting the lba for use within that arena.
+ *
+ * If successful, zero is returned, *arenapp is a pointer to the appropriate
+ * arena struct in the run-time state, and *premap_lbap is the LBA adjusted
+ * to an arena-internal LBA (also known as the pre-map LBA). Otherwise
+ * -1/errno.
+ */
+static int
+lba_to_arena_lba(struct btt *bttp, uint64_t lba,
+ struct arena **arenapp, uint32_t *premap_lbap)
+{
+ LOG(3, "bttp %p lba %" PRIu64, bttp, lba);
+
+ ASSERT(bttp->laidout);
+
+ unsigned arena;
+ for (arena = 0; arena < bttp->narena; arena++)
+ if (lba < bttp->arenas[arena].external_nlba)
+ break;
+ else
+ lba -= bttp->arenas[arena].external_nlba;
+
+ ASSERT(arena < bttp->narena);
+
+ *arenapp = &bttp->arenas[arena];
+ ASSERT(lba <= UINT32_MAX);
+ *premap_lbap = (uint32_t)lba;
+
+ LOG(3, "arenap %p pre-map LBA %u", *arenapp, *premap_lbap);
+ return 0;
+}
+
+/*
+ * btt_init -- prepare a btt namespace for use, returning an opaque handle
+ *
+ * Returns handle on success, otherwise NULL/errno.
+ *
+ * When submitted a pristine namespace, it will be formatted implicitly when
+ * touched for the first time.
+ *
+ * If arenas have different nfree values, the lowest value found is used,
+ * since it limits the number of lanes and thus the overall "bandwidth".
+ */
+struct btt *
+btt_init(uint64_t rawsize, uint32_t lbasize, uint8_t parent_uuid[],
+ unsigned maxlane, void *ns, const struct ns_callback *ns_cbp)
+{
+ LOG(3, "rawsize %" PRIu64 " lbasize %u", rawsize, lbasize);
+
+ if (rawsize < BTT_MIN_SIZE) {
+ ERR("rawsize smaller than BTT_MIN_SIZE %u", BTT_MIN_SIZE);
+ errno = EINVAL;
+ return NULL;
+ }
+
+ struct btt *bttp = Zalloc(sizeof(*bttp));
+
+ if (bttp == NULL) {
+ ERR("!Malloc %zu bytes", sizeof(*bttp));
+ return NULL;
+ }
+
+ util_mutex_init(&bttp->layout_write_mutex);
+ memcpy(bttp->parent_uuid, parent_uuid, BTTINFO_UUID_LEN);
+ bttp->rawsize = rawsize;
+ bttp->lbasize = lbasize;
+ bttp->ns = ns;
+ bttp->ns_cbp = ns_cbp;
+
+ /*
+ * Load up layout, if it exists.
+ *
+ * Whether read_layout() finds a valid layout or not, it finishes
+ * updating these layout-related fields:
+ * bttp->nfree
+ * bttp->nlba
+ * bttp->narena
+ * since these fields are used even before a valid layout is written.
+ */
+ if (read_layout(bttp, 0) < 0) {
+ btt_fini(bttp); /* free up any allocations */
+ return NULL;
+ }
+
+ bttp->nlane = bttp->nfree;
+
+ /* maxlane, if provided, is an upper bound on nlane */
+ if (maxlane && bttp->nlane > maxlane)
+ bttp->nlane = maxlane;
+
+ LOG(3, "success, bttp %p nlane %u", bttp, bttp->nlane);
+ return bttp;
+}
+
+/*
+ * btt_nlane -- return the number of "lanes" for this btt namespace
+ *
+ * The number of lanes is the number of threads allowed in this module
+ * concurrently for a given btt. Each thread executing this code must
+ * have a unique "lane" number assigned to it between 0 and btt_nlane() - 1.
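+ *
+ * Editor's sketch of the intended calling pattern (illustrative only;
+ * pick_lane() is a hypothetical assignment scheme, and the caller must
+ * ensure no two concurrent threads use the same lane at once):
+ *
+ * unsigned lane = pick_lane() % btt_nlane(bttp);
+ * if (btt_read(bttp, lane, lba, buf) < 0)
+ * ... handle the error, errno is set ...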
+ */ +unsigned +btt_nlane(struct btt *bttp) +{ + LOG(3, "bttp %p", bttp); + + return bttp->nlane; +} + +/* + * btt_nlba -- return the number of usable blocks in a btt namespace + * + * Valid LBAs to pass to btt_read() and btt_write() are 0 through + * btt_nlba() - 1. + */ +size_t +btt_nlba(struct btt *bttp) +{ + LOG(3, "bttp %p", bttp); + + return bttp->nlba; +} + +/* + * btt_read -- read a block from a btt namespace + * + * Returns 0 on success, otherwise -1/errno. + */ +int +btt_read(struct btt *bttp, unsigned lane, uint64_t lba, void *buf) +{ + LOG(3, "bttp %p lane %u lba %" PRIu64, bttp, lane, lba); + + if (invalid_lba(bttp, lba)) + return -1; + + /* if there's no layout written yet, all reads come back as zeros */ + if (!bttp->laidout) + return zero_block(bttp, buf); + + /* find which arena LBA lives in, and the offset to the map entry */ + struct arena *arenap; + uint32_t premap_lba; + uint64_t map_entry_off; + if (lba_to_arena_lba(bttp, lba, &arenap, &premap_lba) < 0) + return -1; + + /* convert pre-map LBA into an offset into the map */ + map_entry_off = arenap->mapoff + BTT_MAP_ENTRY_SIZE * premap_lba; + + /* + * Read the current map entry to get the post-map LBA for the data + * block read. + */ + uint32_t entry; + + if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &entry, + sizeof(entry), map_entry_off) < 0) + return -1; + + entry = le32toh(entry); + + /* + * Retries come back to the top of this loop (for a rare case where + * the map is changed by another thread doing writes to the same LBA). + */ + while (1) { + if (map_entry_is_error(entry)) { + ERR("EIO due to map entry error flag"); + errno = EIO; + return -1; + } + + if (map_entry_is_zero_or_initial(entry)) + return zero_block(bttp, buf); + + /* + * Record the post-map LBA in the read tracking table during + * the read. The write will check entries in the read tracking + * table before allocating a block for a write, waiting for + * outstanding reads on that block to complete. + * + * Since we already checked for error, zero, and initial + * states above, the entry must have both error and zero + * bits set at this point (BTT_MAP_ENTRY_NORMAL). We store + * the entry that way, with those bits set, in the rtt and + * btt_write() will check for it the same way, with the bits + * both set. + */ + arenap->rtt[lane] = entry; + util_synchronize(); + + /* + * In case this thread was preempted between reading entry and + * storing it in the rtt, check to see if the map changed. If + * it changed, the block about to be read is at least free now + * (in the flog, but that's okay since the data will still be + * undisturbed) and potentially allocated and being used for + * another write (data disturbed, so not okay to continue). + */ + uint32_t latest_entry; + if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &latest_entry, + sizeof(latest_entry), map_entry_off) < 0) { + arenap->rtt[lane] = BTT_MAP_ENTRY_ERROR; + return -1; + } + + latest_entry = le32toh(latest_entry); + + if (entry == latest_entry) + break; /* map stayed the same */ + else + entry = latest_entry; /* try again */ + } + + /* + * It is safe to read the block now, since the rtt protects the + * block from getting re-allocated to something else by a write. 
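+ *
+ * (Editor's cross-reference: btt_write() below spins on rtt[] before
+ * reusing a free block, so the block being read here stays intact
+ * until this lane clears its rtt entry again.)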
+ */ + uint64_t data_block_off = + arenap->dataoff + (uint64_t)(entry & BTT_MAP_ENTRY_LBA_MASK) * + arenap->internal_lbasize; + int readret = (*bttp->ns_cbp->nsread)(bttp->ns, lane, buf, + bttp->lbasize, data_block_off); + + /* done with read, so clear out rtt entry */ + arenap->rtt[lane] = BTT_MAP_ENTRY_ERROR; + + return readret; +} + +/* + * map_lock -- (internal) grab the map_lock and read a map entry + */ +static int +map_lock(struct btt *bttp, unsigned lane, struct arena *arenap, + uint32_t *entryp, uint32_t premap_lba) +{ + LOG(3, "bttp %p lane %u arenap %p premap_lba %u", + bttp, lane, arenap, premap_lba); + + uint64_t map_entry_off = + arenap->mapoff + BTT_MAP_ENTRY_SIZE * premap_lba; + uint32_t map_lock_num = get_map_lock_num(premap_lba, bttp->nfree); + + util_mutex_lock(&arenap->map_locks[map_lock_num]); + + /* read the old map entry */ + if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, entryp, + sizeof(uint32_t), map_entry_off) < 0) { + util_mutex_unlock(&arenap->map_locks[map_lock_num]); + return -1; + } + + /* if map entry is in its initial state return premap_lba */ + if (map_entry_is_initial(*entryp)) + *entryp = htole32(premap_lba | BTT_MAP_ENTRY_NORMAL); + + LOG(9, "locked map[%d]: %u%s%s", premap_lba, + *entryp & BTT_MAP_ENTRY_LBA_MASK, + (map_entry_is_error(*entryp)) ? " ERROR" : "", + (map_entry_is_zero(*entryp)) ? " ZERO" : ""); + + return 0; +} + +/* + * map_abort -- (internal) drop the map_lock without updating the entry + */ +static void +map_abort(struct btt *bttp, unsigned lane, struct arena *arenap, + uint32_t premap_lba) +{ + LOG(3, "bttp %p lane %u arenap %p premap_lba %u", + bttp, lane, arenap, premap_lba); + + util_mutex_unlock(&arenap->map_locks[get_map_lock_num(premap_lba, + bttp->nfree)]); +} + +/* + * map_unlock -- (internal) update the map and drop the map_lock + */ +static int +map_unlock(struct btt *bttp, unsigned lane, struct arena *arenap, + uint32_t entry, uint32_t premap_lba) +{ + LOG(3, "bttp %p lane %u arenap %p entry %u premap_lba %u", + bttp, lane, arenap, entry, premap_lba); + + uint64_t map_entry_off = + arenap->mapoff + BTT_MAP_ENTRY_SIZE * premap_lba; + + /* write the new map entry */ + int err = (*bttp->ns_cbp->nswrite)(bttp->ns, lane, &entry, + sizeof(uint32_t), map_entry_off); + + util_mutex_unlock(&arenap->map_locks[get_map_lock_num(premap_lba, + bttp->nfree)]); + + LOG(9, "unlocked map[%d]: %u%s%s", premap_lba, + entry & BTT_MAP_ENTRY_LBA_MASK, + (map_entry_is_error(entry)) ? " ERROR" : "", + (map_entry_is_zero(entry)) ? " ZERO" : ""); + + return err; +} + +/* + * btt_write -- write a block to a btt namespace + * + * Returns 0 on success, otherwise -1/errno. 
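+ *
+ * Editor's summary of the update protocol implemented below:
+ * (1) write the data into the free block owned by flog[lane],
+ * (2) update the on-media flog entry for this lane, and
+ * (3) publish the new mapping by updating the map entry under the
+ * map lock. A crash between (2) and (3) is meant to be repaired by
+ * the flog recovery that read_layout() performs on the next open.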
+ */ +int +btt_write(struct btt *bttp, unsigned lane, uint64_t lba, const void *buf) +{ + LOG(3, "bttp %p lane %u lba %" PRIu64, bttp, lane, lba); + + if (invalid_lba(bttp, lba)) + return -1; + + /* first write through here will initialize the metadata layout */ + if (!bttp->laidout) { + int err = 0; + + util_mutex_lock(&bttp->layout_write_mutex); + + if (!bttp->laidout) + err = write_layout(bttp, lane, 1); + + util_mutex_unlock(&bttp->layout_write_mutex); + + if (err < 0) + return err; + } + + /* find which arena LBA lives in, and the offset to the map entry */ + struct arena *arenap; + uint32_t premap_lba; + if (lba_to_arena_lba(bttp, lba, &arenap, &premap_lba) < 0) + return -1; + + /* if the arena is in an error state, writing is not allowed */ + if (arenap->flags & BTTINFO_FLAG_ERROR_MASK) { + ERR("EIO due to btt_info error flags 0x%x", + arenap->flags & BTTINFO_FLAG_ERROR_MASK); + errno = EIO; + return -1; + } + + /* + * This routine was passed a unique "lane" which is an index + * into the flog. That means the free block held by flog[lane] + * is assigned to this thread and to no other threads (no additional + * locking required). So start by performing the write to the + * free block. It is only safe to write to a free block if it + * doesn't appear in the read tracking table, so scan that first + * and if found, wait for the thread reading from it to finish. + */ + uint32_t free_entry = (arenap->flogs[lane].flog.old_map & + BTT_MAP_ENTRY_LBA_MASK) | BTT_MAP_ENTRY_NORMAL; + + LOG(3, "free_entry %u (before mask %u)", free_entry, + arenap->flogs[lane].flog.old_map); + + /* wait for other threads to finish any reads on free block */ + for (unsigned i = 0; i < bttp->nlane; i++) + while (arenap->rtt[i] == free_entry) + ; + + /* it is now safe to perform write to the free block */ + uint64_t data_block_off = arenap->dataoff + + (uint64_t)(free_entry & BTT_MAP_ENTRY_LBA_MASK) * + arenap->internal_lbasize; + if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, buf, + bttp->lbasize, data_block_off) < 0) + return -1; + + /* + * Make the new block active atomically by updating the on-media flog + * and then updating the map. + */ + uint32_t old_entry; + if (map_lock(bttp, lane, arenap, &old_entry, premap_lba) < 0) + return -1; + + old_entry = le32toh(old_entry); + + /* update the flog */ + if (flog_update(bttp, lane, arenap, premap_lba, + old_entry, free_entry) < 0) { + map_abort(bttp, lane, arenap, premap_lba); + return -1; + } + + if (map_unlock(bttp, lane, arenap, htole32(free_entry), + premap_lba) < 0) { + /* + * A critical write error occurred, set the arena's + * info block error bit. + */ + set_arena_error(bttp, arenap, lane); + errno = EIO; + return -1; + } + + return 0; +} + +/* + * map_entry_setf -- (internal) set a given flag on a map entry + * + * Returns 0 on success, otherwise -1/errno. + */ +static int +map_entry_setf(struct btt *bttp, unsigned lane, uint64_t lba, uint32_t setf) +{ + LOG(3, "bttp %p lane %u lba %" PRIu64 " setf 0x%x", + bttp, lane, lba, setf); + + if (invalid_lba(bttp, lba)) + return -1; + + if (!bttp->laidout) { + /* + * No layout is written yet. If the flag being set + * is the zero flag, it is superfluous since all blocks + * read as zero at this point. + */ + if (setf == BTT_MAP_ENTRY_ZERO) + return 0; + + /* + * Treat this like the first write and write out + * the metadata layout at this point. 
+ */ + int err = 0; + util_mutex_lock(&bttp->layout_write_mutex); + + if (!bttp->laidout) + err = write_layout(bttp, lane, 1); + + util_mutex_unlock(&bttp->layout_write_mutex); + + if (err < 0) + return err; + } + + /* find which arena LBA lives in, and the offset to the map entry */ + struct arena *arenap; + uint32_t premap_lba; + if (lba_to_arena_lba(bttp, lba, &arenap, &premap_lba) < 0) + return -1; + + /* if the arena is in an error state, writing is not allowed */ + if (arenap->flags & BTTINFO_FLAG_ERROR_MASK) { + ERR("EIO due to btt_info error flags 0x%x", + arenap->flags & BTTINFO_FLAG_ERROR_MASK); + errno = EIO; + return -1; + } + + /* + * Set the flags in the map entry. To do this, read the + * current map entry, set the flags, and write out the update. + */ + uint32_t old_entry; + uint32_t new_entry; + + if (map_lock(bttp, lane, arenap, &old_entry, premap_lba) < 0) + return -1; + + old_entry = le32toh(old_entry); + + if (setf == BTT_MAP_ENTRY_ZERO && + map_entry_is_zero_or_initial(old_entry)) { + map_abort(bttp, lane, arenap, premap_lba); + return 0; /* block already zero, nothing to do */ + } + + /* create the new map entry */ + new_entry = (old_entry & BTT_MAP_ENTRY_LBA_MASK) | setf; + + if (map_unlock(bttp, lane, arenap, htole32(new_entry), premap_lba) < 0) + return -1; + + return 0; +} + +/* + * btt_set_zero -- mark a block as zeroed in a btt namespace + * + * Returns 0 on success, otherwise -1/errno. + */ +int +btt_set_zero(struct btt *bttp, unsigned lane, uint64_t lba) +{ + LOG(3, "bttp %p lane %u lba %" PRIu64, bttp, lane, lba); + + return map_entry_setf(bttp, lane, lba, BTT_MAP_ENTRY_ZERO); +} + +/* + * btt_set_error -- mark a block as in an error state in a btt namespace + * + * Returns 0 on success, otherwise -1/errno. + */ +int +btt_set_error(struct btt *bttp, unsigned lane, uint64_t lba) +{ + LOG(3, "bttp %p lane %u lba %" PRIu64, bttp, lane, lba); + + return map_entry_setf(bttp, lane, lba, BTT_MAP_ENTRY_ERROR); +} + +/* + * check_arena -- (internal) perform a consistency check on an arena + */ +static int +check_arena(struct btt *bttp, struct arena *arenap) +{ + LOG(3, "bttp %p arenap %p", bttp, arenap); + + int consistent = 1; + + uint64_t map_entry_off = arenap->mapoff; + uint32_t bitmapsize = howmany(arenap->internal_nlba, 8); + uint8_t *bitmap = Zalloc(bitmapsize); + if (bitmap == NULL) { + ERR("!Malloc for bitmap"); + return -1; + } + + /* + * Go through every post-map LBA mentioned in the map and make sure + * there are no duplicates. bitmap is used to track which LBAs have + * been seen so far. + */ + uint32_t *mapp = NULL; + ssize_t mlen; + int next_index = 0; + size_t remaining = 0; + for (uint32_t i = 0; i < arenap->external_nlba; i++) { + uint32_t entry; + + if (remaining == 0) { + /* request a mapping of remaining map area */ + size_t req_len = + (arenap->external_nlba - i) * sizeof(uint32_t); + mlen = (*bttp->ns_cbp->nsmap)(bttp->ns, 0, + (void **)&mapp, req_len, map_entry_off); + + if (mlen < 0) + return -1; + + remaining = (size_t)mlen; + next_index = 0; + } + entry = le32toh(mapp[next_index]); + + /* for debug, dump non-zero map entries at log level 11 */ + if (map_entry_is_zero_or_initial(entry) == 0) + LOG(11, "map[%d]: %u%s", i, + entry & BTT_MAP_ENTRY_LBA_MASK, + (map_entry_is_error(entry)) ? 
" ERROR" : ""); + + /* this is an uninitialized map entry, set the default value */ + if (map_entry_is_initial(entry)) + entry = i; + else + entry &= BTT_MAP_ENTRY_LBA_MASK; + + /* check if entry is valid */ + if (entry >= arenap->internal_nlba) { + ERR("map[%d] entry out of bounds: %u", i, entry); + errno = EINVAL; + return -1; + } + + if (util_isset(bitmap, entry)) { + ERR("map[%d] duplicate entry: %u", i, entry); + consistent = 0; + } else + util_setbit(bitmap, entry); + + map_entry_off += sizeof(uint32_t); + next_index++; + ASSERT(remaining >= sizeof(uint32_t)); + remaining -= sizeof(uint32_t); + } + + /* + * Go through the free blocks in the flog, adding them to bitmap + * and checking for duplications. It is sufficient to read the + * run-time flog here, avoiding more calls to nsread. + */ + for (uint32_t i = 0; i < bttp->nfree; i++) { + uint32_t entry = arenap->flogs[i].flog.old_map; + entry &= BTT_MAP_ENTRY_LBA_MASK; + + if (util_isset(bitmap, entry)) { + ERR("flog[%u] duplicate entry: %u", i, entry); + consistent = 0; + } else + util_setbit(bitmap, entry); + } + + /* + * Make sure every possible post-map LBA was accounted for + * in the two loops above. + */ + for (uint32_t i = 0; i < arenap->internal_nlba; i++) + if (util_isclr(bitmap, i)) { + ERR("unreferenced lba: %d", i); + consistent = 0; + } + + Free(bitmap); + + return consistent; +} + +/* + * btt_check -- perform a consistency check on a btt namespace + * + * This routine contains a fairly high-impact set of consistency checks. + * It may use a good amount of dynamic memory and CPU time performing + * the checks. Any lightweight, quick consistency checks are included + * in read_layout() so they happen every time the BTT area is opened + * for use. + * + * Returns true if consistent, zero if inconsistent, -1/error if checking + * cannot happen due to other errors. + * + * No lane number required here because only one thread is allowed -- all + * other threads must be locked out of all btt routines for this btt + * namespace while this is running. + */ +int +btt_check(struct btt *bttp) +{ + LOG(3, "bttp %p", bttp); + + int consistent = 1; + + if (!bttp->laidout) { + /* consistent by definition */ + LOG(3, "no layout yet"); + return consistent; + } + + /* XXX report issues found during read_layout (from flags) */ + + /* for each arena... */ + struct arena *arenap = bttp->arenas; + for (unsigned i = 0; i < bttp->narena; i++, arenap++) { + /* + * Perform the consistency checks for the arena. 
+ */
+ int retval = check_arena(bttp, arenap);
+ if (retval < 0)
+ return retval;
+ else if (retval == 0)
+ consistent = 0;
+ }
+
+ /* XXX stub */
+ return consistent;
+}
+
+/*
+ * btt_fini -- delete opaque btt info, done using btt namespace
+ */
+void
+btt_fini(struct btt *bttp)
+{
+ LOG(3, "bttp %p", bttp);
+
+ if (bttp->arenas) {
+ for (unsigned i = 0; i < bttp->narena; i++) {
+ if (bttp->arenas[i].flogs)
+ Free(bttp->arenas[i].flogs);
+ if (bttp->arenas[i].rtt)
+ Free((void *)bttp->arenas[i].rtt);
+ if (bttp->arenas[i].map_locks)
+ Free((void *)bttp->arenas[i].map_locks);
+ }
+ Free(bttp->arenas);
+ }
+ Free(bttp);
+}
diff --git a/src/pmdk/src/libpmemblk/btt.h b/src/pmdk/src/libpmemblk/btt.h
new file mode 100644
index 000000000..94f699427
--- /dev/null
+++ b/src/pmdk/src/libpmemblk/btt.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2014-2020, Intel Corporation */
+
+/*
+ * btt.h -- btt module definitions
+ */
+
+#ifndef BTT_H
+#define BTT_H 1
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+/* callback functions passed to btt_init() */
+struct ns_callback {
+ int (*nsread)(void *ns, unsigned lane,
+ void *buf, size_t count, uint64_t off);
+ int (*nswrite)(void *ns, unsigned lane,
+ const void *buf, size_t count, uint64_t off);
+ int (*nszero)(void *ns, unsigned lane, size_t count, uint64_t off);
+ ssize_t (*nsmap)(void *ns, unsigned lane, void **addrp,
+ size_t len, uint64_t off);
+ void (*nssync)(void *ns, unsigned lane, void *addr, size_t len);
+
+ int ns_is_zeroed;
+};
+
+struct btt_info;
+
+struct btt *btt_init(uint64_t rawsize, uint32_t lbasize, uint8_t parent_uuid[],
+ unsigned maxlane, void *ns, const struct ns_callback *ns_cbp);
+unsigned btt_nlane(struct btt *bttp);
+size_t btt_nlba(struct btt *bttp);
+int btt_read(struct btt *bttp, unsigned lane, uint64_t lba, void *buf);
+int btt_write(struct btt *bttp, unsigned lane, uint64_t lba, const void *buf);
+int btt_set_zero(struct btt *bttp, unsigned lane, uint64_t lba);
+int btt_set_error(struct btt *bttp, unsigned lane, uint64_t lba);
+int btt_check(struct btt *bttp);
+void btt_fini(struct btt *bttp);
+
+uint64_t btt_flog_size(uint32_t nfree);
+uint64_t btt_map_size(uint32_t external_nlba);
+uint64_t btt_arena_datasize(uint64_t arena_size, uint32_t nfree);
+int btt_info_set(struct btt_info *info, uint32_t external_lbasize,
+ uint32_t nfree, uint64_t arena_size, uint64_t space_left);
+
+struct btt_flog *btt_flog_get_valid(struct btt_flog *flog_pair, int *next);
+int map_entry_is_initial(uint32_t map_entry);
+void btt_info_convert2h(struct btt_info *infop);
+void btt_info_convert2le(struct btt_info *infop);
+void btt_flog_convert2h(struct btt_flog *flogp);
+void btt_flog_convert2le(struct btt_flog *flogp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/pmdk/src/libpmemblk/btt_layout.h b/src/pmdk/src/libpmemblk/btt_layout.h
new file mode 100644
index 000000000..8fa33f985
--- /dev/null
+++ b/src/pmdk/src/libpmemblk/btt_layout.h
@@ -0,0 +1,107 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2014-2020, Intel Corporation */
+
+/*
+ * btt_layout.h -- block translation table on-media layout definitions
+ */
+
+/*
+ * Layout of BTT info block. All integers are stored little-endian.
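+ *
+ * Editor's sketch of a single arena's on-media layout, following the
+ * offset fields defined below (not part of the original patch):
+ *
+ * +------------+-------------+--------+--------+-------------+
+ * | info block | data blocks |  map   |  flog  | backup info |
+ * +------------+-------------+--------+--------+-------------+
+ * 0            dataoff       mapoff   flogoff  infooff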
+ */ + +#ifndef BTT_LAYOUT_H +#define BTT_LAYOUT_H 1 + +#ifdef __cplusplus +extern "C" { +#endif + +#define BTT_ALIGNMENT ((uintptr_t)4096) /* alignment of all BTT structures */ +#define BTTINFO_SIG_LEN 16 +#define BTTINFO_UUID_LEN 16 +#define BTTINFO_UNUSED_LEN 3968 +#define BTTINFO_SIG "BTT_ARENA_INFO\0" + +struct btt_info { + char sig[BTTINFO_SIG_LEN]; /* must be "BTT_ARENA_INFO\0\0" */ + uint8_t uuid[BTTINFO_UUID_LEN]; /* BTT UUID */ + uint8_t parent_uuid[BTTINFO_UUID_LEN]; /* UUID of container */ + uint32_t flags; /* see flag bits below */ + uint16_t major; /* major version */ + uint16_t minor; /* minor version */ + uint32_t external_lbasize; /* advertised LBA size (bytes) */ + uint32_t external_nlba; /* advertised LBAs in this arena */ + uint32_t internal_lbasize; /* size of data area blocks (bytes) */ + uint32_t internal_nlba; /* number of blocks in data area */ + uint32_t nfree; /* number of free blocks */ + uint32_t infosize; /* size of this info block */ + + /* + * The following offsets are relative to the beginning of + * the btt_info block. + */ + uint64_t nextoff; /* offset to next arena (or zero) */ + uint64_t dataoff; /* offset to arena data area */ + uint64_t mapoff; /* offset to area map */ + uint64_t flogoff; /* offset to area flog */ + uint64_t infooff; /* offset to backup info block */ + + char unused[BTTINFO_UNUSED_LEN]; /* must be zero */ + + uint64_t checksum; /* Fletcher64 of all fields */ +}; + +/* + * Definitions for flags mask for btt_info structure above. + */ +#define BTTINFO_FLAG_ERROR 0x00000001 /* error state (read-only) */ +#define BTTINFO_FLAG_ERROR_MASK 0x00000001 /* all error bits */ + +/* + * Current on-media format versions. + */ +#define BTTINFO_MAJOR_VERSION 1 +#define BTTINFO_MINOR_VERSION 1 + +/* + * Layout of a BTT "flog" entry. All integers are stored little-endian. + * + * The "nfree" field in the BTT info block determines how many of these + * flog entries there are, and each entry consists of two of the following + * structs (entry updates alternate between the two structs), padded up + * to a cache line boundary to isolate adjacent updates. + */ + +#define BTT_FLOG_PAIR_ALIGN ((uintptr_t)64) + +struct btt_flog { + uint32_t lba; /* last pre-map LBA using this entry */ + uint32_t old_map; /* old post-map LBA (the freed block) */ + uint32_t new_map; /* new post-map LBA */ + uint32_t seq; /* sequence number (01, 10, 11) */ +}; + +/* + * Layout of a BTT "map" entry. 4-byte internal LBA offset, little-endian. + */ +#define BTT_MAP_ENTRY_SIZE 4 +#define BTT_MAP_ENTRY_ERROR 0x40000000U +#define BTT_MAP_ENTRY_ZERO 0x80000000U +#define BTT_MAP_ENTRY_NORMAL 0xC0000000U +#define BTT_MAP_ENTRY_LBA_MASK 0x3fffffffU +#define BTT_MAP_LOCK_ALIGN ((uintptr_t)64) + +/* + * BTT layout properties... 
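+ *
+ * (Editor's example of decoding a map entry, referring to the bits
+ * above: 0xC0000005 is BTT_MAP_ENTRY_NORMAL with post-map LBA 5;
+ * 0x80000005 is a zeroed block whose old LBA is preserved by
+ * btt_set_zero(); an entry with both flag bits clear is in the
+ * initial state and maps a pre-map LBA to itself.)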
+ */ +#define BTT_MIN_SIZE ((1u << 20) * 16) +#define BTT_MAX_ARENA (1ull << 39) /* 512GB per arena */ +#define BTT_MIN_LBA_SIZE (size_t)512 +#define BTT_INTERNAL_LBA_ALIGNMENT 256U +#define BTT_DEFAULT_NFREE 256 + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/pmdk/src/libpmemblk/libpmemblk.c b/src/pmdk/src/libpmemblk/libpmemblk.c new file mode 100644 index 000000000..21675eff7 --- /dev/null +++ b/src/pmdk/src/libpmemblk/libpmemblk.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* Copyright 2014-2018, Intel Corporation */ + +/* + * libpmemblk.c -- pmem entry points for libpmemblk + */ + +#include <stdio.h> +#include <stdint.h> + +#include "libpmemblk.h" +#include "ctl_global.h" + +#include "pmemcommon.h" +#include "blk.h" + +/* + * The variable from which the config is directly loaded. The string + * cannot contain any comments or extraneous white characters. + */ +#define BLK_CONFIG_ENV_VARIABLE "PMEMBLK_CONF" + +/* + * The variable that points to a config file from which the config is loaded. + */ +#define BLK_CONFIG_FILE_ENV_VARIABLE "PMEMBLK_CONF_FILE" + +/* + * blk_ctl_init_and_load -- (static) initializes CTL and loads configuration + * from env variable and file + */ +static int +blk_ctl_init_and_load(PMEMblkpool *pbp) +{ + LOG(3, "pbp %p", pbp); + + if (pbp != NULL && (pbp->ctl = ctl_new()) == NULL) { + LOG(2, "!ctl_new"); + return -1; + } + + char *env_config = os_getenv(BLK_CONFIG_ENV_VARIABLE); + if (env_config != NULL) { + if (ctl_load_config_from_string(pbp ? pbp->ctl : NULL, + pbp, env_config) != 0) { + LOG(2, "unable to parse config stored in %s " + "environment variable", + BLK_CONFIG_ENV_VARIABLE); + goto err; + } + } + + char *env_config_file = os_getenv(BLK_CONFIG_FILE_ENV_VARIABLE); + if (env_config_file != NULL && env_config_file[0] != '\0') { + if (ctl_load_config_from_file(pbp ? pbp->ctl : NULL, + pbp, env_config_file) != 0) { + LOG(2, "unable to parse config stored in %s " + "file (from %s environment variable)", + env_config_file, + BLK_CONFIG_FILE_ENV_VARIABLE); + goto err; + } + } + + return 0; +err: + if (pbp) + ctl_delete(pbp->ctl); + return -1; +} + +/* + * libpmemblk_init -- (internal) load-time initialization for blk + * + * Called automatically by the run-time loader. + */ +ATTR_CONSTRUCTOR +void +libpmemblk_init(void) +{ + ctl_global_register(); + + if (blk_ctl_init_and_load(NULL)) + FATAL("error: %s", pmemblk_errormsg()); + + common_init(PMEMBLK_LOG_PREFIX, PMEMBLK_LOG_LEVEL_VAR, + PMEMBLK_LOG_FILE_VAR, PMEMBLK_MAJOR_VERSION, + PMEMBLK_MINOR_VERSION); + LOG(3, NULL); +} + +/* + * libpmemblk_fini -- libpmemblk cleanup routine + * + * Called automatically when the process terminates. 
+ */ +ATTR_DESTRUCTOR +void +libpmemblk_fini(void) +{ + LOG(3, NULL); + common_fini(); +} + +/* + * pmemblk_check_versionU -- see if lib meets application version requirements + */ +#ifndef _WIN32 +static inline +#endif +const char * +pmemblk_check_versionU(unsigned major_required, unsigned minor_required) +{ + LOG(3, "major_required %u minor_required %u", + major_required, minor_required); + + if (major_required != PMEMBLK_MAJOR_VERSION) { + ERR("libpmemblk major version mismatch (need %u, found %u)", + major_required, PMEMBLK_MAJOR_VERSION); + return out_get_errormsg(); + } + + if (minor_required > PMEMBLK_MINOR_VERSION) { + ERR("libpmemblk minor version mismatch (need %u, found %u)", + minor_required, PMEMBLK_MINOR_VERSION); + return out_get_errormsg(); + } + + return NULL; +} + +#ifndef _WIN32 +/* + * pmemblk_check_version -- see if lib meets application version requirements + */ +const char * +pmemblk_check_version(unsigned major_required, unsigned minor_required) +{ + return pmemblk_check_versionU(major_required, minor_required); +} +#else +/* + * pmemblk_check_versionW -- see if lib meets application version requirements + */ +const wchar_t * +pmemblk_check_versionW(unsigned major_required, unsigned minor_required) +{ + if (pmemblk_check_versionU(major_required, minor_required) != NULL) + return out_get_errormsgW(); + else + return NULL; +} +#endif + +/* + * pmemblk_set_funcs -- allow overriding libpmemblk's call to malloc, etc. + */ +void +pmemblk_set_funcs( + void *(*malloc_func)(size_t size), + void (*free_func)(void *ptr), + void *(*realloc_func)(void *ptr, size_t size), + char *(*strdup_func)(const char *s)) +{ + LOG(3, NULL); + + util_set_alloc_funcs(malloc_func, free_func, realloc_func, strdup_func); +} + +/* + * pmemblk_errormsgU -- return last error message + */ +#ifndef _WIN32 +static inline +#endif +const char * +pmemblk_errormsgU(void) +{ + return out_get_errormsg(); +} + +#ifndef _WIN32 +/* + * pmemblk_errormsg -- return last error message + */ +const char * +pmemblk_errormsg(void) +{ + return pmemblk_errormsgU(); +} +#else +/* + * pmemblk_errormsgW -- return last error message as wchar_t + */ +const wchar_t * +pmemblk_errormsgW(void) +{ + return out_get_errormsgW(); +} +#endif diff --git a/src/pmdk/src/libpmemblk/libpmemblk.def b/src/pmdk/src/libpmemblk/libpmemblk.def new file mode 100644 index 000000000..fa7f91f14 --- /dev/null +++ b/src/pmdk/src/libpmemblk/libpmemblk.def @@ -0,0 +1,36 @@ +;;;; Begin Copyright Notice +; SPDX-License-Identifier: BSD-3-Clause +; Copyright 2015-2018, Intel Corporation +;;;; End Copyright Notice + +LIBRARY libpmemblk + +VERSION 1.0 + +EXPORTS + pmemblk_check_versionU + pmemblk_check_versionW + pmemblk_set_funcs + pmemblk_errormsgU + pmemblk_errormsgW + pmemblk_createU + pmemblk_createW + pmemblk_openU + pmemblk_openW + pmemblk_close + pmemblk_checkU + pmemblk_checkW + pmemblk_ctl_execU; + pmemblk_ctl_execW; + pmemblk_ctl_getU; + pmemblk_ctl_getW; + pmemblk_ctl_setU; + pmemblk_ctl_setW; + pmemblk_bsize + pmemblk_nblock + pmemblk_read + pmemblk_write + pmemblk_set_zero + pmemblk_set_error + + DllMain diff --git a/src/pmdk/src/libpmemblk/libpmemblk.link.in b/src/pmdk/src/libpmemblk/libpmemblk.link.in new file mode 100644 index 000000000..b61e83ced --- /dev/null +++ b/src/pmdk/src/libpmemblk/libpmemblk.link.in @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2014-2019, Intel Corporation +# +# +# src/libpmemblk.link -- linker link file for libpmemblk +# +LIBPMEMBLK_1.0 { + global: + pmemblk_check_version; + 
pmemblk_set_funcs; + pmemblk_errormsg; + pmemblk_create; + pmemblk_open; + pmemblk_close; + pmemblk_check; + pmemblk_ctl_exec; + pmemblk_ctl_get; + pmemblk_ctl_set; + pmemblk_nblock; + pmemblk_read; + pmemblk_write; + pmemblk_set_zero; + pmemblk_set_error; + pmemblk_bsize; + fault_injection; + local: + *; +}; diff --git a/src/pmdk/src/libpmemblk/libpmemblk.rc b/src/pmdk/src/libpmemblk/libpmemblk.rc new file mode 100644 index 000000000..b95b6252e --- /dev/null +++ b/src/pmdk/src/libpmemblk/libpmemblk.rc @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* Copyright 2016, Intel Corporation */ + +/* + * libpmemblk.rc -- libpmemblk resource file + */ + +#include <windows.h> +#define FILE_NAME "libpmemblk.dll" +#define DESCRIPTION "libpmemblk - persistent memory resident array of blocks" +#define TYPE VFT_DLL +#include <common.rc>
\ No newline at end of file diff --git a/src/pmdk/src/libpmemblk/libpmemblk.vcxproj b/src/pmdk/src/libpmemblk/libpmemblk.vcxproj new file mode 100644 index 000000000..680052a3f --- /dev/null +++ b/src/pmdk/src/libpmemblk/libpmemblk.vcxproj @@ -0,0 +1,133 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup Label="ProjectConfigurations"> + <ProjectConfiguration Include="Debug|x64"> + <Configuration>Debug</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Release|x64"> + <Configuration>Release</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + </ItemGroup> + <ItemGroup> + <ClCompile Include="..\..\src\libpmemblk\blk.c" /> + <ClCompile Include="..\..\src\libpmemblk\btt.c" /> + <ClCompile Include="..\..\src\libpmemblk\libpmemblk.c" /> + <ClCompile Include="..\core\alloc.c" /> + <ClCompile Include="..\common\set_badblocks.c" /> + <ClCompile Include="..\common\ctl.c" /> + <ClCompile Include="..\common\ctl_cow.c" /> + <ClCompile Include="..\common\ctl_prefault.c" /> + <ClCompile Include="..\common\ctl_sds.c" /> + <ClCompile Include="..\common\ctl_fallocate.c" /> + <ClCompile Include="..\common\file.c" /> + <ClCompile Include="..\common\file_windows.c" /> + <ClCompile Include="..\common\mmap.c" /> + <ClCompile Include="..\common\mmap_windows.c" /> + <ClCompile Include="..\core\fs_windows.c" /> + <ClCompile Include="..\common\bad_blocks.c" /> + <ClCompile Include="..\common\os_deep_windows.c" /> + <ClCompile Include="..\core\os_thread_windows.c" /> + <ClCompile Include="..\core\os_windows.c" /> + <ClCompile Include="..\core\out.c" /> + <ClCompile Include="..\common\pool_hdr.c" /> + <ClCompile Include="..\common\set.c" /> + <ClCompile Include="..\common\shutdown_state.c" /> + <ClCompile Include="..\core\util.c" /> + <ClCompile Include="..\core\util_windows.c" /> + <ClCompile Include="..\common\uuid.c" /> + <ClCompile Include="..\common\uuid_windows.c" /> + <ClCompile Include="..\libpmem2\auto_flush_windows.c" /> + <ClCompile Include="..\libpmem2\usc_windows.c" /> + <ClCompile Include="libpmemblk_main.c" /> + <ClCompile Include="..\libpmem2\config.c" /> + <ClCompile Include="..\libpmem2\badblocks.c" /> + <ClCompile Include="..\libpmem2\badblocks_none.c" /> + <ClCompile Include="..\libpmem2\source.c" /> + <ClCompile Include="..\libpmem2\source_windows.c" /> + <ClCompile Include="..\libpmem2\pmem2_utils.c" /> + <ClCompile Include="..\libpmem2\pmem2_utils_other.c" /> + </ItemGroup> + <ItemGroup> + <ClInclude Include="..\..\src\core\out.h" /> + <ClInclude Include="..\..\src\core\util.h" /> + <ClInclude Include="..\..\src\common\valgrind_internal.h" /> + <ClInclude Include="..\..\src\include\libpmemblk.h" /> + <ClInclude Include="..\..\src\libpmemblk\blk.h" /> + <ClInclude Include="..\..\src\libpmemblk\btt.h" /> + <ClInclude Include="..\..\src\libpmemblk\btt_layout.h" /> + <ClInclude Include="..\core\alloc.h" /> + <ClInclude Include="..\common\ctl.h" /> + <ClInclude Include="..\common\ctl_global.h" /> + <ClInclude Include="..\common\dlsym.h" /> + <ClInclude Include="..\core\fault_injection.h" /> + <ClInclude Include="..\common\file.h" /> + <ClInclude Include="..\core\fs.h" /> + <ClInclude Include="..\common\mmap.h" /> + <ClInclude Include="..\core\os.h" /> + <ClInclude Include="..\common\os_deep.h" /> + <ClInclude Include="..\core\os_thread.h" /> + <ClInclude Include="..\common\pmemcommon.h" /> + <ClInclude 
Include="..\common\pool_hdr.h" /> + <ClInclude Include="..\common\set.h" /> + <ClInclude Include="..\common\sys_util.h" /> + <ClInclude Include="..\common\uuid.h" /> + <ClInclude Include="..\libpmem2\auto_flush.h" /> + <ClInclude Include="..\libpmem2\auto_flush_windows.h" /> + <ClInclude Include="..\libpmem2\config.h" /> + <ClInclude Include="..\libpmem2\pmem2_utils.h" /> + </ItemGroup> + <ItemGroup> + <None Include="libpmemblk.def" /> + </ItemGroup> + <ItemGroup> + <ProjectReference Include="..\libpmem\libpmem.vcxproj"> + <Project>{9e9e3d25-2139-4a5d-9200-18148ddead45}</Project> + </ProjectReference> + <ProjectReference Include="..\windows\srcversion\srcversion.vcxproj"> + <Project>{901f04db-e1a5-4a41-8b81-9d31c19acd59}</Project> + </ProjectReference> + </ItemGroup> + <ItemGroup> + <ResourceCompile Include="libpmemblk.rc" /> + </ItemGroup> + <PropertyGroup Label="Globals"> + <ProjectGuid>{f7c6c6b6-4142-4c82-8699-4a9d8183181b}</ProjectGuid> + <Keyword>DynamicLibrary</Keyword> + <ProjectName>libpmemblk</ProjectName> + <RootNamespace>libpmemblk</RootNamespace> + <DefaultLanguage>en-US</DefaultLanguage> + <MinimumVisualStudioVersion>14.0</MinimumVisualStudioVersion> + <WindowsTargetPlatformVersion>10.0.17134.0</WindowsTargetPlatformVersion> + <WindowsTargetPlatformMinVersion>10.0.10240.0</WindowsTargetPlatformMinVersion> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> + <ConfigurationType>DynamicLibrary</ConfigurationType> + <UseDebugLibraries>true</UseDebugLibraries> + <PlatformToolset>v140</PlatformToolset> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> + <ConfigurationType>DynamicLibrary</ConfigurationType> + <UseDebugLibraries>false</UseDebugLibraries> + <WholeProgramOptimization>false</WholeProgramOptimization> + <PlatformToolset>v140</PlatformToolset> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + <Import Project="..\windows\libs_debug.props" /> + </ImportGroup> + <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + <Import Project="..\windows\libs_release.props" /> + </ImportGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" /> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" /> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/src/pmdk/src/libpmemblk/libpmemblk.vcxproj.filters b/src/pmdk/src/libpmemblk/libpmemblk.vcxproj.filters new file mode 100644 index 000000000..198595944 --- /dev/null +++ b/src/pmdk/src/libpmemblk/libpmemblk.vcxproj.filters @@ -0,0 +1,217 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Header Files"> + <UniqueIdentifier>{5f4b56cf-a674-4f35-abfa-d867d9d91f68}</UniqueIdentifier> + </Filter> + <Filter Include="Source Files"> + <UniqueIdentifier>{dee0ff57-9af8-485a-888b-0087d6e11cf8}</UniqueIdentifier> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="..\..\src\libpmemblk\blk.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\..\src\libpmemblk\btt.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\..\src\libpmemblk\libpmemblk.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="libpmemblk_main.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\common\uuid_windows.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\common\uuid.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\core\util_windows.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\core\util.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\common\set.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\common\pool_hdr.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\core\out.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\common\mmap_windows.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\core\fs_windows.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\common\mmap.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\common\file_windows.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\common\file.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\core\os_windows.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\core\os_thread_windows.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\common\shutdown_state.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\common\os_deep_windows.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\common\set_badblocks.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\common\bad_blocks.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\common\ctl.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\common\ctl_prefault.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\common\ctl_fallocate.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\common\ctl_sds.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\core\alloc.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\common\ctl_cow.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\libpmem2\auto_flush_windows.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\libpmem2\config.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\libpmem2\badblocks.c"> 
+ <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\libpmem2\badblocks_none.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\libpmem2\source.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\libpmem2\source_windows.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\libpmem2\pmem2_utils.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\libpmem2\pmem2_utils_other.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\libpmem2\usc_windows.c"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> + <ItemGroup> + <ClInclude Include="..\..\src\include\libpmemblk.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\..\src\libpmemblk\blk.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\..\src\libpmemblk\btt.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\..\src\libpmemblk\btt_layout.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\..\src\core\out.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\..\src\core\util.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\..\src\common\valgrind_internal.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\common\uuid.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\common\sys_util.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\common\set.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\common\pool_hdr.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\common\pmemcommon.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\common\mmap.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\common\file.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\common\dlsym.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\core\fs.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\core\os.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\core\os_thread.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\common\os_deep.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\common\ctl_global.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\common\ctl.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\core\alloc.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\core\fault_injection.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\libpmem2\auto_flush.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\libpmem2\auto_flush_windows.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\libpmem2\config.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\libpmem2\pmem2_utils.h"> + <Filter>Header Files</Filter> + </ClInclude> + </ItemGroup> + <ItemGroup> + <None Include="libpmemblk.def"> + <Filter>Source Files</Filter> + </None> + </ItemGroup> + <ItemGroup> + <ResourceCompile Include="libpmemblk.rc"> + <Filter>Source Files</Filter> + </ResourceCompile> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/src/pmdk/src/libpmemblk/libpmemblk_main.c b/src/pmdk/src/libpmemblk/libpmemblk_main.c new file mode 100644 index 000000000..28c547d2e --- /dev/null +++ b/src/pmdk/src/libpmemblk/libpmemblk_main.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* Copyright 2015-2017, Intel Corporation */ + +/* + * libpmemblk_main.c -- entry point for libpmemblk.dll + * + * XXX - This is a placeholder. All the library initialization/cleanup + * that is done in library ctors/dtors, as well as TLS initialization + * should be moved here. + */ + +void libpmemblk_init(void); +void libpmemblk_fini(void); + +int APIENTRY +DllMain(HINSTANCE hInstance, DWORD dwReason, LPVOID lpReserved) +{ + switch (dwReason) { + case DLL_PROCESS_ATTACH: + libpmemblk_init(); + break; + + case DLL_THREAD_ATTACH: + case DLL_THREAD_DETACH: + break; + + case DLL_PROCESS_DETACH: + libpmemblk_fini(); + break; + } + return TRUE; +} |
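
Editor's addendum: a minimal caller sketch for the public API exported
above (pmemblk_create, pmemblk_write, pmemblk_read, pmemblk_nblock,
pmemblk_bsize, pmemblk_close and pmemblk_errormsg all appear in the
libpmemblk.def / libpmemblk.link.in export lists). The pool path and
sizes are arbitrary example values, not anything mandated by the patch:

#include <stdio.h>
#include <string.h>
#include <libpmemblk.h>

int
main(void)
{
	/* create a pool of 512-byte blocks, using the library minimum size */
	PMEMblkpool *pbp = pmemblk_create("/pmem/blkpool", 512,
			PMEMBLK_MIN_POOL, 0666);
	if (pbp == NULL) {
		fprintf(stderr, "pmemblk_create: %s\n", pmemblk_errormsg());
		return 1;
	}

	char buf[512] = "hello, btt";
	char out[512];

	/* each write/read is a single atomic block update via the BTT */
	if (pmemblk_write(pbp, buf, 5) < 0 ||
	    pmemblk_read(pbp, out, 5) < 0) {
		fprintf(stderr, "blk I/O: %s\n", pmemblk_errormsg());
		pmemblk_close(pbp);
		return 1;
	}

	printf("%zu blocks of %zu bytes, roundtrip %s\n",
			pmemblk_nblock(pbp), pmemblk_bsize(pbp),
			strcmp(buf, out) == 0 ? "ok" : "MISMATCH");

	pmemblk_close(pbp);
	return 0;
}

The per-thread lane machinery in btt.c stays hidden behind this API; the
point of the flog/map protocol above is that a block is either fully
written or not written at all after a crash.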