Diffstat (limited to 'src/pmdk/src/libpmemblk')
-rw-r--r--  src/pmdk/src/libpmemblk/Makefile                       22
-rw-r--r--  src/pmdk/src/libpmemblk/blk.c                         948
-rw-r--r--  src/pmdk/src/libpmemblk/blk.h                         102
-rw-r--r--  src/pmdk/src/libpmemblk/btt.c                        2051
-rw-r--r--  src/pmdk/src/libpmemblk/btt.h                          59
-rw-r--r--  src/pmdk/src/libpmemblk/btt_layout.h                  107
-rw-r--r--  src/pmdk/src/libpmemblk/libpmemblk.c                  200
-rw-r--r--  src/pmdk/src/libpmemblk/libpmemblk.def                 36
-rw-r--r--  src/pmdk/src/libpmemblk/libpmemblk.link.in             28
-rw-r--r--  src/pmdk/src/libpmemblk/libpmemblk.rc                  12
-rw-r--r--  src/pmdk/src/libpmemblk/libpmemblk.vcxproj            133
-rw-r--r--  src/pmdk/src/libpmemblk/libpmemblk.vcxproj.filters    217
-rw-r--r--  src/pmdk/src/libpmemblk/libpmemblk_main.c              32
13 files changed, 3947 insertions(+), 0 deletions(-)
diff --git a/src/pmdk/src/libpmemblk/Makefile b/src/pmdk/src/libpmemblk/Makefile
new file mode 100644
index 000000000..8f5d99ecd
--- /dev/null
+++ b/src/pmdk/src/libpmemblk/Makefile
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2014-2020, Intel Corporation
+
+#
+# src/libpmemblk/Makefile -- Makefile for libpmemblk
+#
+
+LIBRARY_NAME = pmemblk
+LIBRARY_SO_VERSION = 1
+LIBRARY_VERSION = 0.0
+
+include ../core/pmemcore.inc
+include ../common/pmemcommon.inc
+SOURCE +=\
+ blk.c\
+ btt.c\
+ libpmemblk.c
+
+include ../Makefile.inc
+
+CFLAGS += $(LIBNDCTL_CFLAGS)
+LIBS += -pthread -lpmem $(LIBNDCTL_LIBS)
diff --git a/src/pmdk/src/libpmemblk/blk.c b/src/pmdk/src/libpmemblk/blk.c
new file mode 100644
index 000000000..58f9c9fff
--- /dev/null
+++ b/src/pmdk/src/libpmemblk/blk.c
@@ -0,0 +1,948 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/* Copyright 2014-2020, Intel Corporation */
+
+/*
+ * blk.c -- block memory pool entry points for libpmemblk
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <unistd.h>
+#include <errno.h>
+#include <time.h>
+#include <stdint.h>
+#include <endian.h>
+#include <stdbool.h>
+
+#include "libpmem.h"
+#include "libpmemblk.h"
+
+#include "mmap.h"
+#include "set.h"
+#include "out.h"
+#include "btt.h"
+#include "blk.h"
+#include "util.h"
+#include "sys_util.h"
+#include "util_pmem.h"
+#include "valgrind_internal.h"
+
+static const struct pool_attr Blk_create_attr = {
+ BLK_HDR_SIG,
+ BLK_FORMAT_MAJOR,
+ BLK_FORMAT_FEAT_DEFAULT,
+ {0}, {0}, {0}, {0}, {0}
+};
+
+static const struct pool_attr Blk_open_attr = {
+ BLK_HDR_SIG,
+ BLK_FORMAT_MAJOR,
+ BLK_FORMAT_FEAT_CHECK,
+ {0}, {0}, {0}, {0}, {0}
+};
+
+/*
+ * lane_enter -- (internal) acquire a unique lane number
+ */
+static void
+lane_enter(PMEMblkpool *pbp, unsigned *lane)
+{
+ unsigned mylane;
+
+ mylane = util_fetch_and_add32(&pbp->next_lane, 1) % pbp->nlane;
+
+ /* lane selected, grab the per-lane lock */
+ util_mutex_lock(&pbp->locks[mylane]);
+
+ *lane = mylane;
+}
+
+/*
+ * lane_exit -- (internal) drop lane lock
+ */
+static void
+lane_exit(PMEMblkpool *pbp, unsigned mylane)
+{
+ util_mutex_unlock(&pbp->locks[mylane]);
+}
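+
+/*
+ * A typical caller (see pmemblk_read() below) brackets each btt_*
+ * call with the pair above:
+ *
+ *	unsigned lane;
+ *	lane_enter(pbp, &lane);
+ *	int err = btt_read(pbp->bttp, lane, (uint64_t)blockno, buf);
+ *	lane_exit(pbp, lane);
+ *
+ * With nlane == 4, successive calls to lane_enter() hand out lanes
+ * 0, 1, 2, 3, 0, ... since next_lane is bumped atomically and taken
+ * modulo nlane.
+ */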
+
+/*
+ * nsread -- (internal) read data from the namespace encapsulating the BTT
+ *
+ * This routine is provided to btt_init() to allow the btt module to
+ * do I/O on the memory pool containing the BTT layout.
+ */
+static int
+nsread(void *ns, unsigned lane, void *buf, size_t count, uint64_t off)
+{
+ struct pmemblk *pbp = (struct pmemblk *)ns;
+
+ LOG(13, "pbp %p lane %u count %zu off %" PRIu64, pbp, lane, count, off);
+
+ if (off + count > pbp->datasize) {
+ ERR("offset + count (%zu) past end of data area (%zu)",
+ (size_t)off + count, pbp->datasize);
+ errno = EINVAL;
+ return -1;
+ }
+
+ memcpy(buf, (char *)pbp->data + off, count);
+
+ return 0;
+}
+
+/*
+ * nswrite -- (internal) write data to the namespace encapsulating the BTT
+ *
+ * This routine is provided to btt_init() to allow the btt module to
+ * do I/O on the memory pool containing the BTT layout.
+ */
+static int
+nswrite(void *ns, unsigned lane, const void *buf, size_t count,
+ uint64_t off)
+{
+ struct pmemblk *pbp = (struct pmemblk *)ns;
+
+ LOG(13, "pbp %p lane %u count %zu off %" PRIu64, pbp, lane, count, off);
+
+ if (off + count > pbp->datasize) {
+ ERR("offset + count (%zu) past end of data area (%zu)",
+ (size_t)off + count, pbp->datasize);
+ errno = EINVAL;
+ return -1;
+ }
+
+ void *dest = (char *)pbp->data + off;
+
+#ifdef DEBUG
+ /* grab debug write lock */
+ util_mutex_lock(&pbp->write_lock);
+#endif
+
+ /* unprotect the memory (debug version only) */
+ RANGE_RW(dest, count, pbp->is_dev_dax);
+
+ if (pbp->is_pmem)
+ pmem_memcpy_nodrain(dest, buf, count);
+ else
+ memcpy(dest, buf, count);
+
+ /* protect the memory again (debug version only) */
+ RANGE_RO(dest, count, pbp->is_dev_dax);
+
+#ifdef DEBUG
+ /* release debug write lock */
+ util_mutex_unlock(&pbp->write_lock);
+#endif
+
+ if (pbp->is_pmem)
+ pmem_drain();
+ else
+ pmem_msync(dest, count);
+
+ return 0;
+}
+
+/*
+ * nsmap -- (internal) allow direct access to a range of a namespace
+ *
+ * The caller requests a range to be "mapped" but the return value
+ * may indicate a smaller amount (in which case the caller is expected
+ * to call back later for another mapping).
+ *
+ * This routine is provided to btt_init() to allow the btt module to
+ * do I/O on the memory pool containing the BTT layout.
+ */
+static ssize_t
+nsmap(void *ns, unsigned lane, void **addrp, size_t len, uint64_t off)
+{
+ struct pmemblk *pbp = (struct pmemblk *)ns;
+
+ LOG(12, "pbp %p lane %u len %zu off %" PRIu64, pbp, lane, len, off);
+
+ ASSERT(((ssize_t)len) >= 0);
+
+ if (off + len >= pbp->datasize) {
+ ERR("offset + len (%zu) past end of data area (%zu)",
+ (size_t)off + len, pbp->datasize - 1);
+ errno = EINVAL;
+ return -1;
+ }
+
+ /*
+ * Since the entire file is memory-mapped, this callback
+ * can always provide the entire length requested.
+ */
+ *addrp = (char *)pbp->data + off;
+
+ LOG(12, "returning addr %p", *addrp);
+
+ return (ssize_t)len;
+}
+
+/*
+ * nssync -- (internal) flush changes made to a namespace range
+ *
+ * This is used in conjunction with the addresses handed out by
+ * nsmap() above. There's no need to sync things written via
+ * nswrite() since those changes are flushed each time nswrite()
+ * is called.
+ *
+ * This routine is provided to btt_init() to allow the btt module to
+ * do I/O on the memory pool containing the BTT layout.
+ */
+static void
+nssync(void *ns, unsigned lane, void *addr, size_t len)
+{
+ struct pmemblk *pbp = (struct pmemblk *)ns;
+
+ LOG(12, "pbp %p lane %u addr %p len %zu", pbp, lane, addr, len);
+
+ if (pbp->is_pmem)
+ pmem_persist(addr, len);
+ else
+ pmem_msync(addr, len);
+}
+
+/*
+ * nszero -- (internal) zero data in the namespace encapsulating the BTT
+ *
+ * This routine is provided to btt_init() to allow the btt module to
+ * zero the memory pool containing the BTT layout.
+ */
+static int
+nszero(void *ns, unsigned lane, size_t count, uint64_t off)
+{
+ struct pmemblk *pbp = (struct pmemblk *)ns;
+
+ LOG(13, "pbp %p lane %u count %zu off %" PRIu64, pbp, lane, count, off);
+
+ if (off + count > pbp->datasize) {
+ ERR("offset + count (%zu) past end of data area (%zu)",
+ (size_t)off + count, pbp->datasize);
+ errno = EINVAL;
+ return -1;
+ }
+
+ void *dest = (char *)pbp->data + off;
+
+ /* unprotect the memory (debug version only) */
+ RANGE_RW(dest, count, pbp->is_dev_dax);
+
+ pmem_memset_persist(dest, 0, count);
+
+ /* protect the memory again (debug version only) */
+ RANGE_RO(dest, count, pbp->is_dev_dax);
+
+ return 0;
+}
+
+/* callbacks for btt_init() */
+static struct ns_callback ns_cb = {
+ .nsread = nsread,
+ .nswrite = nswrite,
+ .nszero = nszero,
+ .nsmap = nsmap,
+ .nssync = nssync,
+ .ns_is_zeroed = 0
+};
+
+/*
+ * blk_descr_create -- (internal) create block memory pool descriptor
+ */
+static void
+blk_descr_create(PMEMblkpool *pbp, uint32_t bsize, int zeroed)
+{
+ LOG(3, "pbp %p bsize %u zeroed %d", pbp, bsize, zeroed);
+
+ /* create the required metadata */
+ pbp->bsize = htole32(bsize);
+ util_persist(pbp->is_pmem, &pbp->bsize, sizeof(bsize));
+
+ pbp->is_zeroed = zeroed;
+ util_persist(pbp->is_pmem, &pbp->is_zeroed, sizeof(pbp->is_zeroed));
+}
+
+/*
+ * blk_descr_check -- (internal) validate block memory pool descriptor
+ */
+static int
+blk_descr_check(PMEMblkpool *pbp, size_t *bsize)
+{
+ LOG(3, "pbp %p bsize %zu", pbp, *bsize);
+
+ size_t hdr_bsize = le32toh(pbp->bsize);
+ if (*bsize && *bsize != hdr_bsize) {
+ ERR("wrong bsize (%zu), pool created with bsize %zu",
+ *bsize, hdr_bsize);
+ errno = EINVAL;
+ return -1;
+ }
+ *bsize = hdr_bsize;
+ LOG(3, "using block size from header: %zu", *bsize);
+
+ return 0;
+}
+
+/*
+ * blk_runtime_init -- (internal) initialize block memory pool runtime data
+ */
+static int
+blk_runtime_init(PMEMblkpool *pbp, size_t bsize, int rdonly)
+{
+ LOG(3, "pbp %p bsize %zu rdonly %d",
+ pbp, bsize, rdonly);
+
+ /* remove volatile part of header */
+ VALGRIND_REMOVE_PMEM_MAPPING(&pbp->addr,
+ sizeof(struct pmemblk) -
+ sizeof(struct pool_hdr) -
+ sizeof(pbp->bsize) -
+ sizeof(pbp->is_zeroed));
+
+ /*
+ * Use some of the memory pool area for run-time info. This
+ * run-time state is never loaded from the file, it is always
+ * created here, so no need to worry about byte-order.
+ */
+ pbp->rdonly = rdonly;
+ pbp->data = (char *)pbp->addr +
+ roundup(sizeof(*pbp), BLK_FORMAT_DATA_ALIGN);
+ ASSERT(((char *)pbp->addr + pbp->size) >= (char *)pbp->data);
+ pbp->datasize = (size_t)
+ (((char *)pbp->addr + pbp->size) - (char *)pbp->data);
+
+ LOG(4, "data area %p data size %zu bsize %zu",
+ pbp->data, pbp->datasize, bsize);
+
+ long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+ if (ncpus < 1)
+ ncpus = 1;
+
+ ns_cb.ns_is_zeroed = pbp->is_zeroed;
+
+ /* things freed by "goto err" if not NULL */
+ struct btt *bttp = NULL;
+ os_mutex_t *locks = NULL;
+
+ bttp = btt_init(pbp->datasize, (uint32_t)bsize, pbp->hdr.poolset_uuid,
+ (unsigned)ncpus * 2, pbp, &ns_cb);
+
+ if (bttp == NULL)
+ goto err; /* btt_init set errno, called LOG */
+
+ pbp->bttp = bttp;
+
+ pbp->nlane = btt_nlane(pbp->bttp);
+ pbp->next_lane = 0;
+ if ((locks = Malloc(pbp->nlane * sizeof(*locks))) == NULL) {
+ ERR("!Malloc for lane locks");
+ goto err;
+ }
+
+ for (unsigned i = 0; i < pbp->nlane; i++)
+ util_mutex_init(&locks[i]);
+
+ pbp->locks = locks;
+
+#ifdef DEBUG
+ /* initialize debug lock */
+ util_mutex_init(&pbp->write_lock);
+#endif
+
+ /*
+ * If possible, turn off all permissions on the pool header page.
+ *
+ * The prototype PMFS doesn't allow this when large pages are in
+ * use. It is not considered an error if this fails.
+ */
+ RANGE_NONE(pbp->addr, sizeof(struct pool_hdr), pbp->is_dev_dax);
+
+ /* the data area should be kept read-only for debug version */
+ RANGE_RO(pbp->data, pbp->datasize, pbp->is_dev_dax);
+
+ return 0;
+
+err:
+ LOG(4, "error clean up");
+ int oerrno = errno;
+ if (bttp)
+ btt_fini(bttp);
+ errno = oerrno;
+ return -1;
+}
+
+/*
+ * pmemblk_createU -- create a block memory pool
+ */
+#ifndef _WIN32
+static inline
+#endif
+PMEMblkpool *
+pmemblk_createU(const char *path, size_t bsize, size_t poolsize, mode_t mode)
+{
+ LOG(3, "path %s bsize %zu poolsize %zu mode %o",
+ path, bsize, poolsize, mode);
+
+ /* check if bsize is valid */
+ if (bsize == 0) {
+ ERR("Invalid block size %zu", bsize);
+ errno = EINVAL;
+ return NULL;
+ }
+
+ if (bsize > UINT32_MAX) {
+ ERR("Invalid block size %zu", bsize);
+ errno = EINVAL;
+ return NULL;
+ }
+
+ struct pool_set *set;
+ struct pool_attr adj_pool_attr = Blk_create_attr;
+
+ /* force set SDS feature */
+ if (SDS_at_create)
+ adj_pool_attr.features.incompat |= POOL_FEAT_SDS;
+ else
+ adj_pool_attr.features.incompat &= ~POOL_FEAT_SDS;
+
+ if (util_pool_create(&set, path, poolsize, PMEMBLK_MIN_POOL,
+ PMEMBLK_MIN_PART, &adj_pool_attr, NULL,
+ REPLICAS_DISABLED) != 0) {
+ LOG(2, "cannot create pool or pool set");
+ return NULL;
+ }
+
+ ASSERT(set->nreplicas > 0);
+
+ struct pool_replica *rep = set->replica[0];
+ PMEMblkpool *pbp = rep->part[0].addr;
+
+ VALGRIND_REMOVE_PMEM_MAPPING(&pbp->addr,
+ sizeof(struct pmemblk) -
+ ((uintptr_t)&pbp->addr - (uintptr_t)&pbp->hdr));
+
+ pbp->addr = pbp;
+ pbp->size = rep->repsize;
+ pbp->set = set;
+ pbp->is_pmem = rep->is_pmem;
+ pbp->is_dev_dax = rep->part[0].is_dev_dax;
+
+ /* is_dev_dax implies is_pmem */
+ ASSERT(!pbp->is_dev_dax || pbp->is_pmem);
+
+ /* create pool descriptor */
+ blk_descr_create(pbp, (uint32_t)bsize, set->zeroed);
+
+ /* initialize runtime parts */
+ if (blk_runtime_init(pbp, bsize, 0) != 0) {
+ ERR("pool initialization failed");
+ goto err;
+ }
+
+ if (util_poolset_chmod(set, mode))
+ goto err;
+
+ util_poolset_fdclose(set);
+
+ LOG(3, "pbp %p", pbp);
+ return pbp;
+
+err:
+ LOG(4, "error clean up");
+ int oerrno = errno;
+ util_poolset_close(set, DELETE_CREATED_PARTS);
+ errno = oerrno;
+ return NULL;
+}
+
+#ifndef _WIN32
+/*
+ * pmemblk_create -- create a block memory pool
+ */
+PMEMblkpool *
+pmemblk_create(const char *path, size_t bsize, size_t poolsize, mode_t mode)
+{
+ return pmemblk_createU(path, bsize, poolsize, mode);
+}
+#else
+/*
+ * pmemblk_createW -- create a block memory pool
+ */
+PMEMblkpool *
+pmemblk_createW(const wchar_t *path, size_t bsize, size_t poolsize,
+ mode_t mode)
+{
+ char *upath = util_toUTF8(path);
+ if (upath == NULL)
+ return NULL;
+
+ PMEMblkpool *ret = pmemblk_createU(upath, bsize, poolsize, mode);
+
+ util_free_UTF8(upath);
+ return ret;
+}
+#endif
+
+/*
+ * blk_open_common -- (internal) open a block memory pool
+ *
+ * This routine does all the work, but takes a cow flag so internal
+ * calls can map a read-only pool if required.
+ *
+ * Passing in bsize == 0 means a valid pool header must exist (which
+ * will supply the block size).
+ */
+static PMEMblkpool *
+blk_open_common(const char *path, size_t bsize, unsigned flags)
+{
+ LOG(3, "path %s bsize %zu flags 0x%x", path, bsize, flags);
+
+ struct pool_set *set;
+
+ if (util_pool_open(&set, path, PMEMBLK_MIN_PART, &Blk_open_attr,
+ NULL, NULL, flags) != 0) {
+ LOG(2, "cannot open pool or pool set");
+ return NULL;
+ }
+
+ ASSERT(set->nreplicas > 0);
+
+ struct pool_replica *rep = set->replica[0];
+ PMEMblkpool *pbp = rep->part[0].addr;
+
+ VALGRIND_REMOVE_PMEM_MAPPING(&pbp->addr,
+ sizeof(struct pmemblk) -
+ ((uintptr_t)&pbp->addr - (uintptr_t)&pbp->hdr));
+
+ pbp->addr = pbp;
+ pbp->size = rep->repsize;
+ pbp->set = set;
+ pbp->is_pmem = rep->is_pmem;
+ pbp->is_dev_dax = rep->part[0].is_dev_dax;
+
+ /* is_dev_dax implies is_pmem */
+ ASSERT(!pbp->is_dev_dax || pbp->is_pmem);
+
+ if (set->nreplicas > 1) {
+ errno = ENOTSUP;
+ ERR("!replicas not supported");
+ goto err;
+ }
+
+ /* validate pool descriptor */
+ if (blk_descr_check(pbp, &bsize) != 0) {
+ LOG(2, "descriptor check failed");
+ goto err;
+ }
+
+ /* initialize runtime parts */
+ if (blk_runtime_init(pbp, bsize, set->rdonly) != 0) {
+ ERR("pool initialization failed");
+ goto err;
+ }
+
+ util_poolset_fdclose(set);
+
+ LOG(3, "pbp %p", pbp);
+ return pbp;
+
+err:
+ LOG(4, "error clean up");
+ int oerrno = errno;
+ util_poolset_close(set, DO_NOT_DELETE_PARTS);
+ errno = oerrno;
+ return NULL;
+}
+
+/*
+ * pmemblk_openU -- open a block memory pool
+ */
+#ifndef _WIN32
+static inline
+#endif
+PMEMblkpool *
+pmemblk_openU(const char *path, size_t bsize)
+{
+ LOG(3, "path %s bsize %zu", path, bsize);
+
+ return blk_open_common(path, bsize, COW_at_open ? POOL_OPEN_COW : 0);
+}
+
+#ifndef _WIN32
+/*
+ * pmemblk_open -- open a block memory pool
+ */
+PMEMblkpool *
+pmemblk_open(const char *path, size_t bsize)
+{
+ return pmemblk_openU(path, bsize);
+}
+#else
+/*
+ * pmemblk_openW -- open a block memory pool
+ */
+PMEMblkpool *
+pmemblk_openW(const wchar_t *path, size_t bsize)
+{
+ char *upath = util_toUTF8(path);
+ if (upath == NULL)
+ return NULL;
+
+ PMEMblkpool *ret = pmemblk_openU(upath, bsize);
+
+ util_free_UTF8(upath);
+ return ret;
+}
+#endif
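+
+/*
+ * A minimal usage example of the entry points above (an illustrative
+ * sketch; the pool path and block size are placeholders):
+ *
+ *	PMEMblkpool *pbp = pmemblk_create("/pmem/blkfile", 4096,
+ *			PMEMBLK_MIN_POOL, 0666);
+ *	if (pbp == NULL)
+ *		pbp = pmemblk_open("/pmem/blkfile", 4096);
+ *
+ *	char buf[4096] = "hello, world";
+ *	if (pmemblk_write(pbp, buf, 0) < 0)	/* atomic block write */
+ *		perror("pmemblk_write");
+ *	if (pmemblk_read(pbp, buf, 0) < 0)	/* read it back */
+ *		perror("pmemblk_read");
+ *
+ *	pmemblk_close(pbp);
+ */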
+
+/*
+ * pmemblk_close -- close a block memory pool
+ */
+void
+pmemblk_close(PMEMblkpool *pbp)
+{
+ LOG(3, "pbp %p", pbp);
+
+ btt_fini(pbp->bttp);
+ if (pbp->locks) {
+ for (unsigned i = 0; i < pbp->nlane; i++)
+ util_mutex_destroy(&pbp->locks[i]);
+ Free((void *)pbp->locks);
+ }
+
+#ifdef DEBUG
+ /* destroy debug lock */
+ util_mutex_destroy(&pbp->write_lock);
+#endif
+
+ util_poolset_close(pbp->set, DO_NOT_DELETE_PARTS);
+}
+
+/*
+ * pmemblk_bsize -- return size of block for specified pool
+ */
+size_t
+pmemblk_bsize(PMEMblkpool *pbp)
+{
+ LOG(3, "pbp %p", pbp);
+
+ return le32toh(pbp->bsize);
+}
+
+/*
+ * pmemblk_nblock -- return number of usable blocks in a block memory pool
+ */
+size_t
+pmemblk_nblock(PMEMblkpool *pbp)
+{
+ LOG(3, "pbp %p", pbp);
+
+ return btt_nlba(pbp->bttp);
+}
+
+/*
+ * pmemblk_read -- read a block in a block memory pool
+ */
+int
+pmemblk_read(PMEMblkpool *pbp, void *buf, long long blockno)
+{
+ LOG(3, "pbp %p buf %p blockno %lld", pbp, buf, blockno);
+
+ if (blockno < 0) {
+ ERR("negative block number");
+ errno = EINVAL;
+ return -1;
+ }
+
+ unsigned lane;
+
+ lane_enter(pbp, &lane);
+
+ int err = btt_read(pbp->bttp, lane, (uint64_t)blockno, buf);
+
+ lane_exit(pbp, lane);
+
+ return err;
+}
+
+/*
+ * pmemblk_write -- write a block (atomically) in a block memory pool
+ */
+int
+pmemblk_write(PMEMblkpool *pbp, const void *buf, long long blockno)
+{
+ LOG(3, "pbp %p buf %p blockno %lld", pbp, buf, blockno);
+
+ if (pbp->rdonly) {
+ ERR("EROFS (pool is read-only)");
+ errno = EROFS;
+ return -1;
+ }
+
+ if (blockno < 0) {
+ ERR("negative block number");
+ errno = EINVAL;
+ return -1;
+ }
+
+ unsigned lane;
+
+ lane_enter(pbp, &lane);
+
+ int err = btt_write(pbp->bttp, lane, (uint64_t)blockno, buf);
+
+ lane_exit(pbp, lane);
+
+ return err;
+}
+
+/*
+ * pmemblk_set_zero -- zero a block in a block memory pool
+ */
+int
+pmemblk_set_zero(PMEMblkpool *pbp, long long blockno)
+{
+ LOG(3, "pbp %p blockno %lld", pbp, blockno);
+
+ if (pbp->rdonly) {
+ ERR("EROFS (pool is read-only)");
+ errno = EROFS;
+ return -1;
+ }
+
+ if (blockno < 0) {
+ ERR("negative block number");
+ errno = EINVAL;
+ return -1;
+ }
+
+ unsigned lane;
+
+ lane_enter(pbp, &lane);
+
+ int err = btt_set_zero(pbp->bttp, lane, (uint64_t)blockno);
+
+ lane_exit(pbp, lane);
+
+ return err;
+}
+
+/*
+ * pmemblk_set_error -- set the error state on a block in a block memory pool
+ */
+int
+pmemblk_set_error(PMEMblkpool *pbp, long long blockno)
+{
+ LOG(3, "pbp %p blockno %lld", pbp, blockno);
+
+ if (pbp->rdonly) {
+ ERR("EROFS (pool is read-only)");
+ errno = EROFS;
+ return -1;
+ }
+
+ if (blockno < 0) {
+ ERR("negative block number");
+ errno = EINVAL;
+ return -1;
+ }
+
+ unsigned lane;
+
+ lane_enter(pbp, &lane);
+
+ int err = btt_set_error(pbp->bttp, lane, (uint64_t)blockno);
+
+ lane_exit(pbp, lane);
+
+ return err;
+}
+
+/*
+ * pmemblk_checkU -- block memory pool consistency check
+ */
+#ifndef _WIN32
+static inline
+#endif
+int
+pmemblk_checkU(const char *path, size_t bsize)
+{
+ LOG(3, "path \"%s\" bsize %zu", path, bsize);
+
+ /* map the pool read-only */
+ PMEMblkpool *pbp = blk_open_common(path, bsize, POOL_OPEN_COW);
+ if (pbp == NULL)
+ return -1; /* errno set by blk_open_common() */
+
+ int retval = btt_check(pbp->bttp);
+ int oerrno = errno;
+ pmemblk_close(pbp);
+ errno = oerrno;
+
+ return retval;
+}
+
+#ifndef _WIN32
+/*
+ * pmemblk_check -- block memory pool consistency check
+ */
+int
+pmemblk_check(const char *path, size_t bsize)
+{
+ return pmemblk_checkU(path, bsize);
+}
+#else
+/*
+ * pmemblk_checkW -- block memory pool consistency check
+ */
+int
+pmemblk_checkW(const wchar_t *path, size_t bsize)
+{
+ char *upath = util_toUTF8(path);
+ if (upath == NULL)
+ return -1;
+
+ int ret = pmemblk_checkU(upath, bsize);
+
+ util_free_UTF8(upath);
+ return ret;
+}
+#endif
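+
+/*
+ * Usage note: per the documented pmemblk_check() contract, these
+ * entry points return 1 if the pool is found consistent, 0 if
+ * inconsistencies are detected, and -1 with errno set if the check
+ * cannot be performed.
+ */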
+
+/*
+ * pmemblk_ctl_getU -- programmatically executes a read ctl query
+ */
+#ifndef _WIN32
+static inline
+#endif
+int
+pmemblk_ctl_getU(PMEMblkpool *pbp, const char *name, void *arg)
+{
+ LOG(3, "pbp %p name %s arg %p", pbp, name, arg);
+ return ctl_query(pbp == NULL ? NULL : pbp->ctl, pbp,
+ CTL_QUERY_PROGRAMMATIC, name, CTL_QUERY_READ, arg);
+}
+
+/*
+ * pmemblk_ctl_setU -- programmatically executes a write ctl query
+ */
+#ifndef _WIN32
+static inline
+#endif
+int
+pmemblk_ctl_setU(PMEMblkpool *pbp, const char *name, void *arg)
+{
+ LOG(3, "pbp %p name %s arg %p", pbp, name, arg);
+ return ctl_query(pbp == NULL ? NULL : pbp->ctl, pbp,
+ CTL_QUERY_PROGRAMMATIC, name, CTL_QUERY_WRITE, arg);
+}
+
+/*
+ * pmemblk_ctl_execU -- programmatically executes a runnable ctl query
+ */
+#ifndef _WIN32
+static inline
+#endif
+int
+pmemblk_ctl_execU(PMEMblkpool *pbp, const char *name, void *arg)
+{
+ LOG(3, "pbp %p name %s arg %p", pbp, name, arg);
+ return ctl_query(pbp == NULL ? NULL : pbp->ctl, pbp,
+ CTL_QUERY_PROGRAMMATIC, name, CTL_QUERY_RUNNABLE, arg);
+}
+
+#ifndef _WIN32
+/*
+ * pmemblk_ctl_get -- programmatically executes a read ctl query
+ */
+int
+pmemblk_ctl_get(PMEMblkpool *pbp, const char *name, void *arg)
+{
+ return pmemblk_ctl_getU(pbp, name, arg);
+}
+
+/*
+ * pmemblk_ctl_set -- programmatically executes a write ctl query
+ */
+int
+pmemblk_ctl_set(PMEMblkpool *pbp, const char *name, void *arg)
+{
+ return pmemblk_ctl_setU(pbp, name, arg);
+}
+
+/*
+ * pmemblk_ctl_exec -- programmatically executes a runnable ctl query
+ */
+int
+pmemblk_ctl_exec(PMEMblkpool *pbp, const char *name, void *arg)
+{
+ return pmemblk_ctl_execU(pbp, name, arg);
+}
+#else
+/*
+ * pmemblk_ctl_getW -- programmatically executes a read ctl query
+ */
+int
+pmemblk_ctl_getW(PMEMblkpool *pbp, const wchar_t *name, void *arg)
+{
+ char *uname = util_toUTF8(name);
+ if (uname == NULL)
+ return -1;
+
+ int ret = pmemblk_ctl_getU(pbp, uname, arg);
+ util_free_UTF8(uname);
+
+ return ret;
+}
+
+/*
+ * pmemblk_ctl_setW -- programmatically executes a write ctl query
+ */
+int
+pmemblk_ctl_setW(PMEMblkpool *pbp, const wchar_t *name, void *arg)
+{
+ char *uname = util_toUTF8(name);
+ if (uname == NULL)
+ return -1;
+
+ int ret = pmemblk_ctl_setU(pbp, uname, arg);
+ util_free_UTF8(uname);
+
+ return ret;
+}
+
+/*
+ * pmemblk_ctl_execW -- programmatically executes a runnable ctl query
+ */
+int
+pmemblk_ctl_execW(PMEMblkpool *pbp, const wchar_t *name, void *arg)
+{
+ char *uname = util_toUTF8(name);
+ if (uname == NULL)
+ return -1;
+
+ int ret = pmemblk_ctl_execU(pbp, uname, arg);
+ util_free_UTF8(uname);
+
+ return ret;
+}
+#endif
+
+#if FAULT_INJECTION
+void
+pmemblk_inject_fault_at(enum pmem_allocation_type type, int nth,
+ const char *at)
+{
+ core_inject_fault_at(type, nth, at);
+}
+
+int
+pmemblk_fault_injection_enabled(void)
+{
+ return core_fault_injection_enabled();
+}
+#endif
diff --git a/src/pmdk/src/libpmemblk/blk.h b/src/pmdk/src/libpmemblk/blk.h
new file mode 100644
index 000000000..095331b8b
--- /dev/null
+++ b/src/pmdk/src/libpmemblk/blk.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2014-2020, Intel Corporation */
+
+/*
+ * blk.h -- internal definitions for the libpmemblk blk module
+ */
+
+#ifndef BLK_H
+#define BLK_H 1
+
+#include <stddef.h>
+
+#include "ctl.h"
+#include "os_thread.h"
+#include "pool_hdr.h"
+#include "page_size.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "alloc.h"
+#include "fault_injection.h"
+
+#define PMEMBLK_LOG_PREFIX "libpmemblk"
+#define PMEMBLK_LOG_LEVEL_VAR "PMEMBLK_LOG_LEVEL"
+#define PMEMBLK_LOG_FILE_VAR "PMEMBLK_LOG_FILE"
+
+/* attributes of the blk memory pool format for the pool header */
+#define BLK_HDR_SIG "PMEMBLK" /* must be 8 bytes including '\0' */
+#define BLK_FORMAT_MAJOR 1
+
+#define BLK_FORMAT_FEAT_DEFAULT \
+ {POOL_FEAT_COMPAT_DEFAULT, POOL_FEAT_INCOMPAT_DEFAULT, 0x0000}
+
+#define BLK_FORMAT_FEAT_CHECK \
+ {POOL_FEAT_COMPAT_VALID, POOL_FEAT_INCOMPAT_VALID, 0x0000}
+
+static const features_t blk_format_feat_default = BLK_FORMAT_FEAT_DEFAULT;
+
+struct pmemblk {
+ struct pool_hdr hdr; /* memory pool header */
+
+ /* root info for on-media format... */
+ uint32_t bsize; /* block size */
+
+ /* flag indicating if the pool was zero-initialized */
+ int is_zeroed;
+
+ /* some run-time state, allocated out of memory pool... */
+ void *addr; /* mapped region */
+ size_t size; /* size of mapped region */
+ int is_pmem; /* true if pool is PMEM */
+ int rdonly; /* true if pool is opened read-only */
+ void *data; /* post-header data area */
+ size_t datasize; /* size of data area */
+ size_t nlba; /* number of LBAs in pool */
+ struct btt *bttp; /* btt handle */
+ unsigned nlane; /* number of lanes */
+ unsigned next_lane; /* used to rotate through lanes */
+ os_mutex_t *locks; /* one per lane */
+ int is_dev_dax; /* true if mapped on device dax */
+ struct ctl *ctl; /* top level node of the ctl tree structure */
+
+ struct pool_set *set; /* pool set info */
+
+#ifdef DEBUG
+ /* held during read/write mprotected sections */
+ os_mutex_t write_lock;
+#endif
+};
+
+/* data area starts at this alignment after the struct pmemblk above */
+#define BLK_FORMAT_DATA_ALIGN ((uintptr_t)PMEM_PAGESIZE)
+
+#if FAULT_INJECTION
+void
+pmemblk_inject_fault_at(enum pmem_allocation_type type, int nth,
+ const char *at);
+
+int
+pmemblk_fault_injection_enabled(void);
+#else
+static inline void
+pmemblk_inject_fault_at(enum pmem_allocation_type type, int nth,
+ const char *at)
+{
+ abort();
+}
+
+static inline int
+pmemblk_fault_injection_enabled(void)
+{
+ return 0;
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/pmdk/src/libpmemblk/btt.c b/src/pmdk/src/libpmemblk/btt.c
new file mode 100644
index 000000000..f59389f21
--- /dev/null
+++ b/src/pmdk/src/libpmemblk/btt.c
@@ -0,0 +1,2051 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/* Copyright 2014-2019, Intel Corporation */
+
+/*
+ * btt.c -- block translation table providing atomic block updates
+ *
+ * This is a user-space implementation of the BTT mechanism providing
+ * single block powerfail write atomicity, as described by:
+ * The NVDIMM Namespace Specification
+ *
+ * To use this module, the caller must provide five routines for
+ * accessing the namespace containing the data (in this context,
+ * "namespace" refers to the storage containing the BTT layout, such
+ * as a file). All namespace I/O is done by these callbacks:
+ *
+ * nsread Read count bytes from namespace at offset off
+ * nswrite Write count bytes to namespace at offset off
+ * nszero Zero count bytes in namespace at offset off
+ * nsmap Return direct access to a range of a namespace
+ * nssync Flush changes made to an nsmap'd range
+ *
+ * Data written by the nswrite callback is flushed out to the media
+ * (made durable) when the call returns. Data written directly via
+ * the nsmap callback must be flushed explicitly using nssync.
+ *
+ * The caller passes these callbacks, along with information such as
+ * namespace size and UUID to btt_init() and gets back an opaque handle
+ * which is then used with the rest of the entry points.
+ *
+ * Here is a brief list of the entry points to this module:
+ *
+ * btt_nlane Returns number of concurrent threads allowed
+ *
+ * btt_nlba Returns the usable size, as a count of LBAs
+ *
+ * btt_read Reads a single block at a given LBA
+ *
+ * btt_write Writes a single block (atomically) at a given LBA
+ *
+ * btt_set_zero Sets a block to read back as zeros
+ *
+ * btt_set_error Sets a block to return error on read
+ *
+ * btt_check Checks the BTT metadata for consistency
+ *
+ * btt_fini Frees run-time state, done using namespace
+ *
+ * If the caller is multi-threaded, it must only allow btt_nlane() threads
+ * to enter this module at a time, each assigned a unique "lane" number
+ * between 0 and btt_nlane() - 1.
+ *
+ * There are a number of static routines defined in this module. Here's
+ * a brief overview of the most important routines:
+ *
+ * read_layout Checks for valid BTT layout and builds run-time state.
+ * A number of helper functions are used by read_layout
+ * to handle various parts of the metadata:
+ * read_info
+ * read_arenas
+ * read_arena
+ * read_flogs
+ * read_flog_pair
+ *
+ * write_layout Generates a new BTT layout when one doesn't exist.
+ * Once a new layout is written, write_layout uses
+ * the same helper functions above to construct the
+ * run-time state.
+ *
+ * invalid_lba Range check done by each entry point that takes
+ * an LBA.
+ *
+ * lba_to_arena_lba
+ * Find the arena and LBA in that arena for a given
+ * external LBA. This is the heart of the arena
+ * range matching logic.
+ *
+ * flog_update Update the BTT free list/log combined data structure
+ * (known as the "flog"). This is the heart of the
+ * logic that makes writes powerfail atomic.
+ *
+ * map_lock These routines provide atomic access to the BTT map
+ * map_unlock data structure in an area.
+ * map_abort
+ *
+ * map_entry_setf Common code for btt_set_zero() and btt_set_error().
+ *
+ * zero_block Generate a block of all zeros (instead of actually
+ * doing a read), when the metadata indicates the
+ * block should read as zeros.
+ *
+ * build_rtt These routines construct the run-time tracking
+ * build_map_locks data structures used during I/O.
+ */
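+
+/*
+ * For example (an illustrative sketch mirroring the libpmemblk
+ * caller), the five callbacks are wired up through a struct
+ * ns_callback and handed to btt_init():
+ *
+ *	static struct ns_callback ns_cb = {
+ *		.nsread = nsread, .nswrite = nswrite, .nszero = nszero,
+ *		.nsmap = nsmap, .nssync = nssync, .ns_is_zeroed = 0,
+ *	};
+ *	struct btt *bttp = btt_init(rawsize, lbasize, parent_uuid,
+ *			maxlane, ns, &ns_cb);
+ */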
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <sys/param.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdint.h>
+#include <endian.h>
+
+#include "out.h"
+#include "uuid.h"
+#include "btt.h"
+#include "btt_layout.h"
+#include "sys_util.h"
+#include "util.h"
+#include "alloc.h"
+
+/*
+ * The opaque btt handle containing state tracked by this module
+ * for the btt namespace. This is created by btt_init(), handed to
+ * all the other btt_* entry points, and deleted by btt_fini().
+ */
+struct btt {
+ unsigned nlane; /* number of concurrent threads allowed per btt */
+
+ /*
+ * The laidout flag indicates whether the namespace contains valid BTT
+ * metadata. It is initialized by read_layout() and if no valid layout
+ * is found, all reads return zeros and the first write will write the
+ * BTT layout. The layout_write_mutex protects the laidout flag so
+ * only one writing thread ends up writing the initial metadata by
+ * calling write_layout().
+ */
+ os_mutex_t layout_write_mutex;
+ int laidout;
+
+ /*
+ * UUID of the BTT
+ */
+ uint8_t uuid[BTTINFO_UUID_LEN];
+
+ /*
+ * UUID of the containing namespace, used to validate BTT metadata.
+ */
+ uint8_t parent_uuid[BTTINFO_UUID_LEN];
+
+ /*
+ * Parameters controlling/describing the BTT layout.
+ */
+ uint64_t rawsize; /* size of containing namespace */
+ uint32_t lbasize; /* external LBA size */
+ uint32_t nfree; /* available flog entries */
+ uint64_t nlba; /* total number of external LBAs */
+ unsigned narena; /* number of arenas */
+
+ /* run-time state kept for each arena */
+ struct arena {
+ uint32_t flags; /* arena flags (btt_info) */
+ uint32_t external_nlba; /* LBAs that live in this arena */
+ uint32_t internal_lbasize;
+ uint32_t internal_nlba;
+
+ /*
+ * The following offsets are relative to the beginning of
+ * the encapsulating namespace. This is different from
+ * how these offsets are stored on-media, where they are
+ * relative to the start of the arena. The offsets are
+ * converted by read_layout() to make them more convenient
+ * for run-time use.
+ */
+ uint64_t startoff; /* offset to start of arena */
+ uint64_t dataoff; /* offset to arena data area */
+ uint64_t mapoff; /* offset to area map */
+ uint64_t flogoff; /* offset to area flog */
+ uint64_t nextoff; /* offset to next arena */
+
+ /*
+ * Run-time flog state. Indexed by lane.
+ *
+ * The write path uses the flog to find the free block
+ * it writes to before atomically making it the new
+ * active block for an external LBA.
+ *
+ * The read path doesn't use the flog at all.
+ */
+ struct flog_runtime {
+ struct btt_flog flog; /* current info */
+ uint64_t entries[2]; /* offsets for flog pair */
+ int next; /* next write (0 or 1) */
+ } *flogs;
+
+ /*
+ * Read tracking table. Indexed by lane.
+ *
+ * Before using a free block found in the flog, the write path
+ * scans the rtt to see if there are any outstanding reads on
+ * that block (reads that started before the block was freed by
+ * a concurrent write). Unused slots in the rtt are indicated
+ * by setting the error bit, BTT_MAP_ENTRY_ERROR, so that the
+ * entry won't match any post-map LBA when checked.
+ */
+ uint32_t volatile *rtt;
+
+ /*
+ * Map locking. Indexed by pre-map LBA modulo nlane.
+ */
+ os_mutex_t *map_locks;
+
+ /*
+ * Arena info block locking.
+ */
+ os_mutex_t info_lock;
+ } *arenas;
+
+ /*
+ * Callbacks for doing I/O to namespace. These are provided by
+ * the code calling the BTT module, which passes them in to
+ * btt_init(). All namespace I/O is done using these.
+ *
+ * The opaque namespace handle "ns" was provided by the code calling
+ * the BTT module and is passed to each callback to identify the
+ * namespace being accessed.
+ */
+ void *ns;
+ const struct ns_callback *ns_cbp;
+};
+
+/*
+ * Signature for arena info blocks. Total size is 16 bytes, including
+ * the '\0' added to the string by the declaration (the last two bytes
+ * of the string are '\0').
+ */
+static const char Sig[] = BTTINFO_SIG;
+
+/*
+ * Zeroed out flog entry, used when initializing the flog.
+ */
+static const struct btt_flog Zflog;
+
+/*
+ * Lookup table and macro for looking up sequence numbers. These are
+ * the 2-bit numbers that cycle between 01, 10, and 11.
+ *
+ * To advance a sequence number to the next number, use something like:
+ * seq = NSEQ(seq);
+ */
+static const unsigned Nseq[] = { 0, 2, 3, 1 };
+#define NSEQ(seq) (Nseq[(seq) & 3])
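+
+/*
+ * For example, starting from seq == 1 the cycle is NSEQ(1) == 2,
+ * NSEQ(2) == 3, NSEQ(3) == 1.  The value 0 never enters the cycle
+ * (NSEQ(0) == 0); it marks a flog entry that has not been written.
+ */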
+
+/*
+ * get_map_lock_num -- (internal) Calculate offset into map_locks[]
+ *
+ * map_locks[] contains nfree locks which are used to protect the map
+ * from concurrent access to the same cache line. The index into
+ * map_locks[] is calculated by looking at the byte offset into the map
+ * (premap_lba * BTT_MAP_ENTRY_SIZE), figuring out how many cache lines
+ * that is into the map (dividing by BTT_MAP_LOCK_ALIGN), and
+ * then selecting one of nfree locks (the modulo at the end).
+ *
+ * The extra cast is to keep gcc from generating a false positive
+ * 64-32 bit conversion error when -fsanitize is set.
+ */
+static inline uint32_t
+get_map_lock_num(uint32_t premap_lba, uint32_t nfree)
+{
+ return (uint32_t)(premap_lba * BTT_MAP_ENTRY_SIZE / BTT_MAP_LOCK_ALIGN)
+ % nfree;
+}
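+
+/*
+ * Worked example (constants assumed from btt_layout.h): with
+ * BTT_MAP_ENTRY_SIZE == 4 and BTT_MAP_LOCK_ALIGN == 64, sixteen
+ * consecutive premap LBAs share a cache line, so LBAs 0..15 use
+ * map_locks[0], LBAs 16..31 use map_locks[1], and so on, wrapping
+ * around after nfree locks.
+ */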
+
+/*
+ * invalid_lba -- (internal) set errno and return true if lba is invalid
+ *
+ * This function is used at the top of the entry points where an external
+ * LBA is provided, like this:
+ *
+ * if (invalid_lba(bttp, lba))
+ * return -1;
+ */
+static int
+invalid_lba(struct btt *bttp, uint64_t lba)
+{
+ LOG(3, "bttp %p lba %" PRIu64, bttp, lba);
+
+ if (lba >= bttp->nlba) {
+ ERR("lba out of range (nlba %" PRIu64 ")", bttp->nlba);
+ errno = EINVAL;
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * read_info -- (internal) convert btt_info to host byte order & validate
+ *
+ * Returns true if info block is valid, and all the integer fields are
+ * converted to host byte order. If the info block is not valid, this
+ * routine returns false and the info block passed in is left in an
+ * unknown state.
+ */
+static int
+read_info(struct btt *bttp, struct btt_info *infop)
+{
+ LOG(3, "infop %p", infop);
+
+ if (memcmp(infop->sig, Sig, BTTINFO_SIG_LEN)) {
+ LOG(3, "signature invalid");
+ return 0;
+ }
+
+ if (memcmp(infop->parent_uuid, bttp->parent_uuid, BTTINFO_UUID_LEN)) {
+ LOG(3, "parent UUID mismatch");
+ return 0;
+ }
+
+ /* to be valid, the fields must checksum correctly */
+ if (!util_checksum(infop, sizeof(*infop), &infop->checksum, 0, 0)) {
+ LOG(3, "invalid checksum");
+ return 0;
+ }
+
+ /* to be valid, info block must have a major version of at least 1 */
+ if ((infop->major = le16toh(infop->major)) == 0) {
+ LOG(3, "invalid major version (0)");
+ return 0;
+ }
+
+ infop->flags = le32toh(infop->flags);
+ infop->minor = le16toh(infop->minor);
+ infop->external_lbasize = le32toh(infop->external_lbasize);
+ infop->external_nlba = le32toh(infop->external_nlba);
+ infop->internal_lbasize = le32toh(infop->internal_lbasize);
+ infop->internal_nlba = le32toh(infop->internal_nlba);
+ infop->nfree = le32toh(infop->nfree);
+ infop->infosize = le32toh(infop->infosize);
+ infop->nextoff = le64toh(infop->nextoff);
+ infop->dataoff = le64toh(infop->dataoff);
+ infop->mapoff = le64toh(infop->mapoff);
+ infop->flogoff = le64toh(infop->flogoff);
+ infop->infooff = le64toh(infop->infooff);
+
+ return 1;
+}
+
+/*
+ * map_entry_is_zero -- (internal) checks if map_entry is in zero state
+ */
+static inline int
+map_entry_is_zero(uint32_t map_entry)
+{
+ return (map_entry & ~BTT_MAP_ENTRY_LBA_MASK) == BTT_MAP_ENTRY_ZERO;
+}
+
+/*
+ * map_entry_is_error -- (internal) checks if map_entry is in error state
+ */
+static inline int
+map_entry_is_error(uint32_t map_entry)
+{
+ return (map_entry & ~BTT_MAP_ENTRY_LBA_MASK) == BTT_MAP_ENTRY_ERROR;
+}
+
+/*
+ * map_entry_is_initial -- checks if map_entry is in initial state
+ */
+int
+map_entry_is_initial(uint32_t map_entry)
+{
+ return (map_entry & ~BTT_MAP_ENTRY_LBA_MASK) == 0;
+}
+
+/*
+ * map_entry_is_zero_or_initial -- (internal) checks if map_entry is in initial
+ * or zero state
+ */
+static inline int
+map_entry_is_zero_or_initial(uint32_t map_entry)
+{
+ uint32_t entry_flags = map_entry & ~BTT_MAP_ENTRY_LBA_MASK;
+ return entry_flags == 0 || entry_flags == BTT_MAP_ENTRY_ZERO;
+}
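+
+/*
+ * Assuming the btt_layout.h encoding (two state bits above a 30-bit
+ * post-map LBA), the four map entry states are:
+ *
+ *	initial:	flags == 0 (block never written, reads as zeros)
+ *	zero:		BTT_MAP_ENTRY_ZERO | lba
+ *	error:		BTT_MAP_ENTRY_ERROR | lba
+ *	normal:		BTT_MAP_ENTRY_NORMAL | lba (both state bits set)
+ */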
+
+/*
+ * btt_flog_get_valid -- return valid and current flog entry
+ */
+struct btt_flog *
+btt_flog_get_valid(struct btt_flog *flog_pair, int *next)
+{
+ /*
+ * Interesting cases:
+ * - no valid seq numbers: layout consistency error
+ * - one valid seq number: that's the current entry
+ * - two valid seq numbers: higher number is current entry
+ * - identical seq numbers: layout consistency error
+ */
+ if (flog_pair[0].seq == flog_pair[1].seq) {
+ return NULL;
+ } else if (flog_pair[0].seq == 0) {
+ /* singleton valid flog at flog_pair[1] */
+ *next = 0;
+ return &flog_pair[1];
+ } else if (flog_pair[1].seq == 0) {
+ /* singleton valid flog at flog_pair[0] */
+ *next = 1;
+ return &flog_pair[0];
+ } else if (NSEQ(flog_pair[0].seq) == flog_pair[1].seq) {
+ /* flog_pair[1] has the later sequence number */
+ *next = 0;
+ return &flog_pair[1];
+ } else {
+ /* flog_pair[0] has the later sequence number */
+ *next = 1;
+ return &flog_pair[0];
+ }
+}
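+
+/*
+ * For example, a pair with sequence numbers (2, 3) satisfies
+ * NSEQ(2) == 3, so flog_pair[1] is the current entry and slot 0
+ * is the next write target (*next == 0).
+ */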
+
+/*
+ * read_flog_pair -- (internal) load up a single flog pair
+ *
+ * Zero is returned on success, otherwise -1/errno.
+ */
+static int
+read_flog_pair(struct btt *bttp, unsigned lane, struct arena *arenap,
+ uint64_t flog_off, struct flog_runtime *flog_runtimep, uint32_t flognum)
+{
+ LOG(5, "bttp %p lane %u arenap %p flog_off %" PRIu64 " runtimep %p "
+ "flognum %u", bttp, lane, arenap, flog_off, flog_runtimep,
+ flognum);
+
+ flog_runtimep->entries[0] = flog_off;
+ flog_runtimep->entries[1] = flog_off + sizeof(struct btt_flog);
+
+ if (lane >= bttp->nfree) {
+ ERR("invalid lane %u among nfree %d", lane, bttp->nfree);
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (flog_off == 0) {
+ ERR("invalid flog offset %" PRIu64, flog_off);
+ errno = EINVAL;
+ return -1;
+ }
+
+ struct btt_flog flog_pair[2];
+ if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, flog_pair,
+ sizeof(flog_pair), flog_off) < 0)
+ return -1;
+
+ btt_flog_convert2h(&flog_pair[0]);
+ if (invalid_lba(bttp, flog_pair[0].lba))
+ return -1;
+
+ btt_flog_convert2h(&flog_pair[1]);
+ if (invalid_lba(bttp, flog_pair[1].lba))
+ return -1;
+
+ LOG(6, "flog_pair[0] flog_off %" PRIu64 " old_map %u new_map %u seq %u",
+ flog_off, flog_pair[0].old_map,
+ flog_pair[0].new_map, flog_pair[0].seq);
+ LOG(6, "flog_pair[1] old_map %u new_map %u seq %u",
+ flog_pair[1].old_map, flog_pair[1].new_map,
+ flog_pair[1].seq);
+
+ struct btt_flog *currentp = btt_flog_get_valid(flog_pair,
+ &flog_runtimep->next);
+
+ if (currentp == NULL) {
+ ERR("flog layout error: bad seq numbers %d %d",
+ flog_pair[0].seq, flog_pair[1].seq);
+ arenap->flags |= BTTINFO_FLAG_ERROR;
+ return 0;
+ }
+
+ LOG(6, "run-time flog next is %d", flog_runtimep->next);
+
+ /* copy current flog into run-time flog state */
+ flog_runtimep->flog = *currentp;
+
+ LOG(9, "read flog[%u]: lba %u old %u%s%s%s new %u%s%s%s", flognum,
+ currentp->lba,
+ currentp->old_map & BTT_MAP_ENTRY_LBA_MASK,
+ (map_entry_is_error(currentp->old_map)) ? " ERROR" : "",
+ (map_entry_is_zero(currentp->old_map)) ? " ZERO" : "",
+ (map_entry_is_initial(currentp->old_map)) ? " INIT" : "",
+ currentp->new_map & BTT_MAP_ENTRY_LBA_MASK,
+ (map_entry_is_error(currentp->new_map)) ? " ERROR" : "",
+ (map_entry_is_zero(currentp->new_map)) ? " ZERO" : "",
+ (map_entry_is_initial(currentp->new_map)) ? " INIT" : "");
+
+ /*
+ * Decide if the current flog info represents a completed
+ * operation or an incomplete operation. If completed, the
+ * old_map field will contain the free block to be used for
+ * the next write. But if the operation didn't complete (indicated
+ * by the map entry not being updated), then the operation is
+ * completed now by updating the map entry.
+ *
+ * A special case, used by flog entries when first created, is
+ * when old_map == new_map. This counts as a complete entry
+ * and doesn't require reading the map to see if recovery is
+ * required.
+ */
+ if (currentp->old_map == currentp->new_map) {
+ LOG(9, "flog[%u] entry complete (initial state)", flognum);
+ return 0;
+ }
+
+ /* convert pre-map LBA into an offset into the map */
+ uint64_t map_entry_off = arenap->mapoff +
+ BTT_MAP_ENTRY_SIZE * currentp->lba;
+
+ /* read current map entry */
+ uint32_t entry;
+ if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &entry,
+ sizeof(entry), map_entry_off) < 0)
+ return -1;
+
+ entry = le32toh(entry);
+
+ /* map entry in initial state */
+ if (map_entry_is_initial(entry))
+ entry = currentp->lba | BTT_MAP_ENTRY_NORMAL;
+
+ if (currentp->new_map != entry && currentp->old_map == entry) {
+ /* last update didn't complete */
+ LOG(9, "recover flog[%u]: map[%u]: %u",
+ flognum, currentp->lba, currentp->new_map);
+
+ /*
+ * Recovery step is to complete the transaction by
+ * updating the map entry.
+ */
+ entry = htole32(currentp->new_map);
+ if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &entry,
+ sizeof(uint32_t), map_entry_off) < 0)
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * flog_update -- (internal) write out an updated flog entry
+ *
+ * The flog entries are not checksummed. Instead, increasing sequence
+ * numbers are used to atomically switch the active flog entry between
+ * the first and second struct btt_flog in each slot. In order for this
+ * to work, the sequence number must be updated only after all the other
+ * fields in the flog are updated. So the writes to the flog are broken
+ * into two writes, one for the first two fields (lba, old_map) and,
+ * only after those fields are known to be written durably, a second
+ * write covering new_map and the seq field is done.
+ *
+ * Returns 0 on success, otherwise -1/errno.
+ */
+static int
+flog_update(struct btt *bttp, unsigned lane, struct arena *arenap,
+ uint32_t lba, uint32_t old_map, uint32_t new_map)
+{
+ LOG(3, "bttp %p lane %u arenap %p lba %u old_map %u new_map %u",
+ bttp, lane, arenap, lba, old_map, new_map);
+
+ /* construct new flog entry in little-endian byte order */
+ struct btt_flog new_flog;
+ new_flog.lba = lba;
+ new_flog.old_map = old_map;
+ new_flog.new_map = new_map;
+ new_flog.seq = NSEQ(arenap->flogs[lane].flog.seq);
+ btt_flog_convert2le(&new_flog);
+
+ uint64_t new_flog_off =
+ arenap->flogs[lane].entries[arenap->flogs[lane].next];
+
+ /* write out first two fields first */
+ if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &new_flog,
+ sizeof(uint32_t) * 2, new_flog_off) < 0)
+ return -1;
+ new_flog_off += sizeof(uint32_t) * 2;
+
+ /* write out new_map and seq field to make it active */
+ if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &new_flog.new_map,
+ sizeof(uint32_t) * 2, new_flog_off) < 0)
+ return -1;
+
+ /* flog entry written successfully, update run-time state */
+ arenap->flogs[lane].next = 1 - arenap->flogs[lane].next;
+ arenap->flogs[lane].flog.lba = lba;
+ arenap->flogs[lane].flog.old_map = old_map;
+ arenap->flogs[lane].flog.new_map = new_map;
+ arenap->flogs[lane].flog.seq = NSEQ(arenap->flogs[lane].flog.seq);
+
+ LOG(9, "update flog[%u]: lba %u old %u%s%s%s new %u%s%s%s", lane, lba,
+ old_map & BTT_MAP_ENTRY_LBA_MASK,
+ (map_entry_is_error(old_map)) ? " ERROR" : "",
+ (map_entry_is_zero(old_map)) ? " ZERO" : "",
+ (map_entry_is_initial(old_map)) ? " INIT" : "",
+ new_map & BTT_MAP_ENTRY_LBA_MASK,
+ (map_entry_is_error(new_map)) ? " ERROR" : "",
+ (map_entry_is_zero(new_map)) ? " ZERO" : "",
+ (map_entry_is_initial(new_map)) ? " INIT" : "");
+
+ return 0;
+}
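+
+/*
+ * Note the powerfail behavior: if a crash lands between the two
+ * nswrite() calls above, the seq field of the slot being written is
+ * unchanged, so on recovery btt_flog_get_valid() still selects the
+ * sibling slot and the old flog entry remains current.
+ */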
+
+/*
+ * arena_setf -- (internal) updates the given flag for the arena info block
+ */
+static int
+arena_setf(struct btt *bttp, struct arena *arenap, unsigned lane, uint32_t setf)
+{
+ LOG(3, "bttp %p arenap %p lane %u setf 0x%x", bttp, arenap, lane, setf);
+
+ /* update runtime state */
+ util_fetch_and_or32(&arenap->flags, setf);
+
+ if (!bttp->laidout) {
+ /* no layout yet to update */
+ return 0;
+ }
+
+ /*
+ * Read, modify and write out the info block
+ * at both the beginning and end of the arena.
+ */
+ uint64_t arena_off = arenap->startoff;
+
+ struct btt_info info;
+
+ /* protect from simultaneous writes to the layout */
+ util_mutex_lock(&arenap->info_lock);
+
+ if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &info,
+ sizeof(info), arena_off) < 0) {
+ goto err;
+ }
+
+ uint64_t infooff = le64toh(info.infooff);
+
+ /* update flags */
+ info.flags |= htole32(setf);
+
+ /* update checksum */
+ util_checksum(&info, sizeof(info), &info.checksum, 1, 0);
+
+ if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
+ sizeof(info), arena_off) < 0) {
+ goto err;
+ }
+
+ if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
+ sizeof(info), arena_off + infooff) < 0) {
+ goto err;
+ }
+
+ util_mutex_unlock(&arenap->info_lock);
+ return 0;
+
+err:
+ util_mutex_unlock(&arenap->info_lock);
+ return -1;
+}
+
+/*
+ * set_arena_error -- (internal) set the error flag for the given arena
+ */
+static int
+set_arena_error(struct btt *bttp, struct arena *arenap, unsigned lane)
+{
+ LOG(3, "bttp %p arena %p lane %u", bttp, arenap, lane);
+
+ return arena_setf(bttp, arenap, lane, BTTINFO_FLAG_ERROR);
+}
+
+/*
+ * read_flogs -- (internal) load up all the flog entries for an arena
+ *
+ * Zero is returned on success, otherwise -1/errno.
+ */
+static int
+read_flogs(struct btt *bttp, unsigned lane, struct arena *arenap)
+{
+ if ((arenap->flogs = Zalloc(bttp->nfree *
+ sizeof(struct flog_runtime))) == NULL) {
+ ERR("!Malloc for %u flog entries", bttp->nfree);
+ return -1;
+ }
+
+ /*
+ * Load up the flog state. read_flog_pair() will determine if
+ * any recovery steps are required and take them on the in-memory
+ * data structures it creates. It sets the arena error flag when
+ * it detects an invalid state.
+ */
+ uint64_t flog_off = arenap->flogoff;
+ struct flog_runtime *flog_runtimep = arenap->flogs;
+ for (uint32_t i = 0; i < bttp->nfree; i++) {
+ if (read_flog_pair(bttp, lane, arenap, flog_off,
+ flog_runtimep, i) < 0) {
+ set_arena_error(bttp, arenap, lane);
+ return -1;
+ }
+
+ /* prepare for next time around the loop */
+ flog_off += roundup(2 * sizeof(struct btt_flog),
+ BTT_FLOG_PAIR_ALIGN);
+ flog_runtimep++;
+ }
+
+ return 0;
+}
+
+/*
+ * build_rtt -- (internal) construct a read tracking table for an arena
+ *
+ * Zero is returned on success, otherwise -1/errno.
+ *
+ * The rtt is big enough to hold an entry for each free block (nfree)
+ * since nlane can't be bigger than nfree. nlane may end up smaller,
+ * in which case some of the high rtt entries will be unused.
+ */
+static int
+build_rtt(struct btt *bttp, struct arena *arenap)
+{
+ if ((arenap->rtt = Malloc(bttp->nfree * sizeof(uint32_t)))
+ == NULL) {
+ ERR("!Malloc for %d rtt entries", bttp->nfree);
+ return -1;
+ }
+ for (uint32_t lane = 0; lane < bttp->nfree; lane++)
+ arenap->rtt[lane] = BTT_MAP_ENTRY_ERROR;
+ util_synchronize();
+
+ return 0;
+}
+
+/*
+ * build_map_locks -- (internal) construct map locks
+ *
+ * Zero is returned on success, otherwise -1/errno.
+ */
+static int
+build_map_locks(struct btt *bttp, struct arena *arenap)
+{
+ if ((arenap->map_locks =
+ Malloc(bttp->nfree * sizeof(*arenap->map_locks)))
+ == NULL) {
+ ERR("!Malloc for %d map_lock entries", bttp->nfree);
+ return -1;
+ }
+ for (uint32_t lane = 0; lane < bttp->nfree; lane++)
+ util_mutex_init(&arenap->map_locks[lane]);
+
+ return 0;
+}
+
+/*
+ * read_arena -- (internal) load up an arena and build run-time state
+ *
+ * Zero is returned on success, otherwise -1/errno.
+ */
+static int
+read_arena(struct btt *bttp, unsigned lane, uint64_t arena_off,
+ struct arena *arenap)
+{
+ LOG(3, "bttp %p lane %u arena_off %" PRIu64 " arenap %p",
+ bttp, lane, arena_off, arenap);
+
+ struct btt_info info;
+ if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &info, sizeof(info),
+ arena_off) < 0)
+ return -1;
+
+ arenap->flags = le32toh(info.flags);
+ arenap->external_nlba = le32toh(info.external_nlba);
+ arenap->internal_lbasize = le32toh(info.internal_lbasize);
+ arenap->internal_nlba = le32toh(info.internal_nlba);
+
+ arenap->startoff = arena_off;
+ arenap->dataoff = arena_off + le64toh(info.dataoff);
+ arenap->mapoff = arena_off + le64toh(info.mapoff);
+ arenap->flogoff = arena_off + le64toh(info.flogoff);
+ arenap->nextoff = arena_off + le64toh(info.nextoff);
+
+ if (read_flogs(bttp, lane, arenap) < 0)
+ return -1;
+
+ if (build_rtt(bttp, arenap) < 0)
+ return -1;
+
+ if (build_map_locks(bttp, arenap) < 0)
+ return -1;
+
+ /* initialize the per arena info block lock */
+ util_mutex_init(&arenap->info_lock);
+
+ return 0;
+}
+
+/*
+ * btt_info_convert2h -- convert btt_info to host byte order
+ */
+void
+btt_info_convert2h(struct btt_info *infop)
+{
+ infop->flags = le32toh(infop->flags);
+ infop->major = le16toh(infop->major);
+ infop->minor = le16toh(infop->minor);
+ infop->external_lbasize = le32toh(infop->external_lbasize);
+ infop->external_nlba = le32toh(infop->external_nlba);
+ infop->internal_lbasize = le32toh(infop->internal_lbasize);
+ infop->internal_nlba = le32toh(infop->internal_nlba);
+ infop->nfree = le32toh(infop->nfree);
+ infop->infosize = le32toh(infop->infosize);
+ infop->nextoff = le64toh(infop->nextoff);
+ infop->dataoff = le64toh(infop->dataoff);
+ infop->mapoff = le64toh(infop->mapoff);
+ infop->flogoff = le64toh(infop->flogoff);
+ infop->infooff = le64toh(infop->infooff);
+}
+
+/*
+ * btt_info_convert2le -- convert btt_info to little-endian byte order
+ */
+void
+btt_info_convert2le(struct btt_info *infop)
+{
+ infop->flags = htole32(infop->flags);
+ infop->major = htole16(infop->major);
+ infop->minor = htole16(infop->minor);
+ infop->external_lbasize = htole32(infop->external_lbasize);
+ infop->external_nlba = htole32(infop->external_nlba);
+ infop->internal_lbasize = htole32(infop->internal_lbasize);
+ infop->internal_nlba = htole32(infop->internal_nlba);
+ infop->nfree = htole32(infop->nfree);
+ infop->infosize = htole32(infop->infosize);
+ infop->nextoff = htole64(infop->nextoff);
+ infop->dataoff = htole64(infop->dataoff);
+ infop->mapoff = htole64(infop->mapoff);
+ infop->flogoff = htole64(infop->flogoff);
+ infop->infooff = htole64(infop->infooff);
+}
+
+/*
+ * btt_flog_convert2h -- convert btt_flog to host byte order
+ */
+void
+btt_flog_convert2h(struct btt_flog *flogp)
+{
+ flogp->lba = le32toh(flogp->lba);
+ flogp->old_map = le32toh(flogp->old_map);
+ flogp->new_map = le32toh(flogp->new_map);
+ flogp->seq = le32toh(flogp->seq);
+}
+
+/*
+ * btt_flog_convert2le -- convert btt_flog to LE byte order
+ */
+void
+btt_flog_convert2le(struct btt_flog *flogp)
+{
+ flogp->lba = htole32(flogp->lba);
+ flogp->old_map = htole32(flogp->old_map);
+ flogp->new_map = htole32(flogp->new_map);
+ flogp->seq = htole32(flogp->seq);
+}
+
+/*
+ * read_arenas -- (internal) load up all arenas and build run-time state
+ *
+ * On entry, layout must be known to be valid, and the number of arenas
+ * must be known. Zero is returned on success, otherwise -1/errno.
+ */
+static int
+read_arenas(struct btt *bttp, unsigned lane, unsigned narena)
+{
+ LOG(3, "bttp %p lane %u narena %d", bttp, lane, narena);
+
+ if ((bttp->arenas = Zalloc(narena * sizeof(*bttp->arenas))) == NULL) {
+ ERR("!Malloc for %u arenas", narena);
+ goto err;
+ }
+
+ uint64_t arena_off = 0;
+ struct arena *arenap = bttp->arenas;
+ for (unsigned i = 0; i < narena; i++) {
+
+ if (read_arena(bttp, lane, arena_off, arenap) < 0)
+ goto err;
+
+ /* prepare for next time around the loop */
+ arena_off = arenap->nextoff;
+ arenap++;
+ }
+
+ bttp->laidout = 1;
+
+ return 0;
+
+err:
+ LOG(4, "error clean up");
+ int oerrno = errno;
+ if (bttp->arenas) {
+ for (unsigned i = 0; i < bttp->narena; i++) {
+ if (bttp->arenas[i].flogs)
+ Free(bttp->arenas[i].flogs);
+ if (bttp->arenas[i].rtt)
+ Free((void *)bttp->arenas[i].rtt);
+ if (bttp->arenas[i].map_locks)
+ Free((void *)bttp->arenas[i].map_locks);
+ }
+ Free(bttp->arenas);
+ bttp->arenas = NULL;
+ }
+ errno = oerrno;
+ return -1;
+}
+
+/*
+ * internal_lbasize -- (internal) calculate internal LBA size
+ */
+static inline uint32_t
+internal_lbasize(uint32_t external_lbasize)
+{
+ uint32_t internal_lbasize = external_lbasize;
+ if (internal_lbasize < BTT_MIN_LBA_SIZE)
+ internal_lbasize = BTT_MIN_LBA_SIZE;
+ internal_lbasize =
+ roundup(internal_lbasize, BTT_INTERNAL_LBA_ALIGNMENT);
+ /* check for overflow */
+ if (internal_lbasize < BTT_INTERNAL_LBA_ALIGNMENT) {
+ errno = EINVAL;
+ ERR("!Invalid lba size after alignment: %u ", internal_lbasize);
+ return 0;
+ }
+
+ return internal_lbasize;
+}
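+
+/*
+ * Worked example (constants assumed from btt_layout.h, minimum 512
+ * and alignment 256): an external LBA size of 520 is already above
+ * the minimum and rounds up to an internal size of 768; an external
+ * size of 100 is first raised to 512, which is already aligned.
+ */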
+
+/*
+ * btt_flog_size -- calculate flog data size
+ */
+uint64_t
+btt_flog_size(uint32_t nfree)
+{
+ uint64_t flog_size = nfree * roundup(2 * sizeof(struct btt_flog),
+ BTT_FLOG_PAIR_ALIGN);
+ return roundup(flog_size, BTT_ALIGNMENT);
+}
+
+/*
+ * btt_map_size -- calculate map data size
+ */
+uint64_t
+btt_map_size(uint32_t external_nlba)
+{
+ return roundup(external_nlba * BTT_MAP_ENTRY_SIZE, BTT_ALIGNMENT);
+}
+
+/*
+ * btt_arena_datasize -- arena size minus the BTT Info header, the backup
+ * info block and the flog; i.e., the size of the data blocks and the map
+ */
+uint64_t
+btt_arena_datasize(uint64_t arena_size, uint32_t nfree)
+{
+ return arena_size - 2 * sizeof(struct btt_info) - btt_flog_size(nfree);
+}
+
+/*
+ * btt_info_set_params -- (internal) calculate and set BTT Info
+ * external_lbasize, internal_lbasize, nfree, infosize, external_nlba and
+ * internal_nlba
+ */
+static int
+btt_info_set_params(struct btt_info *info, uint32_t external_lbasize,
+ uint32_t internal_lbasize, uint32_t nfree, uint64_t arena_size)
+{
+ info->external_lbasize = external_lbasize;
+ info->internal_lbasize = internal_lbasize;
+ info->nfree = nfree;
+ info->infosize = sizeof(*info);
+
+ uint64_t arena_data_size = btt_arena_datasize(arena_size, nfree);
+
+ /* allow for map alignment padding */
+ uint64_t internal_nlba = (arena_data_size - BTT_ALIGNMENT) /
+ (info->internal_lbasize + BTT_MAP_ENTRY_SIZE);
+
+ /* ensure the number of blocks is at least 2*nfree */
+ if (internal_nlba < 2 * nfree) {
+ errno = EINVAL;
+ ERR("!number of internal blocks: %" PRIu64
+ " expected at least %u",
+ internal_nlba, 2 * nfree);
+ return -1;
+ }
+
+ ASSERT(internal_nlba <= UINT32_MAX);
+ uint32_t internal_nlba_u32 = (uint32_t)internal_nlba;
+
+ info->internal_nlba = internal_nlba_u32;
+ /* external LBA does not include free blocks */
+ info->external_nlba = internal_nlba_u32 - info->nfree;
+
+ ASSERT((arena_data_size - btt_map_size(info->external_nlba)) /
+ internal_lbasize >= internal_nlba);
+
+ return 0;
+}
+
+/*
+ * btt_info_set_offs -- (internal) calculate and set the BTT Info dataoff,
+ * nextoff, infooff, flogoff and mapoff. These are all relative to the
+ * beginning of the arena.
+ */
+static void
+btt_info_set_offs(struct btt_info *info, uint64_t arena_size,
+ uint64_t space_left)
+{
+ info->dataoff = info->infosize;
+
+ /* set offset to next valid arena */
+ if (space_left >= BTT_MIN_SIZE)
+ info->nextoff = arena_size;
+ else
+ info->nextoff = 0;
+
+ info->infooff = arena_size - sizeof(struct btt_info);
+ info->flogoff = info->infooff - btt_flog_size(info->nfree);
+ info->mapoff = info->flogoff - btt_map_size(info->external_nlba);
+
+ ASSERTeq(btt_arena_datasize(arena_size, info->nfree) -
+ btt_map_size(info->external_nlba), info->mapoff -
+ info->dataoff);
+}
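+
+/*
+ * The resulting arena layout, from low offset to high, is:
+ *
+ *	+----------+-------------+-----+------+-------------+
+ *	| btt_info | data blocks | map | flog | backup info |
+ *	+----------+-------------+-----+------+-------------+
+ *	0          dataoff    mapoff  flogoff  infooff
+ */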
+
+/*
+ * btt_info_set -- set BTT Info params and offsets
+ */
+int
+btt_info_set(struct btt_info *info, uint32_t external_lbasize,
+ uint32_t nfree, uint64_t arena_size, uint64_t space_left)
+{
+ /* calculate internal LBA size */
+ uint32_t internal_lba_size = internal_lbasize(external_lbasize);
+ if (internal_lba_size == 0)
+ return -1;
+
+ /* set params and offsets */
+ if (btt_info_set_params(info, external_lbasize,
+ internal_lba_size, nfree, arena_size))
+ return -1;
+
+ btt_info_set_offs(info, arena_size, space_left);
+
+ return 0;
+}
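+
+/*
+ * Usage sketch (editor's addition, hypothetical caller): filling in an
+ * info block for the smallest legal arena with 512-byte external
+ * blocks; space_left == 0 marks it as the last arena (nextoff = 0).
+ */
+#if 0
+static int
+btt_info_set_example(void)
+{
+ struct btt_info info;
+ memset(&info, 0, sizeof(info));
+
+ /* on failure errno is set, e.g. EINVAL for a too-small arena */
+ return btt_info_set(&info, 512, BTT_DEFAULT_NFREE,
+ BTT_MIN_SIZE, 0);
+}
+#endif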
+
+/*
+ * write_layout -- (internal) write out the initial btt metadata layout
+ *
+ * Called with write == 1 only once in the lifetime of a btt namespace, when
+ * the first write happens. The caller of this routine is responsible for
+ * locking out multiple threads. This routine doesn't read anything -- by the
+ * time it is called, it is known there's no layout in the namespace and a new
+ * layout should be written.
+ *
+ * Calling with write == 0 tells this routine to do the calculations for
+ * bttp->narena and bttp->nlba, but not to write out any metadata.
+ *
+ * If successful, sets bttp->laidout to 1 and returns 0. Otherwise -1
+ * is returned and errno is set, and bttp->laidout remains 0 so that
+ * later attempts to write will try again to create the layout.
+ */
+static int
+write_layout(struct btt *bttp, unsigned lane, int write)
+{
+ LOG(3, "bttp %p lane %u write %d", bttp, lane, write);
+
+ ASSERT(bttp->rawsize >= BTT_MIN_SIZE);
+ ASSERT(bttp->nfree);
+
+ /*
+ * If a new layout is being written, generate the BTT's UUID.
+ */
+ if (write) {
+ int ret = util_uuid_generate(bttp->uuid);
+ if (ret < 0) {
+ LOG(2, "util_uuid_generate failed");
+ return -1;
+ }
+ }
+
+ /*
+ * The number of arenas is the number of full arenas of size
+ * BTT_MAX_ARENA that fit into rawsize; if the remainder is at
+ * least BTT_MIN_SIZE, that adds one more arena.
+ */
+ bttp->narena = (unsigned)(bttp->rawsize / BTT_MAX_ARENA);
+ if (bttp->rawsize % BTT_MAX_ARENA >= BTT_MIN_SIZE)
+ bttp->narena++;
+ LOG(4, "narena %u", bttp->narena);
+
+ uint32_t internal_lba_size = internal_lbasize(bttp->lbasize);
+ if (internal_lba_size == 0)
+ return -1;
+ LOG(4, "adjusted internal_lbasize %u", internal_lba_size);
+
+ uint64_t total_nlba = 0;
+ uint64_t rawsize = bttp->rawsize;
+ unsigned arena_num = 0;
+ uint64_t arena_off = 0;
+
+ /*
+ * for each arena...
+ */
+ while (rawsize >= BTT_MIN_SIZE) {
+ LOG(4, "layout arena %u", arena_num);
+
+ uint64_t arena_rawsize = rawsize;
+ if (arena_rawsize > BTT_MAX_ARENA) {
+ arena_rawsize = BTT_MAX_ARENA;
+ }
+ rawsize -= arena_rawsize;
+ arena_num++;
+
+ struct btt_info info;
+ memset(&info, '\0', sizeof(info));
+ if (btt_info_set_params(&info, bttp->lbasize,
+ internal_lba_size, bttp->nfree, arena_rawsize))
+ return -1;
+
+ LOG(4, "internal_nlba %u external_nlba %u",
+ info.internal_nlba, info.external_nlba);
+
+ total_nlba += info.external_nlba;
+
+ /*
+ * The rest of the loop body calculates the metadata
+ * structures and lays them out for this arena, so only
+ * continue if the write flag is set.
+ */
+ if (!write)
+ continue;
+
+ btt_info_set_offs(&info, arena_rawsize, rawsize);
+
+ LOG(4, "nextoff 0x%016" PRIx64, info.nextoff);
+ LOG(4, "dataoff 0x%016" PRIx64, info.dataoff);
+ LOG(4, "mapoff 0x%016" PRIx64, info.mapoff);
+ LOG(4, "flogoff 0x%016" PRIx64, info.flogoff);
+ LOG(4, "infooff 0x%016" PRIx64, info.infooff);
+
+ /* zero map if ns is not zero-initialized */
+ if (!bttp->ns_cbp->ns_is_zeroed) {
+ uint64_t mapsize = btt_map_size(info.external_nlba);
+ if ((*bttp->ns_cbp->nszero)(bttp->ns, lane, mapsize,
+ info.mapoff) < 0)
+ return -1;
+ }
+
+ /* write out the initial flog */
+ uint64_t flog_entry_off = arena_off + info.flogoff;
+ uint32_t next_free_lba = info.external_nlba;
+ for (uint32_t i = 0; i < bttp->nfree; i++) {
+ struct btt_flog flog;
+ flog.lba = htole32(i);
+ flog.old_map = flog.new_map =
+ htole32(next_free_lba | BTT_MAP_ENTRY_ZERO);
+ flog.seq = htole32(1);
+
+ /*
+ * Write both btt_flog structs in the pair, writing
+ * the second one as all zeros.
+ */
+ LOG(6, "flog[%u] entry off %" PRIu64
+ " initial %u + zero = %u",
+ i, flog_entry_off,
+ next_free_lba,
+ next_free_lba | BTT_MAP_ENTRY_ZERO);
+ if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &flog,
+ sizeof(flog), flog_entry_off) < 0)
+ return -1;
+ flog_entry_off += sizeof(flog);
+
+ LOG(6, "flog[%u] entry off %" PRIu64 " zeros",
+ i, flog_entry_off);
+ if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &Zflog,
+ sizeof(Zflog), flog_entry_off) < 0)
+ return -1;
+ flog_entry_off += sizeof(flog);
+ flog_entry_off = roundup(flog_entry_off,
+ BTT_FLOG_PAIR_ALIGN);
+
+ next_free_lba++;
+ }
+
+ /*
+ * Construct the BTT info block and write it out
+ * at both the beginning and end of the arena.
+ */
+ memcpy(info.sig, Sig, BTTINFO_SIG_LEN);
+ memcpy(info.uuid, bttp->uuid, BTTINFO_UUID_LEN);
+ memcpy(info.parent_uuid, bttp->parent_uuid, BTTINFO_UUID_LEN);
+ info.major = BTTINFO_MAJOR_VERSION;
+ info.minor = BTTINFO_MINOR_VERSION;
+ btt_info_convert2le(&info);
+
+ util_checksum(&info, sizeof(info), &info.checksum, 1, 0);
+
+ if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
+ sizeof(info), arena_off) < 0)
+ return -1;
+ if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
+ sizeof(info), arena_off + info.infooff) < 0)
+ return -1;
+
+ arena_off += info.nextoff;
+ }
+
+ ASSERTeq(bttp->narena, arena_num);
+
+ bttp->nlba = total_nlba;
+
+ if (write) {
+ /*
+ * The layout is written now, so load up the arenas.
+ */
+ return read_arenas(bttp, lane, bttp->narena);
+ }
+
+ return 0;
+}
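+
+/*
+ * For reference (editor's note), the per-arena layout produced above,
+ * with all offsets relative to the start of the arena; map, flog and
+ * the backup info block are placed from the end of the arena down:
+ *
+ * 0        dataoff       mapoff    flogoff   infooff     arena_size
+ * +--------+-------------+---------+---------+-------------+
+ * | info   | data blocks | map     | flog    | info backup |
+ * +--------+-------------+---------+---------+-------------+
+ */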
+
+/*
+ * read_layout -- (internal) load up layout info from btt namespace
+ *
+ * Called once when the btt namespace is opened for use.
+ * Sets bttp->laidout to 0 if no valid layout is found, 1 otherwise.
+ *
+ * Any recovery actions required (as indicated by the flog state) are
+ * performed by this routine.
+ *
+ * Any quick checks for layout consistency are performed by this routine
+ * (quick enough to be done each time a BTT area is opened for use, not
+ * like the slow consistency checks done by btt_check()).
+ *
+ * Returns 0 if no errors are encountered accessing the namespace (in this
+ * context, detecting there's no layout is not an error if the nsread function
+ * didn't have any problems doing the reads). Otherwise, -1 is returned
+ * and errno is set.
+ */
+static int
+read_layout(struct btt *bttp, unsigned lane)
+{
+ LOG(3, "bttp %p", bttp);
+
+ ASSERT(bttp->rawsize >= BTT_MIN_SIZE);
+
+ unsigned narena = 0;
+ uint32_t smallest_nfree = UINT32_MAX;
+ uint64_t rawsize = bttp->rawsize;
+ uint64_t total_nlba = 0;
+ uint64_t arena_off = 0;
+
+ bttp->nfree = BTT_DEFAULT_NFREE;
+
+ /*
+ * For each arena, see if there's a valid info block
+ */
+ while (rawsize >= BTT_MIN_SIZE) {
+ narena++;
+
+ struct btt_info info;
+ if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &info,
+ sizeof(info), arena_off) < 0)
+ return -1;
+
+ if (!read_info(bttp, &info)) {
+ /*
+ * Failed to find complete BTT metadata. Just
+ * calculate the narena and nlba values that will
+ * result when write_layout() gets called. This
+ * allows checks against nlba to work correctly
+ * even before the layout is written.
+ */
+ return write_layout(bttp, lane, 0);
+ }
+ if (info.external_lbasize != bttp->lbasize) {
+ /* can't read it assuming the wrong block size */
+ ERR("inconsistent lbasize");
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (info.nfree == 0) {
+ ERR("invalid nfree");
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (info.external_nlba == 0) {
+ ERR("invalid external_nlba");
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (info.nextoff && (info.nextoff != BTT_MAX_ARENA)) {
+ ERR("invalid arena size");
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (info.nfree < smallest_nfree)
+ smallest_nfree = info.nfree;
+
+ total_nlba += info.external_nlba;
+ arena_off += info.nextoff;
+ if (info.nextoff == 0)
+ break;
+ if (info.nextoff > rawsize) {
+ ERR("invalid next arena offset");
+ errno = EINVAL;
+ return -1;
+ }
+ rawsize -= info.nextoff;
+ }
+
+ ASSERT(narena);
+
+ bttp->narena = narena;
+ bttp->nlba = total_nlba;
+
+ /*
+ * All arenas were valid. nfree should be the smallest value found
+ * among different arenas.
+ */
+ if (smallest_nfree < bttp->nfree)
+ bttp->nfree = smallest_nfree;
+
+ /*
+ * Load up arenas.
+ */
+ return read_arenas(bttp, lane, narena);
+}
+
+/*
+ * zero_block -- (internal) satisfy a read with a block of zeros
+ *
+ * Returns 0 on success, otherwise -1/errno.
+ */
+static int
+zero_block(struct btt *bttp, void *buf)
+{
+ LOG(3, "bttp %p", bttp);
+
+ memset(buf, '\0', bttp->lbasize);
+ return 0;
+}
+
+/*
+ * lba_to_arena_lba -- (internal) calculate the arena & pre-map LBA
+ *
+ * This routine takes the external LBA and matches it to the
+ * appropriate arena, adjusting the lba for use within that arena.
+ *
+ * If successful, zero is returned, *arenapp is a pointer to the appropriate
+ * arena struct in the run-time state, and *premap_lbap is the LBA adjusted
+ * to an arena-internal LBA (also known as the pre-map LBA). Otherwise
+ * -1/errno.
+ */
+static int
+lba_to_arena_lba(struct btt *bttp, uint64_t lba,
+ struct arena **arenapp, uint32_t *premap_lbap)
+{
+ LOG(3, "bttp %p lba %" PRIu64, bttp, lba);
+
+ ASSERT(bttp->laidout);
+
+ unsigned arena;
+ for (arena = 0; arena < bttp->narena; arena++)
+ if (lba < bttp->arenas[arena].external_nlba)
+ break;
+ else
+ lba -= bttp->arenas[arena].external_nlba;
+
+ ASSERT(arena < bttp->narena);
+
+ *arenapp = &bttp->arenas[arena];
+ ASSERT(lba <= UINT32_MAX);
+ *premap_lbap = (uint32_t)lba;
+
+ LOG(3, "arenap %p pre-map LBA %u", *arenapp, *premap_lbap);
+ return 0;
+}
+
+/*
+ * btt_init -- prepare a btt namespace for use, returning an opaque handle
+ *
+ * Returns handle on success, otherwise NULL/errno.
+ *
+ * When given a pristine namespace, it will be formatted implicitly when
+ * touched for the first time by a write.
+ *
+ * If arenas have different nfree values, the lowest value found is used,
+ * since it limits the overall concurrency ("bandwidth").
+ */
+struct btt *
+btt_init(uint64_t rawsize, uint32_t lbasize, uint8_t parent_uuid[],
+ unsigned maxlane, void *ns, const struct ns_callback *ns_cbp)
+{
+ LOG(3, "rawsize %" PRIu64 " lbasize %u", rawsize, lbasize);
+
+ if (rawsize < BTT_MIN_SIZE) {
+ ERR("rawsize smaller than BTT_MIN_SIZE %u", BTT_MIN_SIZE);
+ errno = EINVAL;
+ return NULL;
+ }
+
+ struct btt *bttp = Zalloc(sizeof(*bttp));
+
+ if (bttp == NULL) {
+ ERR("!Malloc %zu bytes", sizeof(*bttp));
+ return NULL;
+ }
+
+ util_mutex_init(&bttp->layout_write_mutex);
+ memcpy(bttp->parent_uuid, parent_uuid, BTTINFO_UUID_LEN);
+ bttp->rawsize = rawsize;
+ bttp->lbasize = lbasize;
+ bttp->ns = ns;
+ bttp->ns_cbp = ns_cbp;
+
+ /*
+ * Load up layout, if it exists.
+ *
+ * Whether read_layout() finds a valid layout or not, it finishes
+ * updating these layout-related fields:
+ * bttp->nfree
+ * bttp->nlba
+ * bttp->narena
+ * since these fields are used even before a valid layout is written.
+ */
+ if (read_layout(bttp, 0) < 0) {
+ btt_fini(bttp); /* free up any allocations */
+ return NULL;
+ }
+
+ bttp->nlane = bttp->nfree;
+
+ /* maxlane, if provided, is an upper bound on nlane */
+ if (maxlane && bttp->nlane > maxlane)
+ bttp->nlane = maxlane;
+
+ LOG(3, "success, bttp %p nlane %u", bttp, bttp->nlane);
+ return bttp;
+}
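+
+/*
+ * Usage sketch (editor's addition): the my_* names below are
+ * hypothetical stand-ins for the caller-provided namespace I/O
+ * callbacks declared in btt.h (struct ns_callback).
+ */
+#if 0
+static struct btt *
+btt_init_example(void *my_ns, uint64_t rawsize, uint8_t parent_uuid[])
+{
+ static const struct ns_callback my_ns_cb = {
+ .nsread = my_nsread,
+ .nswrite = my_nswrite,
+ .nszero = my_nszero,
+ .nsmap = my_nsmap,
+ .nssync = my_nssync,
+ .ns_is_zeroed = 0,
+ };
+
+ /* 512B external blocks; maxlane 0 = no cap beyond nfree */
+ return btt_init(rawsize, 512, parent_uuid, 0, my_ns, &my_ns_cb);
+}
+#endif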
+
+/*
+ * btt_nlane -- return the number of "lanes" for this btt namespace
+ *
+ * The number of lanes is the number of threads allowed in this module
+ * concurrently for a given btt. Each thread executing this code must
+ * have a unique "lane" number assigned to it between 0 and btt_nlane() - 1.
+ */
+unsigned
+btt_nlane(struct btt *bttp)
+{
+ LOG(3, "bttp %p", bttp);
+
+ return bttp->nlane;
+}
+
+/*
+ * btt_nlba -- return the number of usable blocks in a btt namespace
+ *
+ * Valid LBAs to pass to btt_read() and btt_write() are 0 through
+ * btt_nlba() - 1.
+ */
+size_t
+btt_nlba(struct btt *bttp)
+{
+ LOG(3, "bttp %p", bttp);
+
+ return bttp->nlba;
+}
+
+/*
+ * btt_read -- read a block from a btt namespace
+ *
+ * Returns 0 on success, otherwise -1/errno.
+ */
+int
+btt_read(struct btt *bttp, unsigned lane, uint64_t lba, void *buf)
+{
+ LOG(3, "bttp %p lane %u lba %" PRIu64, bttp, lane, lba);
+
+ if (invalid_lba(bttp, lba))
+ return -1;
+
+ /* if there's no layout written yet, all reads come back as zeros */
+ if (!bttp->laidout)
+ return zero_block(bttp, buf);
+
+ /* find which arena LBA lives in, and the offset to the map entry */
+ struct arena *arenap;
+ uint32_t premap_lba;
+ uint64_t map_entry_off;
+ if (lba_to_arena_lba(bttp, lba, &arenap, &premap_lba) < 0)
+ return -1;
+
+ /* convert pre-map LBA into an offset into the map */
+ map_entry_off = arenap->mapoff + BTT_MAP_ENTRY_SIZE * premap_lba;
+
+ /*
+ * Read the current map entry to get the post-map LBA for the data
+ * block read.
+ */
+ uint32_t entry;
+
+ if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &entry,
+ sizeof(entry), map_entry_off) < 0)
+ return -1;
+
+ entry = le32toh(entry);
+
+ /*
+ * Retries come back to the top of this loop (for a rare case where
+ * the map is changed by another thread doing writes to the same LBA).
+ */
+ while (1) {
+ if (map_entry_is_error(entry)) {
+ ERR("EIO due to map entry error flag");
+ errno = EIO;
+ return -1;
+ }
+
+ if (map_entry_is_zero_or_initial(entry))
+ return zero_block(bttp, buf);
+
+ /*
+ * Record the post-map LBA in the read tracking table during
+ * the read. The write will check entries in the read tracking
+ * table before allocating a block for a write, waiting for
+ * outstanding reads on that block to complete.
+ *
+ * Since we already checked for error, zero, and initial
+ * states above, the entry must have both error and zero
+ * bits set at this point (BTT_MAP_ENTRY_NORMAL). We store
+ * the entry that way, with those bits set, in the rtt and
+ * btt_write() will check for it the same way, with the bits
+ * both set.
+ */
+ arenap->rtt[lane] = entry;
+ util_synchronize();
+
+ /*
+ * In case this thread was preempted between reading entry and
+ * storing it in the rtt, check to see if the map changed. If
+ * it changed, the block about to be read is at least free now
+ * (in the flog, but that's okay since the data will still be
+ * undisturbed) and potentially allocated and being used for
+ * another write (data disturbed, so not okay to continue).
+ */
+ uint32_t latest_entry;
+ if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &latest_entry,
+ sizeof(latest_entry), map_entry_off) < 0) {
+ arenap->rtt[lane] = BTT_MAP_ENTRY_ERROR;
+ return -1;
+ }
+
+ latest_entry = le32toh(latest_entry);
+
+ if (entry == latest_entry)
+ break; /* map stayed the same */
+ else
+ entry = latest_entry; /* try again */
+ }
+
+ /*
+ * It is safe to read the block now, since the rtt protects the
+ * block from getting re-allocated to something else by a write.
+ */
+ uint64_t data_block_off =
+ arenap->dataoff + (uint64_t)(entry & BTT_MAP_ENTRY_LBA_MASK) *
+ arenap->internal_lbasize;
+ int readret = (*bttp->ns_cbp->nsread)(bttp->ns, lane, buf,
+ bttp->lbasize, data_block_off);
+
+ /* done with read, so clear out rtt entry */
+ arenap->rtt[lane] = BTT_MAP_ENTRY_ERROR;
+
+ return readret;
+}
+
+/*
+ * map_lock -- (internal) grab the map_lock and read a map entry
+ */
+static int
+map_lock(struct btt *bttp, unsigned lane, struct arena *arenap,
+ uint32_t *entryp, uint32_t premap_lba)
+{
+ LOG(3, "bttp %p lane %u arenap %p premap_lba %u",
+ bttp, lane, arenap, premap_lba);
+
+ uint64_t map_entry_off =
+ arenap->mapoff + BTT_MAP_ENTRY_SIZE * premap_lba;
+ uint32_t map_lock_num = get_map_lock_num(premap_lba, bttp->nfree);
+
+ util_mutex_lock(&arenap->map_locks[map_lock_num]);
+
+ /* read the old map entry */
+ if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, entryp,
+ sizeof(uint32_t), map_entry_off) < 0) {
+ util_mutex_unlock(&arenap->map_locks[map_lock_num]);
+ return -1;
+ }
+
+ /* if map entry is in its initial state return premap_lba */
+ if (map_entry_is_initial(*entryp))
+ *entryp = htole32(premap_lba | BTT_MAP_ENTRY_NORMAL);
+
+ LOG(9, "locked map[%d]: %u%s%s", premap_lba,
+ *entryp & BTT_MAP_ENTRY_LBA_MASK,
+ (map_entry_is_error(*entryp)) ? " ERROR" : "",
+ (map_entry_is_zero(*entryp)) ? " ZERO" : "");
+
+ return 0;
+}
+
+/*
+ * map_abort -- (internal) drop the map_lock without updating the entry
+ */
+static void
+map_abort(struct btt *bttp, unsigned lane, struct arena *arenap,
+ uint32_t premap_lba)
+{
+ LOG(3, "bttp %p lane %u arenap %p premap_lba %u",
+ bttp, lane, arenap, premap_lba);
+
+ util_mutex_unlock(&arenap->map_locks[get_map_lock_num(premap_lba,
+ bttp->nfree)]);
+}
+
+/*
+ * map_unlock -- (internal) update the map and drop the map_lock
+ */
+static int
+map_unlock(struct btt *bttp, unsigned lane, struct arena *arenap,
+ uint32_t entry, uint32_t premap_lba)
+{
+ LOG(3, "bttp %p lane %u arenap %p entry %u premap_lba %u",
+ bttp, lane, arenap, entry, premap_lba);
+
+ uint64_t map_entry_off =
+ arenap->mapoff + BTT_MAP_ENTRY_SIZE * premap_lba;
+
+ /* write the new map entry */
+ int err = (*bttp->ns_cbp->nswrite)(bttp->ns, lane, &entry,
+ sizeof(uint32_t), map_entry_off);
+
+ util_mutex_unlock(&arenap->map_locks[get_map_lock_num(premap_lba,
+ bttp->nfree)]);
+
+ LOG(9, "unlocked map[%d]: %u%s%s", premap_lba,
+ entry & BTT_MAP_ENTRY_LBA_MASK,
+ (map_entry_is_error(entry)) ? " ERROR" : "",
+ (map_entry_is_zero(entry)) ? " ZERO" : "");
+
+ return err;
+}
+
+/*
+ * btt_write -- write a block to a btt namespace
+ *
+ * Returns 0 on success, otherwise -1/errno.
+ */
+int
+btt_write(struct btt *bttp, unsigned lane, uint64_t lba, const void *buf)
+{
+ LOG(3, "bttp %p lane %u lba %" PRIu64, bttp, lane, lba);
+
+ if (invalid_lba(bttp, lba))
+ return -1;
+
+ /* first write through here will initialize the metadata layout */
+ if (!bttp->laidout) {
+ int err = 0;
+
+ util_mutex_lock(&bttp->layout_write_mutex);
+
+ if (!bttp->laidout)
+ err = write_layout(bttp, lane, 1);
+
+ util_mutex_unlock(&bttp->layout_write_mutex);
+
+ if (err < 0)
+ return err;
+ }
+
+ /* find which arena LBA lives in, and the offset to the map entry */
+ struct arena *arenap;
+ uint32_t premap_lba;
+ if (lba_to_arena_lba(bttp, lba, &arenap, &premap_lba) < 0)
+ return -1;
+
+ /* if the arena is in an error state, writing is not allowed */
+ if (arenap->flags & BTTINFO_FLAG_ERROR_MASK) {
+ ERR("EIO due to btt_info error flags 0x%x",
+ arenap->flags & BTTINFO_FLAG_ERROR_MASK);
+ errno = EIO;
+ return -1;
+ }
+
+ /*
+ * This routine was passed a unique "lane" which is an index
+ * into the flog. That means the free block held by flog[lane]
+ * is assigned to this thread and to no other threads (no additional
+ * locking required). So start by performing the write to the
+ * free block. It is only safe to write to a free block if it
+ * doesn't appear in the read tracking table, so scan that first
+ * and if found, wait for the thread reading from it to finish.
+ */
+ uint32_t free_entry = (arenap->flogs[lane].flog.old_map &
+ BTT_MAP_ENTRY_LBA_MASK) | BTT_MAP_ENTRY_NORMAL;
+
+ LOG(3, "free_entry %u (before mask %u)", free_entry,
+ arenap->flogs[lane].flog.old_map);
+
+ /* wait for other threads to finish any reads on free block */
+ for (unsigned i = 0; i < bttp->nlane; i++)
+ while (arenap->rtt[i] == free_entry)
+ ;
+
+ /* it is now safe to perform write to the free block */
+ uint64_t data_block_off = arenap->dataoff +
+ (uint64_t)(free_entry & BTT_MAP_ENTRY_LBA_MASK) *
+ arenap->internal_lbasize;
+ if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, buf,
+ bttp->lbasize, data_block_off) < 0)
+ return -1;
+
+ /*
+ * Make the new block active atomically by updating the on-media flog
+ * and then updating the map.
+ */
+ uint32_t old_entry;
+ if (map_lock(bttp, lane, arenap, &old_entry, premap_lba) < 0)
+ return -1;
+
+ old_entry = le32toh(old_entry);
+
+ /* update the flog */
+ if (flog_update(bttp, lane, arenap, premap_lba,
+ old_entry, free_entry) < 0) {
+ map_abort(bttp, lane, arenap, premap_lba);
+ return -1;
+ }
+
+ if (map_unlock(bttp, lane, arenap, htole32(free_entry),
+ premap_lba) < 0) {
+ /*
+ * A critical write error occurred, set the arena's
+ * info block error bit.
+ */
+ set_arena_error(bttp, arenap, lane);
+ errno = EIO;
+ return -1;
+ }
+
+ return 0;
+}
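+
+/*
+ * Usage sketch (editor's addition): each concurrent thread must own a
+ * distinct lane in [0, btt_nlane(bttp)); a single-threaded caller can
+ * simply pass lane 0, as below. Assumes lbasize == 512.
+ */
+#if 0
+static int
+btt_copy_block_example(struct btt *bttp, uint64_t src, uint64_t dst)
+{
+ char buf[512];
+
+ if (btt_read(bttp, 0, src, buf) < 0)
+ return -1;
+ return btt_write(bttp, 0, dst, buf);
+}
+#endif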
+
+/*
+ * map_entry_setf -- (internal) set a given flag on a map entry
+ *
+ * Returns 0 on success, otherwise -1/errno.
+ */
+static int
+map_entry_setf(struct btt *bttp, unsigned lane, uint64_t lba, uint32_t setf)
+{
+ LOG(3, "bttp %p lane %u lba %" PRIu64 " setf 0x%x",
+ bttp, lane, lba, setf);
+
+ if (invalid_lba(bttp, lba))
+ return -1;
+
+ if (!bttp->laidout) {
+ /*
+ * No layout is written yet. If the flag being set
+ * is the zero flag, it is superfluous since all blocks
+ * read as zero at this point.
+ */
+ if (setf == BTT_MAP_ENTRY_ZERO)
+ return 0;
+
+ /*
+ * Treat this like the first write and write out
+ * the metadata layout at this point.
+ */
+ int err = 0;
+ util_mutex_lock(&bttp->layout_write_mutex);
+
+ if (!bttp->laidout)
+ err = write_layout(bttp, lane, 1);
+
+ util_mutex_unlock(&bttp->layout_write_mutex);
+
+ if (err < 0)
+ return err;
+ }
+
+ /* find which arena LBA lives in, and the offset to the map entry */
+ struct arena *arenap;
+ uint32_t premap_lba;
+ if (lba_to_arena_lba(bttp, lba, &arenap, &premap_lba) < 0)
+ return -1;
+
+ /* if the arena is in an error state, writing is not allowed */
+ if (arenap->flags & BTTINFO_FLAG_ERROR_MASK) {
+ ERR("EIO due to btt_info error flags 0x%x",
+ arenap->flags & BTTINFO_FLAG_ERROR_MASK);
+ errno = EIO;
+ return -1;
+ }
+
+ /*
+ * Set the flags in the map entry. To do this, read the
+ * current map entry, set the flags, and write out the update.
+ */
+ uint32_t old_entry;
+ uint32_t new_entry;
+
+ if (map_lock(bttp, lane, arenap, &old_entry, premap_lba) < 0)
+ return -1;
+
+ old_entry = le32toh(old_entry);
+
+ if (setf == BTT_MAP_ENTRY_ZERO &&
+ map_entry_is_zero_or_initial(old_entry)) {
+ map_abort(bttp, lane, arenap, premap_lba);
+ return 0; /* block already zero, nothing to do */
+ }
+
+ /* create the new map entry */
+ new_entry = (old_entry & BTT_MAP_ENTRY_LBA_MASK) | setf;
+
+ if (map_unlock(bttp, lane, arenap, htole32(new_entry), premap_lba) < 0)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * btt_set_zero -- mark a block as zeroed in a btt namespace
+ *
+ * Returns 0 on success, otherwise -1/errno.
+ */
+int
+btt_set_zero(struct btt *bttp, unsigned lane, uint64_t lba)
+{
+ LOG(3, "bttp %p lane %u lba %" PRIu64, bttp, lane, lba);
+
+ return map_entry_setf(bttp, lane, lba, BTT_MAP_ENTRY_ZERO);
+}
+
+/*
+ * btt_set_error -- mark a block as in an error state in a btt namespace
+ *
+ * Returns 0 on success, otherwise -1/errno.
+ */
+int
+btt_set_error(struct btt *bttp, unsigned lane, uint64_t lba)
+{
+ LOG(3, "bttp %p lane %u lba %" PRIu64, bttp, lane, lba);
+
+ return map_entry_setf(bttp, lane, lba, BTT_MAP_ENTRY_ERROR);
+}
+
+/*
+ * check_arena -- (internal) perform a consistency check on an arena
+ */
+static int
+check_arena(struct btt *bttp, struct arena *arenap)
+{
+ LOG(3, "bttp %p arenap %p", bttp, arenap);
+
+ int consistent = 1;
+
+ uint64_t map_entry_off = arenap->mapoff;
+ uint32_t bitmapsize = howmany(arenap->internal_nlba, 8);
+ uint8_t *bitmap = Zalloc(bitmapsize);
+ if (bitmap == NULL) {
+ ERR("!Malloc for bitmap");
+ return -1;
+ }
+
+ /*
+ * Go through every post-map LBA mentioned in the map and make sure
+ * there are no duplicates. bitmap is used to track which LBAs have
+ * been seen so far.
+ */
+ uint32_t *mapp = NULL;
+ ssize_t mlen;
+ int next_index = 0;
+ size_t remaining = 0;
+ for (uint32_t i = 0; i < arenap->external_nlba; i++) {
+ uint32_t entry;
+
+ if (remaining == 0) {
+ /* request a mapping of remaining map area */
+ size_t req_len =
+ (arenap->external_nlba - i) * sizeof(uint32_t);
+ mlen = (*bttp->ns_cbp->nsmap)(bttp->ns, 0,
+ (void **)&mapp, req_len, map_entry_off);
+
+ if (mlen < 0) {
+ Free(bitmap);
+ return -1;
+ }
+
+ remaining = (size_t)mlen;
+ next_index = 0;
+ }
+ entry = le32toh(mapp[next_index]);
+
+ /* for debug, dump non-zero map entries at log level 11 */
+ if (map_entry_is_zero_or_initial(entry) == 0)
+ LOG(11, "map[%d]: %u%s", i,
+ entry & BTT_MAP_ENTRY_LBA_MASK,
+ (map_entry_is_error(entry)) ? " ERROR" : "");
+
+ /* this is an uninitialized map entry, set the default value */
+ if (map_entry_is_initial(entry))
+ entry = i;
+ else
+ entry &= BTT_MAP_ENTRY_LBA_MASK;
+
+ /* check if entry is valid */
+ if (entry >= arenap->internal_nlba) {
+ ERR("map[%d] entry out of bounds: %u", i, entry);
+ errno = EINVAL;
+ Free(bitmap);
+ return -1;
+ }
+
+ if (util_isset(bitmap, entry)) {
+ ERR("map[%d] duplicate entry: %u", i, entry);
+ consistent = 0;
+ } else
+ util_setbit(bitmap, entry);
+
+ map_entry_off += sizeof(uint32_t);
+ next_index++;
+ ASSERT(remaining >= sizeof(uint32_t));
+ remaining -= sizeof(uint32_t);
+ }
+
+ /*
+ * Go through the free blocks in the flog, adding them to bitmap
+ * and checking for duplications. It is sufficient to read the
+ * run-time flog here, avoiding more calls to nsread.
+ */
+ for (uint32_t i = 0; i < bttp->nfree; i++) {
+ uint32_t entry = arenap->flogs[i].flog.old_map;
+ entry &= BTT_MAP_ENTRY_LBA_MASK;
+
+ if (util_isset(bitmap, entry)) {
+ ERR("flog[%u] duplicate entry: %u", i, entry);
+ consistent = 0;
+ } else
+ util_setbit(bitmap, entry);
+ }
+
+ /*
+ * Make sure every possible post-map LBA was accounted for
+ * in the two loops above.
+ */
+ for (uint32_t i = 0; i < arenap->internal_nlba; i++)
+ if (util_isclr(bitmap, i)) {
+ ERR("unreferenced lba: %d", i);
+ consistent = 0;
+ }
+
+ Free(bitmap);
+
+ return consistent;
+}
+
+/*
+ * btt_check -- perform a consistency check on a btt namespace
+ *
+ * This routine contains a fairly high-impact set of consistency checks.
+ * It may use a good amount of dynamic memory and CPU time performing
+ * the checks. Any lightweight, quick consistency checks are included
+ * in read_layout() so they happen every time the BTT area is opened
+ * for use.
+ *
+ * Returns 1 if consistent, 0 if inconsistent, and -1/errno if the check
+ * cannot be performed due to other errors.
+ *
+ * No lane number required here because only one thread is allowed -- all
+ * other threads must be locked out of all btt routines for this btt
+ * namespace while this is running.
+ */
+int
+btt_check(struct btt *bttp)
+{
+ LOG(3, "bttp %p", bttp);
+
+ int consistent = 1;
+
+ if (!bttp->laidout) {
+ /* consistent by definition */
+ LOG(3, "no layout yet");
+ return consistent;
+ }
+
+ /* XXX report issues found during read_layout (from flags) */
+
+ /* for each arena... */
+ struct arena *arenap = bttp->arenas;
+ for (unsigned i = 0; i < bttp->narena; i++, arenap++) {
+ /*
+ * Perform the consistency checks for the arena.
+ */
+ int retval = check_arena(bttp, arenap);
+ if (retval < 0)
+ return retval;
+ else if (retval == 0)
+ consistent = 0;
+ }
+
+ /* XXX stub */
+ return consistent;
+}
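+
+/*
+ * Usage sketch (editor's addition): run the check while no other
+ * thread can touch the btt, per the contract above.
+ */
+#if 0
+static int
+btt_check_example(struct btt *bttp)
+{
+ int ret = btt_check(bttp);
+ if (ret < 0)
+ return -1; /* check could not run; errno is set */
+ if (ret == 0)
+ return -1; /* inconsistencies found (logged via ERR) */
+ return 0;
+}
+#endif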
+
+/*
+ * btt_fini -- delete opaque btt info, done using btt namespace
+ */
+void
+btt_fini(struct btt *bttp)
+{
+ LOG(3, "bttp %p", bttp);
+
+ if (bttp->arenas) {
+ for (unsigned i = 0; i < bttp->narena; i++) {
+ if (bttp->arenas[i].flogs)
+ Free(bttp->arenas[i].flogs);
+ if (bttp->arenas[i].rtt)
+ Free((void *)bttp->arenas[i].rtt);
+ if (bttp->arenas[i].map_locks)
+ Free((void *)bttp->arenas[i].map_locks);
+ }
+ Free(bttp->arenas);
+ }
+ Free(bttp);
+}
diff --git a/src/pmdk/src/libpmemblk/btt.h b/src/pmdk/src/libpmemblk/btt.h
new file mode 100644
index 000000000..94f699427
--- /dev/null
+++ b/src/pmdk/src/libpmemblk/btt.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2014-2020, Intel Corporation */
+
+/*
+ * btt.h -- btt module definitions
+ */
+
+#ifndef BTT_H
+#define BTT_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* callback functions passed to btt_init() */
+struct ns_callback {
+ int (*nsread)(void *ns, unsigned lane,
+ void *buf, size_t count, uint64_t off);
+ int (*nswrite)(void *ns, unsigned lane,
+ const void *buf, size_t count, uint64_t off);
+ int (*nszero)(void *ns, unsigned lane, size_t count, uint64_t off);
+ ssize_t (*nsmap)(void *ns, unsigned lane, void **addrp,
+ size_t len, uint64_t off);
+ void (*nssync)(void *ns, unsigned lane, void *addr, size_t len);
+
+ int ns_is_zeroed;
+};
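+
+/*
+ * Illustration (editor's sketch, not part of the original header): a
+ * minimal nsread/nswrite pair over a plain memory buffer, showing the
+ * contract these callbacks are expected to honor (0 on success,
+ * negative with errno set on failure; "lane" may be ignored by
+ * implementations that keep no per-lane state).
+ */
+#if 0
+struct mem_ns { uint8_t *base; size_t len; };
+
+static int
+mem_nsread(void *ns, unsigned lane, void *buf, size_t count, uint64_t off)
+{
+ struct mem_ns *m = ns;
+ if (off + count > m->len) {
+ errno = EINVAL;
+ return -1;
+ }
+ memcpy(buf, m->base + off, count);
+ return 0;
+}
+
+static int
+mem_nswrite(void *ns, unsigned lane, const void *buf, size_t count,
+ uint64_t off)
+{
+ struct mem_ns *m = ns;
+ if (off + count > m->len) {
+ errno = EINVAL;
+ return -1;
+ }
+ memcpy(m->base + off, buf, count);
+ return 0;
+}
+#endif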
+
+struct btt_info;
+
+struct btt *btt_init(uint64_t rawsize, uint32_t lbasize, uint8_t parent_uuid[],
+ unsigned maxlane, void *ns, const struct ns_callback *ns_cbp);
+unsigned btt_nlane(struct btt *bttp);
+size_t btt_nlba(struct btt *bttp);
+int btt_read(struct btt *bttp, unsigned lane, uint64_t lba, void *buf);
+int btt_write(struct btt *bttp, unsigned lane, uint64_t lba, const void *buf);
+int btt_set_zero(struct btt *bttp, unsigned lane, uint64_t lba);
+int btt_set_error(struct btt *bttp, unsigned lane, uint64_t lba);
+int btt_check(struct btt *bttp);
+void btt_fini(struct btt *bttp);
+
+uint64_t btt_flog_size(uint32_t nfree);
+uint64_t btt_map_size(uint32_t external_nlba);
+uint64_t btt_arena_datasize(uint64_t arena_size, uint32_t nfree);
+int btt_info_set(struct btt_info *info, uint32_t external_lbasize,
+ uint32_t nfree, uint64_t arena_size, uint64_t space_left);
+
+struct btt_flog *btt_flog_get_valid(struct btt_flog *flog_pair, int *next);
+int map_entry_is_initial(uint32_t map_entry);
+void btt_info_convert2h(struct btt_info *infop);
+void btt_info_convert2le(struct btt_info *infop);
+void btt_flog_convert2h(struct btt_flog *flogp);
+void btt_flog_convert2le(struct btt_flog *flogp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/pmdk/src/libpmemblk/btt_layout.h b/src/pmdk/src/libpmemblk/btt_layout.h
new file mode 100644
index 000000000..8fa33f985
--- /dev/null
+++ b/src/pmdk/src/libpmemblk/btt_layout.h
@@ -0,0 +1,107 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Copyright 2014-2020, Intel Corporation */
+
+/*
+ * btt_layout.h -- block translation table on-media layout definitions
+ */
+
+/*
+ * Layout of BTT info block. All integers are stored little-endian.
+ */
+
+#ifndef BTT_LAYOUT_H
+#define BTT_LAYOUT_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BTT_ALIGNMENT ((uintptr_t)4096) /* alignment of all BTT structures */
+#define BTTINFO_SIG_LEN 16
+#define BTTINFO_UUID_LEN 16
+#define BTTINFO_UNUSED_LEN 3968
+#define BTTINFO_SIG "BTT_ARENA_INFO\0"
+
+struct btt_info {
+ char sig[BTTINFO_SIG_LEN]; /* must be "BTT_ARENA_INFO\0\0" */
+ uint8_t uuid[BTTINFO_UUID_LEN]; /* BTT UUID */
+ uint8_t parent_uuid[BTTINFO_UUID_LEN]; /* UUID of container */
+ uint32_t flags; /* see flag bits below */
+ uint16_t major; /* major version */
+ uint16_t minor; /* minor version */
+ uint32_t external_lbasize; /* advertised LBA size (bytes) */
+ uint32_t external_nlba; /* advertised LBAs in this arena */
+ uint32_t internal_lbasize; /* size of data area blocks (bytes) */
+ uint32_t internal_nlba; /* number of blocks in data area */
+ uint32_t nfree; /* number of free blocks */
+ uint32_t infosize; /* size of this info block */
+
+ /*
+ * The following offsets are relative to the beginning of
+ * the btt_info block.
+ */
+ uint64_t nextoff; /* offset to next arena (or zero) */
+ uint64_t dataoff; /* offset to arena data area */
+ uint64_t mapoff; /* offset to area map */
+ uint64_t flogoff; /* offset to area flog */
+ uint64_t infooff; /* offset to backup info block */
+
+ char unused[BTTINFO_UNUSED_LEN]; /* must be zero */
+
+ uint64_t checksum; /* Fletcher64 of all fields */
+};
+
+/*
+ * Definitions for flags mask for btt_info structure above.
+ */
+#define BTTINFO_FLAG_ERROR 0x00000001 /* error state (read-only) */
+#define BTTINFO_FLAG_ERROR_MASK 0x00000001 /* all error bits */
+
+/*
+ * Current on-media format versions.
+ */
+#define BTTINFO_MAJOR_VERSION 1
+#define BTTINFO_MINOR_VERSION 1
+
+/*
+ * Layout of a BTT "flog" entry. All integers are stored little-endian.
+ *
+ * The "nfree" field in the BTT info block determines how many of these
+ * flog entries there are, and each entry consists of two of the following
+ * structs (entry updates alternate between the two structs), padded up
+ * to a cache line boundary to isolate adjacent updates.
+ */
+
+#define BTT_FLOG_PAIR_ALIGN ((uintptr_t)64)
+
+struct btt_flog {
+ uint32_t lba; /* last pre-map LBA using this entry */
+ uint32_t old_map; /* old post-map LBA (the freed block) */
+ uint32_t new_map; /* new post-map LBA */
+ uint32_t seq; /* sequence number (01, 10, 11) */
+};
+
+/*
+ * Layout of a BTT "map" entry. 4-byte internal LBA offset, little-endian.
+ */
+#define BTT_MAP_ENTRY_SIZE 4
+#define BTT_MAP_ENTRY_ERROR 0x40000000U
+#define BTT_MAP_ENTRY_ZERO 0x80000000U
+#define BTT_MAP_ENTRY_NORMAL 0xC0000000U
+#define BTT_MAP_ENTRY_LBA_MASK 0x3fffffffU
+#define BTT_MAP_LOCK_ALIGN ((uintptr_t)64)
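+
+/*
+ * Editor's sketch: decoding a raw (little-endian) map entry with the
+ * masks above. The two top bits select the state: 00 initial, error
+ * bit only = error, zero bit only = zero, both set = normal.
+ */
+#if 0
+static uint32_t
+map_entry_decode_example(uint32_t raw_entry, int *errorp, int *zerop)
+{
+ uint32_t entry = le32toh(raw_entry);
+ uint32_t state = entry & BTT_MAP_ENTRY_NORMAL; /* flag bits only */
+
+ *errorp = (state == BTT_MAP_ENTRY_ERROR);
+ *zerop = (state == BTT_MAP_ENTRY_ZERO);
+ return entry & BTT_MAP_ENTRY_LBA_MASK; /* post-map LBA */
+}
+#endif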
+
+/*
+ * BTT layout properties...
+ */
+#define BTT_MIN_SIZE ((1u << 20) * 16)
+#define BTT_MAX_ARENA (1ull << 39) /* 512GB per arena */
+#define BTT_MIN_LBA_SIZE (size_t)512
+#define BTT_INTERNAL_LBA_ALIGNMENT 256U
+#define BTT_DEFAULT_NFREE 256
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/pmdk/src/libpmemblk/libpmemblk.c b/src/pmdk/src/libpmemblk/libpmemblk.c
new file mode 100644
index 000000000..21675eff7
--- /dev/null
+++ b/src/pmdk/src/libpmemblk/libpmemblk.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/* Copyright 2014-2018, Intel Corporation */
+
+/*
+ * libpmemblk.c -- pmem entry points for libpmemblk
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+
+#include "libpmemblk.h"
+#include "ctl_global.h"
+
+#include "pmemcommon.h"
+#include "blk.h"
+
+/*
+ * The environment variable from which the config is loaded directly.
+ * The string must not contain any comments or extraneous whitespace.
+ */
+#define BLK_CONFIG_ENV_VARIABLE "PMEMBLK_CONF"
+
+/*
+ * The variable that points to a config file from which the config is loaded.
+ */
+#define BLK_CONFIG_FILE_ENV_VARIABLE "PMEMBLK_CONF_FILE"
+
+/*
+ * blk_ctl_init_and_load -- (static) initializes CTL and loads configuration
+ * from env variable and file
+ */
+static int
+blk_ctl_init_and_load(PMEMblkpool *pbp)
+{
+ LOG(3, "pbp %p", pbp);
+
+ if (pbp != NULL && (pbp->ctl = ctl_new()) == NULL) {
+ LOG(2, "!ctl_new");
+ return -1;
+ }
+
+ char *env_config = os_getenv(BLK_CONFIG_ENV_VARIABLE);
+ if (env_config != NULL) {
+ if (ctl_load_config_from_string(pbp ? pbp->ctl : NULL,
+ pbp, env_config) != 0) {
+ LOG(2, "unable to parse config stored in %s "
+ "environment variable",
+ BLK_CONFIG_ENV_VARIABLE);
+ goto err;
+ }
+ }
+
+ char *env_config_file = os_getenv(BLK_CONFIG_FILE_ENV_VARIABLE);
+ if (env_config_file != NULL && env_config_file[0] != '\0') {
+ if (ctl_load_config_from_file(pbp ? pbp->ctl : NULL,
+ pbp, env_config_file) != 0) {
+ LOG(2, "unable to parse config stored in %s "
+ "file (from %s environment variable)",
+ env_config_file,
+ BLK_CONFIG_FILE_ENV_VARIABLE);
+ goto err;
+ }
+ }
+
+ return 0;
+err:
+ if (pbp)
+ ctl_delete(pbp->ctl);
+ return -1;
+}
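+
+/*
+ * Example (editor's note): both sources take semicolon-separated ctl
+ * queries. The entry-point names below are an assumption based on the
+ * prefault ctl module compiled into this library (ctl_prefault.c):
+ *
+ * PMEMBLK_CONF="prefault.at_create=1;prefault.at_open=1"
+ */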
+
+/*
+ * libpmemblk_init -- (internal) load-time initialization for blk
+ *
+ * Called automatically by the run-time loader.
+ */
+ATTR_CONSTRUCTOR
+void
+libpmemblk_init(void)
+{
+ ctl_global_register();
+
+ if (blk_ctl_init_and_load(NULL))
+ FATAL("error: %s", pmemblk_errormsg());
+
+ common_init(PMEMBLK_LOG_PREFIX, PMEMBLK_LOG_LEVEL_VAR,
+ PMEMBLK_LOG_FILE_VAR, PMEMBLK_MAJOR_VERSION,
+ PMEMBLK_MINOR_VERSION);
+ LOG(3, NULL);
+}
+
+/*
+ * libpmemblk_fini -- libpmemblk cleanup routine
+ *
+ * Called automatically when the process terminates.
+ */
+ATTR_DESTRUCTOR
+void
+libpmemblk_fini(void)
+{
+ LOG(3, NULL);
+ common_fini();
+}
+
+/*
+ * pmemblk_check_versionU -- see if lib meets application version requirements
+ */
+#ifndef _WIN32
+static inline
+#endif
+const char *
+pmemblk_check_versionU(unsigned major_required, unsigned minor_required)
+{
+ LOG(3, "major_required %u minor_required %u",
+ major_required, minor_required);
+
+ if (major_required != PMEMBLK_MAJOR_VERSION) {
+ ERR("libpmemblk major version mismatch (need %u, found %u)",
+ major_required, PMEMBLK_MAJOR_VERSION);
+ return out_get_errormsg();
+ }
+
+ if (minor_required > PMEMBLK_MINOR_VERSION) {
+ ERR("libpmemblk minor version mismatch (need %u, found %u)",
+ minor_required, PMEMBLK_MINOR_VERSION);
+ return out_get_errormsg();
+ }
+
+ return NULL;
+}
+
+#ifndef _WIN32
+/*
+ * pmemblk_check_version -- see if lib meets application version requirements
+ */
+const char *
+pmemblk_check_version(unsigned major_required, unsigned minor_required)
+{
+ return pmemblk_check_versionU(major_required, minor_required);
+}
+#else
+/*
+ * pmemblk_check_versionW -- see if lib meets application version requirements
+ */
+const wchar_t *
+pmemblk_check_versionW(unsigned major_required, unsigned minor_required)
+{
+ if (pmemblk_check_versionU(major_required, minor_required) != NULL)
+ return out_get_errormsgW();
+ else
+ return NULL;
+}
+#endif
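+
+/*
+ * Usage sketch (editor's addition): the conventional startup check an
+ * application performs against the library version it compiled with.
+ */
+#if 0
+static void
+check_version_example(void)
+{
+ if (pmemblk_check_version(PMEMBLK_MAJOR_VERSION,
+ PMEMBLK_MINOR_VERSION) != NULL) {
+ fprintf(stderr, "%s\n", pmemblk_errormsg());
+ exit(1);
+ }
+}
+#endif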
+
+/*
+ * pmemblk_set_funcs -- allow overriding libpmemblk's call to malloc, etc.
+ */
+void
+pmemblk_set_funcs(
+ void *(*malloc_func)(size_t size),
+ void (*free_func)(void *ptr),
+ void *(*realloc_func)(void *ptr, size_t size),
+ char *(*strdup_func)(const char *s))
+{
+ LOG(3, NULL);
+
+ util_set_alloc_funcs(malloc_func, free_func, realloc_func, strdup_func);
+}
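+
+/*
+ * Usage sketch (editor's addition): routing libpmemblk's heap usage
+ * through caller-supplied wrappers; the my_* names are hypothetical.
+ * This is intended to be called before any other libpmemblk function.
+ */
+#if 0
+pmemblk_set_funcs(my_malloc, my_free, my_realloc, my_strdup);
+#endif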
+
+/*
+ * pmemblk_errormsgU -- return last error message
+ */
+#ifndef _WIN32
+static inline
+#endif
+const char *
+pmemblk_errormsgU(void)
+{
+ return out_get_errormsg();
+}
+
+#ifndef _WIN32
+/*
+ * pmemblk_errormsg -- return last error message
+ */
+const char *
+pmemblk_errormsg(void)
+{
+ return pmemblk_errormsgU();
+}
+#else
+/*
+ * pmemblk_errormsgW -- return last error message as wchar_t
+ */
+const wchar_t *
+pmemblk_errormsgW(void)
+{
+ return out_get_errormsgW();
+}
+#endif
diff --git a/src/pmdk/src/libpmemblk/libpmemblk.def b/src/pmdk/src/libpmemblk/libpmemblk.def
new file mode 100644
index 000000000..fa7f91f14
--- /dev/null
+++ b/src/pmdk/src/libpmemblk/libpmemblk.def
@@ -0,0 +1,36 @@
+;;;; Begin Copyright Notice
+; SPDX-License-Identifier: BSD-3-Clause
+; Copyright 2015-2018, Intel Corporation
+;;;; End Copyright Notice
+
+LIBRARY libpmemblk
+
+VERSION 1.0
+
+EXPORTS
+ pmemblk_check_versionU
+ pmemblk_check_versionW
+ pmemblk_set_funcs
+ pmemblk_errormsgU
+ pmemblk_errormsgW
+ pmemblk_createU
+ pmemblk_createW
+ pmemblk_openU
+ pmemblk_openW
+ pmemblk_close
+ pmemblk_checkU
+ pmemblk_checkW
+ pmemblk_ctl_execU
+ pmemblk_ctl_execW
+ pmemblk_ctl_getU
+ pmemblk_ctl_getW
+ pmemblk_ctl_setU
+ pmemblk_ctl_setW
+ pmemblk_bsize
+ pmemblk_nblock
+ pmemblk_read
+ pmemblk_write
+ pmemblk_set_zero
+ pmemblk_set_error
+
+ DllMain
diff --git a/src/pmdk/src/libpmemblk/libpmemblk.link.in b/src/pmdk/src/libpmemblk/libpmemblk.link.in
new file mode 100644
index 000000000..b61e83ced
--- /dev/null
+++ b/src/pmdk/src/libpmemblk/libpmemblk.link.in
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2014-2019, Intel Corporation
+#
+#
+# src/libpmemblk.link -- linker link file for libpmemblk
+#
+LIBPMEMBLK_1.0 {
+ global:
+ pmemblk_check_version;
+ pmemblk_set_funcs;
+ pmemblk_errormsg;
+ pmemblk_create;
+ pmemblk_open;
+ pmemblk_close;
+ pmemblk_check;
+ pmemblk_ctl_exec;
+ pmemblk_ctl_get;
+ pmemblk_ctl_set;
+ pmemblk_nblock;
+ pmemblk_read;
+ pmemblk_write;
+ pmemblk_set_zero;
+ pmemblk_set_error;
+ pmemblk_bsize;
+ fault_injection;
+ local:
+ *;
+};
diff --git a/src/pmdk/src/libpmemblk/libpmemblk.rc b/src/pmdk/src/libpmemblk/libpmemblk.rc
new file mode 100644
index 000000000..b95b6252e
--- /dev/null
+++ b/src/pmdk/src/libpmemblk/libpmemblk.rc
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/* Copyright 2016, Intel Corporation */
+
+/*
+ * libpmemblk.rc -- libpmemblk resource file
+ */
+
+#include <windows.h>
+#define FILE_NAME "libpmemblk.dll"
+#define DESCRIPTION "libpmemblk - persistent memory resident array of blocks"
+#define TYPE VFT_DLL
+#include <common.rc>
\ No newline at end of file
diff --git a/src/pmdk/src/libpmemblk/libpmemblk.vcxproj b/src/pmdk/src/libpmemblk/libpmemblk.vcxproj
new file mode 100644
index 000000000..680052a3f
--- /dev/null
+++ b/src/pmdk/src/libpmemblk/libpmemblk.vcxproj
@@ -0,0 +1,133 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="..\..\src\libpmemblk\blk.c" />
+ <ClCompile Include="..\..\src\libpmemblk\btt.c" />
+ <ClCompile Include="..\..\src\libpmemblk\libpmemblk.c" />
+ <ClCompile Include="..\core\alloc.c" />
+ <ClCompile Include="..\common\set_badblocks.c" />
+ <ClCompile Include="..\common\ctl.c" />
+ <ClCompile Include="..\common\ctl_cow.c" />
+ <ClCompile Include="..\common\ctl_prefault.c" />
+ <ClCompile Include="..\common\ctl_sds.c" />
+ <ClCompile Include="..\common\ctl_fallocate.c" />
+ <ClCompile Include="..\common\file.c" />
+ <ClCompile Include="..\common\file_windows.c" />
+ <ClCompile Include="..\common\mmap.c" />
+ <ClCompile Include="..\common\mmap_windows.c" />
+ <ClCompile Include="..\core\fs_windows.c" />
+ <ClCompile Include="..\common\bad_blocks.c" />
+ <ClCompile Include="..\common\os_deep_windows.c" />
+ <ClCompile Include="..\core\os_thread_windows.c" />
+ <ClCompile Include="..\core\os_windows.c" />
+ <ClCompile Include="..\core\out.c" />
+ <ClCompile Include="..\common\pool_hdr.c" />
+ <ClCompile Include="..\common\set.c" />
+ <ClCompile Include="..\common\shutdown_state.c" />
+ <ClCompile Include="..\core\util.c" />
+ <ClCompile Include="..\core\util_windows.c" />
+ <ClCompile Include="..\common\uuid.c" />
+ <ClCompile Include="..\common\uuid_windows.c" />
+ <ClCompile Include="..\libpmem2\auto_flush_windows.c" />
+ <ClCompile Include="..\libpmem2\usc_windows.c" />
+ <ClCompile Include="libpmemblk_main.c" />
+ <ClCompile Include="..\libpmem2\config.c" />
+ <ClCompile Include="..\libpmem2\badblocks.c" />
+ <ClCompile Include="..\libpmem2\badblocks_none.c" />
+ <ClCompile Include="..\libpmem2\source.c" />
+ <ClCompile Include="..\libpmem2\source_windows.c" />
+ <ClCompile Include="..\libpmem2\pmem2_utils.c" />
+ <ClCompile Include="..\libpmem2\pmem2_utils_other.c" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\..\src\core\out.h" />
+ <ClInclude Include="..\..\src\core\util.h" />
+ <ClInclude Include="..\..\src\common\valgrind_internal.h" />
+ <ClInclude Include="..\..\src\include\libpmemblk.h" />
+ <ClInclude Include="..\..\src\libpmemblk\blk.h" />
+ <ClInclude Include="..\..\src\libpmemblk\btt.h" />
+ <ClInclude Include="..\..\src\libpmemblk\btt_layout.h" />
+ <ClInclude Include="..\core\alloc.h" />
+ <ClInclude Include="..\common\ctl.h" />
+ <ClInclude Include="..\common\ctl_global.h" />
+ <ClInclude Include="..\common\dlsym.h" />
+ <ClInclude Include="..\core\fault_injection.h" />
+ <ClInclude Include="..\common\file.h" />
+ <ClInclude Include="..\core\fs.h" />
+ <ClInclude Include="..\common\mmap.h" />
+ <ClInclude Include="..\core\os.h" />
+ <ClInclude Include="..\common\os_deep.h" />
+ <ClInclude Include="..\core\os_thread.h" />
+ <ClInclude Include="..\common\pmemcommon.h" />
+ <ClInclude Include="..\common\pool_hdr.h" />
+ <ClInclude Include="..\common\set.h" />
+ <ClInclude Include="..\common\sys_util.h" />
+ <ClInclude Include="..\common\uuid.h" />
+ <ClInclude Include="..\libpmem2\auto_flush.h" />
+ <ClInclude Include="..\libpmem2\auto_flush_windows.h" />
+ <ClInclude Include="..\libpmem2\config.h" />
+ <ClInclude Include="..\libpmem2\pmem2_utils.h" />
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="libpmemblk.def" />
+ </ItemGroup>
+ <ItemGroup>
+ <ProjectReference Include="..\libpmem\libpmem.vcxproj">
+ <Project>{9e9e3d25-2139-4a5d-9200-18148ddead45}</Project>
+ </ProjectReference>
+ <ProjectReference Include="..\windows\srcversion\srcversion.vcxproj">
+ <Project>{901f04db-e1a5-4a41-8b81-9d31c19acd59}</Project>
+ </ProjectReference>
+ </ItemGroup>
+ <ItemGroup>
+ <ResourceCompile Include="libpmemblk.rc" />
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{f7c6c6b6-4142-4c82-8699-4a9d8183181b}</ProjectGuid>
+ <Keyword>DynamicLibrary</Keyword>
+ <ProjectName>libpmemblk</ProjectName>
+ <RootNamespace>libpmemblk</RootNamespace>
+ <DefaultLanguage>en-US</DefaultLanguage>
+ <MinimumVisualStudioVersion>14.0</MinimumVisualStudioVersion>
+ <WindowsTargetPlatformVersion>10.0.17134.0</WindowsTargetPlatformVersion>
+ <WindowsTargetPlatformMinVersion>10.0.10240.0</WindowsTargetPlatformMinVersion>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>DynamicLibrary</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <PlatformToolset>v140</PlatformToolset>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>DynamicLibrary</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <WholeProgramOptimization>false</WholeProgramOptimization>
+ <PlatformToolset>v140</PlatformToolset>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ <Import Project="..\windows\libs_debug.props" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ <Import Project="..\windows\libs_release.props" />
+ </ImportGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/src/pmdk/src/libpmemblk/libpmemblk.vcxproj.filters b/src/pmdk/src/libpmemblk/libpmemblk.vcxproj.filters
new file mode 100644
index 000000000..198595944
--- /dev/null
+++ b/src/pmdk/src/libpmemblk/libpmemblk.vcxproj.filters
@@ -0,0 +1,217 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="Header Files">
+ <UniqueIdentifier>{5f4b56cf-a674-4f35-abfa-d867d9d91f68}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="Source Files">
+ <UniqueIdentifier>{dee0ff57-9af8-485a-888b-0087d6e11cf8}</UniqueIdentifier>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="..\..\src\libpmemblk\blk.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\libpmemblk\btt.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\libpmemblk\libpmemblk.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="libpmemblk_main.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\common\uuid_windows.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\common\uuid.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\core\util_windows.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\core\util.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\common\set.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\common\pool_hdr.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\core\out.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\common\mmap_windows.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\core\fs_windows.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\common\mmap.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\common\file_windows.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\common\file.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\core\os_windows.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\core\os_thread_windows.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\common\shutdown_state.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\common\os_deep_windows.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\common\set_badblocks.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\common\bad_blocks.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\common\ctl.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\common\ctl_prefault.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\common\ctl_fallocate.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\common\ctl_sds.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\core\alloc.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\common\ctl_cow.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\libpmem2\auto_flush_windows.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\libpmem2\config.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\libpmem2\badblocks.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\libpmem2\badblocks_none.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\libpmem2\source.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\libpmem2\source_windows.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\libpmem2\pmem2_utils.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\libpmem2\pmem2_utils_other.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="..\libpmem2\usc_windows.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\..\src\include\libpmemblk.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\libpmemblk\blk.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\libpmemblk\btt.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\libpmemblk\btt_layout.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\core\out.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\core\util.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\common\valgrind_internal.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\common\uuid.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\common\sys_util.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\common\set.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\common\pool_hdr.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\common\pmemcommon.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\common\mmap.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\common\file.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\common\dlsym.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\core\fs.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\core\os.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\core\os_thread.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\common\os_deep.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\common\ctl_global.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\common\ctl.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\core\alloc.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\core\fault_injection.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\libpmem2\auto_flush.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\libpmem2\auto_flush_windows.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\libpmem2\config.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ <ClInclude Include="..\libpmem2\pmem2_utils.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="libpmemblk.def">
+ <Filter>Source Files</Filter>
+ </None>
+ </ItemGroup>
+ <ItemGroup>
+ <ResourceCompile Include="libpmemblk.rc">
+ <Filter>Source Files</Filter>
+ </ResourceCompile>
+ </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/src/pmdk/src/libpmemblk/libpmemblk_main.c b/src/pmdk/src/libpmemblk/libpmemblk_main.c
new file mode 100644
index 000000000..28c547d2e
--- /dev/null
+++ b/src/pmdk/src/libpmemblk/libpmemblk_main.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/* Copyright 2015-2017, Intel Corporation */
+
+/*
+ * libpmemblk_main.c -- entry point for libpmemblk.dll
+ *
+ * XXX - This is a placeholder. All the library initialization/cleanup
+ * that is done in library ctors/dtors, as well as TLS initialization
+ * should be moved here.
+ */
+
+void libpmemblk_init(void);
+void libpmemblk_fini(void);
+
+int APIENTRY
+DllMain(HINSTANCE hInstance, DWORD dwReason, LPVOID lpReserved)
+{
+ switch (dwReason) {
+ case DLL_PROCESS_ATTACH:
+ libpmemblk_init();
+ break;
+
+ case DLL_THREAD_ATTACH:
+ case DLL_THREAD_DETACH:
+ break;
+
+ case DLL_PROCESS_DETACH:
+ libpmemblk_fini();
+ break;
+ }
+ return TRUE;
+}