diff options
Diffstat (limited to 'src/pmdk/src/libpmemblk/btt.c')
-rw-r--r-- | src/pmdk/src/libpmemblk/btt.c | 2051 |
1 file changed, 2051 insertions, 0 deletions
diff --git a/src/pmdk/src/libpmemblk/btt.c b/src/pmdk/src/libpmemblk/btt.c new file mode 100644 index 000000000..f59389f21 --- /dev/null +++ b/src/pmdk/src/libpmemblk/btt.c @@ -0,0 +1,2051 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* Copyright 2014-2019, Intel Corporation */ + +/* + * btt.c -- block translation table providing atomic block updates + * + * This is a user-space implementation of the BTT mechanism providing + * single block powerfail write atomicity, as described by: + * The NVDIMM Namespace Specification + * + * To use this module, the caller must provide five routines for + * accessing the namespace containing the data (in this context, + * "namespace" refers to the storage containing the BTT layout, such + * as a file). All namespace I/O is done by these callbacks: + * + * nsread Read count bytes from namespace at offset off + * nswrite Write count bytes to namespace at offset off + * nszero Zero count bytes in namespace at offset off + * nsmap Return direct access to a range of a namespace + * nssync Flush changes made to an nsmap'd range + * + * Data written by the nswrite callback is flushed out to the media + * (made durable) when the call returns. Data written directly via + * the nsmap callback must be flushed explicitly using nssync. + * + * The caller passes these callbacks, along with information such as + * namespace size and UUID to btt_init() and gets back an opaque handle + * which is then used with the rest of the entry points. 
+ * + * Here is a brief list of the entry points to this module: + * + * btt_nlane Returns number of concurrent threads allowed + * + * btt_nlba Returns the usable size, as a count of LBAs + * + * btt_read Reads a single block at a given LBA + * + * btt_write Writes a single block (atomically) at a given LBA + * + * btt_set_zero Sets a block to read back as zeros + * + * btt_set_error Sets a block to return error on read + * + * btt_check Checks the BTT metadata for consistency + * + * btt_fini Frees run-time state, done using namespace + * + * If the caller is multi-threaded, it must only allow btt_nlane() threads + * to enter this module at a time, each assigned a unique "lane" number + * between 0 and btt_nlane() - 1. + * + * There are a number of static routines defined in this module. Here's + * a brief overview of the most important routines: + * + * read_layout Checks for valid BTT layout and builds run-time state. + * A number of helper functions are used by read_layout + * to handle various parts of the metadata: + * read_info + * read_arenas + * read_arena + * read_flogs + * read_flog_pair + * + * write_layout Generates a new BTT layout when one doesn't exist. + * Once a new layout is written, write_layout uses + * the same helper functions above to construct the + * run-time state. + * + * invalid_lba Range check done by each entry point that takes + * an LBA. + * + * lba_to_arena_lba + * Find the arena and LBA in that arena for a given + * external LBA. This is the heart of the arena + * range matching logic. + * + * flog_update Update the BTT free list/log combined data structure + * (known as the "flog"). This is the heart of the + * logic that makes writes powerfail atomic. + * + * map_lock These routines provide atomic access to the BTT map + * map_unlock data structure in an area. + * map_abort + * + * map_entry_setf Common code for btt_set_zero() and btt_set_error(). 
+ * + * zero_block Generate a block of all zeros (instead of actually + * doing a read), when the metadata indicates the + * block should read as zeros. + * + * build_rtt These routines construct the run-time tracking + * build_map_locks data structures used during I/O. + */ + +#include <inttypes.h> +#include <stdio.h> +#include <sys/param.h> +#include <unistd.h> +#include <errno.h> +#include <string.h> +#include <stdint.h> +#include <endian.h> + +#include "out.h" +#include "uuid.h" +#include "btt.h" +#include "btt_layout.h" +#include "sys_util.h" +#include "util.h" +#include "alloc.h" + +/* + * The opaque btt handle containing state tracked by this module + * for the btt namespace. This is created by btt_init(), handed to + * all the other btt_* entry points, and deleted by btt_fini(). + */ +struct btt { + unsigned nlane; /* number of concurrent threads allowed per btt */ + + /* + * The laidout flag indicates whether the namespace contains valid BTT + * metadata. It is initialized by read_layout() and if no valid layout + * is found, all reads return zeros and the first write will write the + * BTT layout. The layout_write_mutex protects the laidout flag so + * only one write threads ends up writing the initial metadata by + * calling write_layout(). + */ + os_mutex_t layout_write_mutex; + int laidout; + + /* + * UUID of the BTT + */ + uint8_t uuid[BTTINFO_UUID_LEN]; + + /* + * UUID of the containing namespace, used to validate BTT metadata. + */ + uint8_t parent_uuid[BTTINFO_UUID_LEN]; + + /* + * Parameters controlling/describing the BTT layout. 
+ */ + uint64_t rawsize; /* size of containing namespace */ + uint32_t lbasize; /* external LBA size */ + uint32_t nfree; /* available flog entries */ + uint64_t nlba; /* total number of external LBAs */ + unsigned narena; /* number of arenas */ + + /* run-time state kept for each arena */ + struct arena { + uint32_t flags; /* arena flags (btt_info) */ + uint32_t external_nlba; /* LBAs that live in this arena */ + uint32_t internal_lbasize; + uint32_t internal_nlba; + + /* + * The following offsets are relative to the beginning of + * the encapsulating namespace. This is different from + * how these offsets are stored on-media, where they are + * relative to the start of the arena. The offset are + * converted by read_layout() to make them more convenient + * for run-time use. + */ + uint64_t startoff; /* offset to start of arena */ + uint64_t dataoff; /* offset to arena data area */ + uint64_t mapoff; /* offset to area map */ + uint64_t flogoff; /* offset to area flog */ + uint64_t nextoff; /* offset to next arena */ + + /* + * Run-time flog state. Indexed by lane. + * + * The write path uses the flog to find the free block + * it writes to before atomically making it the new + * active block for an external LBA. + * + * The read path doesn't use the flog at all. + */ + struct flog_runtime { + struct btt_flog flog; /* current info */ + uint64_t entries[2]; /* offsets for flog pair */ + int next; /* next write (0 or 1) */ + } *flogs; + + /* + * Read tracking table. Indexed by lane. + * + * Before using a free block found in the flog, the write path + * scans the rtt to see if there are any outstanding reads on + * that block (reads that started before the block was freed by + * a concurrent write). Unused slots in the rtt are indicated + * by setting the error bit, BTT_MAP_ENTRY_ERROR, so that the + * entry won't match any post-map LBA when checked. + */ + uint32_t volatile *rtt; + + /* + * Map locking. Indexed by pre-map LBA modulo nlane. 
+ */ + os_mutex_t *map_locks; + + /* + * Arena info block locking. + */ + os_mutex_t info_lock; + } *arenas; + + /* + * Callbacks for doing I/O to namespace. These are provided by + * the code calling the BTT module, which passes them in to + * btt_init(). All namespace I/O is done using these. + * + * The opaque namespace handle "ns" was provided by the code calling + * the BTT module and is passed to each callback to identify the + * namespace being accessed. + */ + void *ns; + const struct ns_callback *ns_cbp; +}; + +/* + * Signature for arena info blocks. Total size is 16 bytes, including + * the '\0' added to the string by the declaration (the last two bytes + * of the string are '\0'). + */ +static const char Sig[] = BTTINFO_SIG; + +/* + * Zeroed out flog entry, used when initializing the flog. + */ +static const struct btt_flog Zflog; + +/* + * Lookup table and macro for looking up sequence numbers. These are + * the 2-bit numbers that cycle between 01, 10, and 11. + * + * To advance a sequence number to the next number, use something like: + * seq = NSEQ(seq); + */ +static const unsigned Nseq[] = { 0, 2, 3, 1 }; +#define NSEQ(seq) (Nseq[(seq) & 3]) + +/* + * get_map_lock_num -- (internal) Calculate offset into map_locks[] + * + * map_locks[] contains nfree locks which are used to protect the map + * from concurrent access to the same cache line. The index into + * map_locks[] is calculated by looking at the byte offset into the map + * (premap_lba * BTT_MAP_ENTRY_SIZE), figuring out how many cache lines + * that is into the map that is (dividing by BTT_MAP_LOCK_ALIGN), and + * then selecting one of nfree locks (the modulo at the end). + * + * The extra cast is to keep gcc from generating a false positive + * 64-32 bit conversion error when -fsanitize is set. 
+ */ +static inline uint32_t +get_map_lock_num(uint32_t premap_lba, uint32_t nfree) +{ + return (uint32_t)(premap_lba * BTT_MAP_ENTRY_SIZE / BTT_MAP_LOCK_ALIGN) + % nfree; +} + +/* + * invalid_lba -- (internal) set errno and return true if lba is invalid + * + * This function is used at the top of the entry points where an external + * LBA is provided, like this: + * + * if (invalid_lba(bttp, lba)) + * return -1; + */ +static int +invalid_lba(struct btt *bttp, uint64_t lba) +{ + LOG(3, "bttp %p lba %" PRIu64, bttp, lba); + + if (lba >= bttp->nlba) { + ERR("lba out of range (nlba %" PRIu64 ")", bttp->nlba); + errno = EINVAL; + return 1; + } + + return 0; +} + +/* + * read_info -- (internal) convert btt_info to host byte order & validate + * + * Returns true if info block is valid, and all the integer fields are + * converted to host byte order. If the info block is not valid, this + * routine returns false and the info block passed in is left in an + * unknown state. + */ +static int +read_info(struct btt *bttp, struct btt_info *infop) +{ + LOG(3, "infop %p", infop); + + if (memcmp(infop->sig, Sig, BTTINFO_SIG_LEN)) { + LOG(3, "signature invalid"); + return 0; + } + + if (memcmp(infop->parent_uuid, bttp->parent_uuid, BTTINFO_UUID_LEN)) { + LOG(3, "parent UUID mismatch"); + return 0; + } + + /* to be valid, the fields must checksum correctly */ + if (!util_checksum(infop, sizeof(*infop), &infop->checksum, 0, 0)) { + LOG(3, "invalid checksum"); + return 0; + } + + /* to be valid, info block must have a major version of at least 1 */ + if ((infop->major = le16toh(infop->major)) == 0) { + LOG(3, "invalid major version (0)"); + return 0; + } + + infop->flags = le32toh(infop->flags); + infop->minor = le16toh(infop->minor); + infop->external_lbasize = le32toh(infop->external_lbasize); + infop->external_nlba = le32toh(infop->external_nlba); + infop->internal_lbasize = le32toh(infop->internal_lbasize); + infop->internal_nlba = le32toh(infop->internal_nlba); + infop->nfree = 
le32toh(infop->nfree); + infop->infosize = le32toh(infop->infosize); + infop->nextoff = le64toh(infop->nextoff); + infop->dataoff = le64toh(infop->dataoff); + infop->mapoff = le64toh(infop->mapoff); + infop->flogoff = le64toh(infop->flogoff); + infop->infooff = le64toh(infop->infooff); + + return 1; +} + +/* + * map_entry_is_zero -- (internal) checks if map_entry is in zero state + */ +static inline int +map_entry_is_zero(uint32_t map_entry) +{ + return (map_entry & ~BTT_MAP_ENTRY_LBA_MASK) == BTT_MAP_ENTRY_ZERO; +} + +/* + * map_entry_is_error -- (internal) checks if map_entry is in error state + */ +static inline int +map_entry_is_error(uint32_t map_entry) +{ + return (map_entry & ~BTT_MAP_ENTRY_LBA_MASK) == BTT_MAP_ENTRY_ERROR; +} + +/* + * map_entry_is_initial -- checks if map_entry is in initial state + */ +int +map_entry_is_initial(uint32_t map_entry) +{ + return (map_entry & ~BTT_MAP_ENTRY_LBA_MASK) == 0; +} + +/* + * map_entry_is_zero_or_initial -- (internal) checks if map_entry is in initial + * or zero state + */ +static inline int +map_entry_is_zero_or_initial(uint32_t map_entry) +{ + uint32_t entry_flags = map_entry & ~BTT_MAP_ENTRY_LBA_MASK; + return entry_flags == 0 || entry_flags == BTT_MAP_ENTRY_ZERO; +} + +/* + * btt_flog_get_valid -- return valid and current flog entry + */ +struct btt_flog * +btt_flog_get_valid(struct btt_flog *flog_pair, int *next) +{ + /* + * Interesting cases: + * - no valid seq numbers: layout consistency error + * - one valid seq number: that's the current entry + * - two valid seq numbers: higher number is current entry + * - identical seq numbers: layout consistency error + */ + if (flog_pair[0].seq == flog_pair[1].seq) { + return NULL; + } else if (flog_pair[0].seq == 0) { + /* singleton valid flog at flog_pair[1] */ + *next = 0; + return &flog_pair[1]; + } else if (flog_pair[1].seq == 0) { + /* singleton valid flog at flog_pair[0] */ + *next = 1; + return &flog_pair[0]; + } else if (NSEQ(flog_pair[0].seq) == 
flog_pair[1].seq) { + /* flog_pair[1] has the later sequence number */ + *next = 0; + return &flog_pair[1]; + } else { + /* flog_pair[0] has the later sequence number */ + *next = 1; + return &flog_pair[0]; + } +} + +/* + * read_flog_pair -- (internal) load up a single flog pair + * + * Zero is returned on success, otherwise -1/errno. + */ +static int +read_flog_pair(struct btt *bttp, unsigned lane, struct arena *arenap, + uint64_t flog_off, struct flog_runtime *flog_runtimep, uint32_t flognum) +{ + LOG(5, "bttp %p lane %u arenap %p flog_off %" PRIu64 " runtimep %p " + "flognum %u", bttp, lane, arenap, flog_off, flog_runtimep, + flognum); + + flog_runtimep->entries[0] = flog_off; + flog_runtimep->entries[1] = flog_off + sizeof(struct btt_flog); + + if (lane >= bttp->nfree) { + ERR("invalid lane %u among nfree %d", lane, bttp->nfree); + errno = EINVAL; + return -1; + } + + if (flog_off == 0) { + ERR("invalid flog offset %" PRIu64, flog_off); + errno = EINVAL; + return -1; + } + + struct btt_flog flog_pair[2]; + if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, flog_pair, + sizeof(flog_pair), flog_off) < 0) + return -1; + + btt_flog_convert2h(&flog_pair[0]); + if (invalid_lba(bttp, flog_pair[0].lba)) + return -1; + + btt_flog_convert2h(&flog_pair[1]); + if (invalid_lba(bttp, flog_pair[1].lba)) + return -1; + + LOG(6, "flog_pair[0] flog_off %" PRIu64 " old_map %u new_map %u seq %u", + flog_off, flog_pair[0].old_map, + flog_pair[0].new_map, flog_pair[0].seq); + LOG(6, "flog_pair[1] old_map %u new_map %u seq %u", + flog_pair[1].old_map, flog_pair[1].new_map, + flog_pair[1].seq); + + struct btt_flog *currentp = btt_flog_get_valid(flog_pair, + &flog_runtimep->next); + + if (currentp == NULL) { + ERR("flog layout error: bad seq numbers %d %d", + flog_pair[0].seq, flog_pair[1].seq); + arenap->flags |= BTTINFO_FLAG_ERROR; + return 0; + } + + LOG(6, "run-time flog next is %d", flog_runtimep->next); + + /* copy current flog into run-time flog state */ + flog_runtimep->flog = 
*currentp; + + LOG(9, "read flog[%u]: lba %u old %u%s%s%s new %u%s%s%s", flognum, + currentp->lba, + currentp->old_map & BTT_MAP_ENTRY_LBA_MASK, + (map_entry_is_error(currentp->old_map)) ? " ERROR" : "", + (map_entry_is_zero(currentp->old_map)) ? " ZERO" : "", + (map_entry_is_initial(currentp->old_map)) ? " INIT" : "", + currentp->new_map & BTT_MAP_ENTRY_LBA_MASK, + (map_entry_is_error(currentp->new_map)) ? " ERROR" : "", + (map_entry_is_zero(currentp->new_map)) ? " ZERO" : "", + (map_entry_is_initial(currentp->new_map)) ? " INIT" : ""); + + /* + * Decide if the current flog info represents a completed + * operation or an incomplete operation. If completed, the + * old_map field will contain the free block to be used for + * the next write. But if the operation didn't complete (indicated + * by the map entry not being updated), then the operation is + * completed now by updating the map entry. + * + * A special case, used by flog entries when first created, is + * when old_map == new_map. This counts as a complete entry + * and doesn't require reading the map to see if recovery is + * required. + */ + if (currentp->old_map == currentp->new_map) { + LOG(9, "flog[%u] entry complete (initial state)", flognum); + return 0; + } + + /* convert pre-map LBA into an offset into the map */ + uint64_t map_entry_off = arenap->mapoff + + BTT_MAP_ENTRY_SIZE * currentp->lba; + + /* read current map entry */ + uint32_t entry; + if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &entry, + sizeof(entry), map_entry_off) < 0) + return -1; + + entry = le32toh(entry); + + /* map entry in initial state */ + if (map_entry_is_initial(entry)) + entry = currentp->lba | BTT_MAP_ENTRY_NORMAL; + + if (currentp->new_map != entry && currentp->old_map == entry) { + /* last update didn't complete */ + LOG(9, "recover flog[%u]: map[%u]: %u", + flognum, currentp->lba, currentp->new_map); + + /* + * Recovery step is to complete the transaction by + * updating the map entry. 
+ */ + entry = htole32(currentp->new_map); + if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &entry, + sizeof(uint32_t), map_entry_off) < 0) + return -1; + } + + return 0; +} + +/* + * flog_update -- (internal) write out an updated flog entry + * + * The flog entries are not checksummed. Instead, increasing sequence + * numbers are used to atomically switch the active flog entry between + * the first and second struct btt_flog in each slot. In order for this + * to work, the sequence number must be updated only after all the other + * fields in the flog are updated. So the writes to the flog are broken + * into two writes, one for the first three fields (lba, old_map, new_map) + * and, only after those fields are known to be written durably, the + * second write for the seq field is done. + * + * Returns 0 on success, otherwise -1/errno. + */ +static int +flog_update(struct btt *bttp, unsigned lane, struct arena *arenap, + uint32_t lba, uint32_t old_map, uint32_t new_map) +{ + LOG(3, "bttp %p lane %u arenap %p lba %u old_map %u new_map %u", + bttp, lane, arenap, lba, old_map, new_map); + + /* construct new flog entry in little-endian byte order */ + struct btt_flog new_flog; + new_flog.lba = lba; + new_flog.old_map = old_map; + new_flog.new_map = new_map; + new_flog.seq = NSEQ(arenap->flogs[lane].flog.seq); + btt_flog_convert2le(&new_flog); + + uint64_t new_flog_off = + arenap->flogs[lane].entries[arenap->flogs[lane].next]; + + /* write out first two fields first */ + if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &new_flog, + sizeof(uint32_t) * 2, new_flog_off) < 0) + return -1; + new_flog_off += sizeof(uint32_t) * 2; + + /* write out new_map and seq field to make it active */ + if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &new_flog.new_map, + sizeof(uint32_t) * 2, new_flog_off) < 0) + return -1; + + /* flog entry written successfully, update run-time state */ + arenap->flogs[lane].next = 1 - arenap->flogs[lane].next; + arenap->flogs[lane].flog.lba = lba; + 
arenap->flogs[lane].flog.old_map = old_map; + arenap->flogs[lane].flog.new_map = new_map; + arenap->flogs[lane].flog.seq = NSEQ(arenap->flogs[lane].flog.seq); + + LOG(9, "update flog[%u]: lba %u old %u%s%s%s new %u%s%s%s", lane, lba, + old_map & BTT_MAP_ENTRY_LBA_MASK, + (map_entry_is_error(old_map)) ? " ERROR" : "", + (map_entry_is_zero(old_map)) ? " ZERO" : "", + (map_entry_is_initial(old_map)) ? " INIT" : "", + new_map & BTT_MAP_ENTRY_LBA_MASK, + (map_entry_is_error(new_map)) ? " ERROR" : "", + (map_entry_is_zero(new_map)) ? " ZERO" : "", + (map_entry_is_initial(new_map)) ? " INIT" : ""); + + return 0; +} + +/* + * arena_setf -- (internal) updates the given flag for the arena info block + */ +static int +arena_setf(struct btt *bttp, struct arena *arenap, unsigned lane, uint32_t setf) +{ + LOG(3, "bttp %p arenap %p lane %u setf 0x%x", bttp, arenap, lane, setf); + + /* update runtime state */ + util_fetch_and_or32(&arenap->flags, setf); + + if (!bttp->laidout) { + /* no layout yet to update */ + return 0; + } + + /* + * Read, modify and write out the info block + * at both the beginning and end of the arena. 
+ */ + uint64_t arena_off = arenap->startoff; + + struct btt_info info; + + /* protect from simultaneous writes to the layout */ + util_mutex_lock(&arenap->info_lock); + + if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &info, + sizeof(info), arena_off) < 0) { + goto err; + } + + uint64_t infooff = le64toh(info.infooff); + + /* update flags */ + info.flags |= htole32(setf); + + /* update checksum */ + util_checksum(&info, sizeof(info), &info.checksum, 1, 0); + + if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info, + sizeof(info), arena_off) < 0) { + goto err; + } + + if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info, + sizeof(info), arena_off + infooff) < 0) { + goto err; + } + + util_mutex_unlock(&arenap->info_lock); + return 0; + +err: + util_mutex_unlock(&arenap->info_lock); + return -1; +} + +/* + * set_arena_error -- (internal) set the error flag for the given arena + */ +static int +set_arena_error(struct btt *bttp, struct arena *arenap, unsigned lane) +{ + LOG(3, "bttp %p arena %p lane %u", bttp, arenap, lane); + + return arena_setf(bttp, arenap, lane, BTTINFO_FLAG_ERROR); +} + +/* + * read_flogs -- (internal) load up all the flog entries for an arena + * + * Zero is returned on success, otherwise -1/errno. + */ +static int +read_flogs(struct btt *bttp, unsigned lane, struct arena *arenap) +{ + if ((arenap->flogs = Zalloc(bttp->nfree * + sizeof(struct flog_runtime))) == NULL) { + ERR("!Malloc for %u flog entries", bttp->nfree); + return -1; + } + + /* + * Load up the flog state. read_flog_pair() will determine if + * any recovery steps are required take them on the in-memory + * data structures it creates. Sets error flag when it + * determines an invalid state. 
+ */ + uint64_t flog_off = arenap->flogoff; + struct flog_runtime *flog_runtimep = arenap->flogs; + for (uint32_t i = 0; i < bttp->nfree; i++) { + if (read_flog_pair(bttp, lane, arenap, flog_off, + flog_runtimep, i) < 0) { + set_arena_error(bttp, arenap, lane); + return -1; + } + + /* prepare for next time around the loop */ + flog_off += roundup(2 * sizeof(struct btt_flog), + BTT_FLOG_PAIR_ALIGN); + flog_runtimep++; + } + + return 0; +} + +/* + * build_rtt -- (internal) construct a read tracking table for an arena + * + * Zero is returned on success, otherwise -1/errno. + * + * The rtt is big enough to hold an entry for each free block (nfree) + * since nlane can't be bigger than nfree. nlane may end up smaller, + * in which case some of the high rtt entries will be unused. + */ +static int +build_rtt(struct btt *bttp, struct arena *arenap) +{ + if ((arenap->rtt = Malloc(bttp->nfree * sizeof(uint32_t))) + == NULL) { + ERR("!Malloc for %d rtt entries", bttp->nfree); + return -1; + } + for (uint32_t lane = 0; lane < bttp->nfree; lane++) + arenap->rtt[lane] = BTT_MAP_ENTRY_ERROR; + util_synchronize(); + + return 0; +} + +/* + * build_map_locks -- (internal) construct map locks + * + * Zero is returned on success, otherwise -1/errno. + */ +static int +build_map_locks(struct btt *bttp, struct arena *arenap) +{ + if ((arenap->map_locks = + Malloc(bttp->nfree * sizeof(*arenap->map_locks))) + == NULL) { + ERR("!Malloc for %d map_lock entries", bttp->nfree); + return -1; + } + for (uint32_t lane = 0; lane < bttp->nfree; lane++) + util_mutex_init(&arenap->map_locks[lane]); + + return 0; +} + +/* + * read_arena -- (internal) load up an arena and build run-time state + * + * Zero is returned on success, otherwise -1/errno. 
+ */ +static int +read_arena(struct btt *bttp, unsigned lane, uint64_t arena_off, + struct arena *arenap) +{ + LOG(3, "bttp %p lane %u arena_off %" PRIu64 " arenap %p", + bttp, lane, arena_off, arenap); + + struct btt_info info; + if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &info, sizeof(info), + arena_off) < 0) + return -1; + + arenap->flags = le32toh(info.flags); + arenap->external_nlba = le32toh(info.external_nlba); + arenap->internal_lbasize = le32toh(info.internal_lbasize); + arenap->internal_nlba = le32toh(info.internal_nlba); + + arenap->startoff = arena_off; + arenap->dataoff = arena_off + le64toh(info.dataoff); + arenap->mapoff = arena_off + le64toh(info.mapoff); + arenap->flogoff = arena_off + le64toh(info.flogoff); + arenap->nextoff = arena_off + le64toh(info.nextoff); + + if (read_flogs(bttp, lane, arenap) < 0) + return -1; + + if (build_rtt(bttp, arenap) < 0) + return -1; + + if (build_map_locks(bttp, arenap) < 0) + return -1; + + /* initialize the per arena info block lock */ + util_mutex_init(&arenap->info_lock); + + return 0; +} + +/* + * util_convert2h_btt_info -- convert btt_info to host byte order + */ +void +btt_info_convert2h(struct btt_info *infop) +{ + infop->flags = le32toh(infop->flags); + infop->major = le16toh(infop->major); + infop->minor = le16toh(infop->minor); + infop->external_lbasize = le32toh(infop->external_lbasize); + infop->external_nlba = le32toh(infop->external_nlba); + infop->internal_lbasize = le32toh(infop->internal_lbasize); + infop->internal_nlba = le32toh(infop->internal_nlba); + infop->nfree = le32toh(infop->nfree); + infop->infosize = le32toh(infop->infosize); + infop->nextoff = le64toh(infop->nextoff); + infop->dataoff = le64toh(infop->dataoff); + infop->mapoff = le64toh(infop->mapoff); + infop->flogoff = le64toh(infop->flogoff); + infop->infooff = le64toh(infop->infooff); +} + +/* + * btt_info_convert2le -- convert btt_info to little-endian byte order + */ +void +btt_info_convert2le(struct btt_info *infop) +{ + 
infop->flags = le32toh(infop->flags); + infop->major = le16toh(infop->major); + infop->minor = le16toh(infop->minor); + infop->external_lbasize = le32toh(infop->external_lbasize); + infop->external_nlba = le32toh(infop->external_nlba); + infop->internal_lbasize = le32toh(infop->internal_lbasize); + infop->internal_nlba = le32toh(infop->internal_nlba); + infop->nfree = le32toh(infop->nfree); + infop->infosize = le32toh(infop->infosize); + infop->nextoff = le64toh(infop->nextoff); + infop->dataoff = le64toh(infop->dataoff); + infop->mapoff = le64toh(infop->mapoff); + infop->flogoff = le64toh(infop->flogoff); + infop->infooff = le64toh(infop->infooff); +} + +/* + * btt_flog_convert2h -- convert btt_flog to host byte order + */ +void +btt_flog_convert2h(struct btt_flog *flogp) +{ + flogp->lba = le32toh(flogp->lba); + flogp->old_map = le32toh(flogp->old_map); + flogp->new_map = le32toh(flogp->new_map); + flogp->seq = le32toh(flogp->seq); +} + +/* + * btt_flog_convert2le -- convert btt_flog to LE byte order + */ +void +btt_flog_convert2le(struct btt_flog *flogp) +{ + flogp->lba = htole32(flogp->lba); + flogp->old_map = htole32(flogp->old_map); + flogp->new_map = htole32(flogp->new_map); + flogp->seq = htole32(flogp->seq); +} + +/* + * read_arenas -- (internal) load up all arenas and build run-time state + * + * On entry, layout must be known to be valid, and the number of arenas + * must be known. Zero is returned on success, otherwise -1/errno. 
+ */ +static int +read_arenas(struct btt *bttp, unsigned lane, unsigned narena) +{ + LOG(3, "bttp %p lane %u narena %d", bttp, lane, narena); + + if ((bttp->arenas = Zalloc(narena * sizeof(*bttp->arenas))) == NULL) { + ERR("!Malloc for %u arenas", narena); + goto err; + } + + uint64_t arena_off = 0; + struct arena *arenap = bttp->arenas; + for (unsigned i = 0; i < narena; i++) { + + if (read_arena(bttp, lane, arena_off, arenap) < 0) + goto err; + + /* prepare for next time around the loop */ + arena_off = arenap->nextoff; + arenap++; + } + + bttp->laidout = 1; + + return 0; + +err: + LOG(4, "error clean up"); + int oerrno = errno; + if (bttp->arenas) { + for (unsigned i = 0; i < bttp->narena; i++) { + if (bttp->arenas[i].flogs) + Free(bttp->arenas[i].flogs); + if (bttp->arenas[i].rtt) + Free((void *)bttp->arenas[i].rtt); + if (bttp->arenas[i].map_locks) + Free((void *)bttp->arenas[i].map_locks); + } + Free(bttp->arenas); + bttp->arenas = NULL; + } + errno = oerrno; + return -1; +} + +/* + * internal_lbasize -- (internal) calculate internal LBA size + */ +static inline uint32_t +internal_lbasize(uint32_t external_lbasize) +{ + uint32_t internal_lbasize = external_lbasize; + if (internal_lbasize < BTT_MIN_LBA_SIZE) + internal_lbasize = BTT_MIN_LBA_SIZE; + internal_lbasize = + roundup(internal_lbasize, BTT_INTERNAL_LBA_ALIGNMENT); + /* check for overflow */ + if (internal_lbasize < BTT_INTERNAL_LBA_ALIGNMENT) { + errno = EINVAL; + ERR("!Invalid lba size after alignment: %u ", internal_lbasize); + return 0; + } + + return internal_lbasize; +} + +/* + * btt_flog_size -- calculate flog data size + */ +uint64_t +btt_flog_size(uint32_t nfree) +{ + uint64_t flog_size = nfree * roundup(2 * sizeof(struct btt_flog), + BTT_FLOG_PAIR_ALIGN); + return roundup(flog_size, BTT_ALIGNMENT); +} + +/* + * btt_map_size -- calculate map data size + */ +uint64_t +btt_map_size(uint32_t external_nlba) +{ + return roundup(external_nlba * BTT_MAP_ENTRY_SIZE, BTT_ALIGNMENT); +} + +/* + * 
btt_arena_datasize -- whole arena size without BTT Info header, backup and + * flog means size of blocks and map + */ +uint64_t +btt_arena_datasize(uint64_t arena_size, uint32_t nfree) +{ + return arena_size - 2 * sizeof(struct btt_info) - btt_flog_size(nfree); +} + +/* + * btt_info_set_params -- (internal) calculate and set BTT Info + * external_lbasize, internal_lbasize, nfree, infosize, external_nlba and + * internal_nlba + */ +static int +btt_info_set_params(struct btt_info *info, uint32_t external_lbasize, + uint32_t internal_lbasize, uint32_t nfree, uint64_t arena_size) +{ + info->external_lbasize = external_lbasize; + info->internal_lbasize = internal_lbasize; + info->nfree = nfree; + info->infosize = sizeof(*info); + + uint64_t arena_data_size = btt_arena_datasize(arena_size, nfree); + + /* allow for map alignment padding */ + uint64_t internal_nlba = (arena_data_size - BTT_ALIGNMENT) / + (info->internal_lbasize + BTT_MAP_ENTRY_SIZE); + + /* ensure the number of blocks is at least 2*nfree */ + if (internal_nlba < 2 * nfree) { + errno = EINVAL; + ERR("!number of internal blocks: %" PRIu64 + " expected at least %u", + internal_nlba, 2 * nfree); + return -1; + } + + ASSERT(internal_nlba <= UINT32_MAX); + uint32_t internal_nlba_u32 = (uint32_t)internal_nlba; + + info->internal_nlba = internal_nlba_u32; + /* external LBA does not include free blocks */ + info->external_nlba = internal_nlba_u32 - info->nfree; + + ASSERT((arena_data_size - btt_map_size(info->external_nlba)) / + internal_lbasize >= internal_nlba); + + return 0; +} + +/* + * btt_info_set_offs -- (internal) calculate and set the BTT Info dataoff, + * nextoff, infooff, flogoff and mapoff. These are all relative to the + * beginning of the arena. 
+ */ +static void +btt_info_set_offs(struct btt_info *info, uint64_t arena_size, + uint64_t space_left) +{ + info->dataoff = info->infosize; + + /* set offset to next valid arena */ + if (space_left >= BTT_MIN_SIZE) + info->nextoff = arena_size; + else + info->nextoff = 0; + + info->infooff = arena_size - sizeof(struct btt_info); + info->flogoff = info->infooff - btt_flog_size(info->nfree); + info->mapoff = info->flogoff - btt_map_size(info->external_nlba); + + ASSERTeq(btt_arena_datasize(arena_size, info->nfree) - + btt_map_size(info->external_nlba), info->mapoff - + info->dataoff); +} + +/* + * btt_info_set -- set BTT Info params and offsets + */ +int +btt_info_set(struct btt_info *info, uint32_t external_lbasize, + uint32_t nfree, uint64_t arena_size, uint64_t space_left) +{ + /* calculate internal LBA size */ + uint32_t internal_lba_size = internal_lbasize(external_lbasize); + if (internal_lba_size == 0) + return -1; + + /* set params and offsets */ + if (btt_info_set_params(info, external_lbasize, + internal_lba_size, nfree, arena_size)) + return -1; + + btt_info_set_offs(info, arena_size, space_left); + + return 0; +} + +/* + * write_layout -- (internal) write out the initial btt metadata layout + * + * Called with write == 1 only once in the life time of a btt namespace, when + * the first write happens. The caller of this routine is responsible for + * locking out multiple threads. This routine doesn't read anything -- by the + * time it is called, it is known there's no layout in the namespace and a new + * layout should be written. + * + * Calling with write == 0 tells this routine to do the calculations for + * bttp->narena and bttp->nlba, but don't write out any metadata. + * + * If successful, sets bttp->layout to 1 and returns 0. Otherwise -1 + * is returned and errno is set, and bttp->layout remains 0 so that + * later attempts to write will try again to create the layout. 
 */
static int
write_layout(struct btt *bttp, unsigned lane, int write)
{
	LOG(3, "bttp %p lane %u write %d", bttp, lane, write);

	ASSERT(bttp->rawsize >= BTT_MIN_SIZE);
	ASSERT(bttp->nfree);

	/*
	 * If a new layout is being written, generate the BTT's UUID.
	 */
	if (write) {
		int ret = util_uuid_generate(bttp->uuid);
		if (ret < 0) {
			LOG(2, "util_uuid_generate failed");
			return -1;
		}
	}

	/*
	 * The number of arenas is the number of full arenas of
	 * size BTT_MAX_ARENA that fit into rawsize and then, if
	 * the remainder is at least BTT_MIN_SIZE in size, then
	 * that adds one more arena.
	 */
	bttp->narena = (unsigned)(bttp->rawsize / BTT_MAX_ARENA);
	if (bttp->rawsize % BTT_MAX_ARENA >= BTT_MIN_SIZE)
		bttp->narena++;
	LOG(4, "narena %u", bttp->narena);

	uint32_t internal_lba_size = internal_lbasize(bttp->lbasize);
	if (internal_lba_size == 0)
		return -1;
	LOG(4, "adjusted internal_lbasize %u", internal_lba_size);

	uint64_t total_nlba = 0;
	uint64_t rawsize = bttp->rawsize;
	unsigned arena_num = 0;
	uint64_t arena_off = 0;

	/*
	 * for each arena...
	 */
	while (rawsize >= BTT_MIN_SIZE) {
		LOG(4, "layout arena %u", arena_num);

		/* this arena takes up to BTT_MAX_ARENA of what's left */
		uint64_t arena_rawsize = rawsize;
		if (arena_rawsize > BTT_MAX_ARENA) {
			arena_rawsize = BTT_MAX_ARENA;
		}
		rawsize -= arena_rawsize;
		arena_num++;

		struct btt_info info;
		memset(&info, '\0', sizeof(info));
		if (btt_info_set_params(&info, bttp->lbasize,
				internal_lba_size, bttp->nfree, arena_rawsize))
			return -1;

		LOG(4, "internal_nlba %u external_nlba %u",
			info.internal_nlba, info.external_nlba);

		total_nlba += info.external_nlba;

		/*
		 * The rest of the loop body calculates metadata structures
		 * and lays it out for this arena. So only continue if
		 * the write flag is set.
		 */
		if (!write)
			continue;

		btt_info_set_offs(&info, arena_rawsize, rawsize);

		LOG(4, "nextoff 0x%016" PRIx64, info.nextoff);
		LOG(4, "dataoff 0x%016" PRIx64, info.dataoff);
		LOG(4, "mapoff 0x%016" PRIx64, info.mapoff);
		LOG(4, "flogoff 0x%016" PRIx64, info.flogoff);
		LOG(4, "infooff 0x%016" PRIx64, info.infooff);

		/*
		 * Zero map if ns is not zero-initialized.
		 *
		 * NOTE(review): info.mapoff is arena-relative (see
		 * btt_info_set_offs) but, unlike the flog/info writes
		 * below, no arena_off is added here -- confirm whether
		 * this should be arena_off + info.mapoff (only matters
		 * when narena > 1).
		 */
		if (!bttp->ns_cbp->ns_is_zeroed) {
			uint64_t mapsize = btt_map_size(info.external_nlba);
			if ((*bttp->ns_cbp->nszero)(bttp->ns, lane, mapsize,
					info.mapoff) < 0)
				return -1;
		}

		/*
		 * Write out the initial flog: each of the nfree entries
		 * starts out owning one of the last nfree internal blocks
		 * (post-map LBAs external_nlba .. internal_nlba - 1).
		 */
		uint64_t flog_entry_off = arena_off + info.flogoff;
		uint32_t next_free_lba = info.external_nlba;
		for (uint32_t i = 0; i < bttp->nfree; i++) {
			struct btt_flog flog;
			flog.lba = htole32(i);
			flog.old_map = flog.new_map =
				htole32(next_free_lba | BTT_MAP_ENTRY_ZERO);
			flog.seq = htole32(1);

			/*
			 * Write both btt_flog structs in the pair, writing
			 * the second one as all zeros.
			 */
			LOG(6, "flog[%u] entry off %" PRIu64
					" initial %u + zero = %u",
					i, flog_entry_off,
					next_free_lba,
					next_free_lba | BTT_MAP_ENTRY_ZERO);
			if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &flog,
					sizeof(flog), flog_entry_off) < 0)
				return -1;
			flog_entry_off += sizeof(flog);

			/* Zflog: file-scope all-zeros flog (defined earlier) */
			LOG(6, "flog[%u] entry off %" PRIu64 " zeros",
					i, flog_entry_off);
			if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &Zflog,
					sizeof(Zflog), flog_entry_off) < 0)
				return -1;
			flog_entry_off += sizeof(flog);
			flog_entry_off = roundup(flog_entry_off,
					BTT_FLOG_PAIR_ALIGN);

			next_free_lba++;
		}

		/*
		 * Construct the BTT info block and write it out
		 * at both the beginning and end of the arena.
		 */
		memcpy(info.sig, Sig, BTTINFO_SIG_LEN);
		memcpy(info.uuid, bttp->uuid, BTTINFO_UUID_LEN);
		memcpy(info.parent_uuid, bttp->parent_uuid, BTTINFO_UUID_LEN);
		info.major = BTTINFO_MAJOR_VERSION;
		info.minor = BTTINFO_MINOR_VERSION;
		btt_info_convert2le(&info);

		util_checksum(&info, sizeof(info), &info.checksum, 1, 0);

		if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
				sizeof(info), arena_off) < 0)
			return -1;
		if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
				sizeof(info), arena_off + info.infooff) < 0)
			return -1;

		/* advance to the next arena (nextoff already in LE form) */
		arena_off += info.nextoff;
	}

	ASSERTeq(bttp->narena, arena_num);

	bttp->nlba = total_nlba;

	if (write) {
		/*
		 * The layout is written now, so load up the arenas.
		 */
		return read_arenas(bttp, lane, bttp->narena);
	}

	return 0;
}

/*
 * read_layout -- (internal) load up layout info from btt namespace
 *
 * Called once when the btt namespace is opened for use.
 * Sets bttp->laidout to 0 if no valid layout is found, 1 otherwise.
 *
 * Any recovery actions required (as indicated by the flog state) are
 * performed by this routine.
 *
 * Any quick checks for layout consistency are performed by this routine
 * (quick enough to be done each time a BTT area is opened for use, not
 * like the slow consistency checks done by btt_check()).
 *
 * Returns 0 if no errors are encountered accessing the namespace (in this
 * context, detecting there's no layout is not an error if the nsread function
 * didn't have any problems doing the reads). Otherwise, -1 is returned
 * and errno is set.
 */
static int
read_layout(struct btt *bttp, unsigned lane)
{
	LOG(3, "bttp %p", bttp);

	ASSERT(bttp->rawsize >= BTT_MIN_SIZE);

	unsigned narena = 0;
	uint32_t smallest_nfree = UINT32_MAX;
	uint64_t rawsize = bttp->rawsize;
	uint64_t total_nlba = 0;
	uint64_t arena_off = 0;

	/* starting assumption; reduced below if any arena has fewer */
	bttp->nfree = BTT_DEFAULT_NFREE;

	/*
	 * For each arena, see if there's a valid info block
	 */
	while (rawsize >= BTT_MIN_SIZE) {
		narena++;

		struct btt_info info;
		if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &info,
				sizeof(info), arena_off) < 0)
			return -1;

		if (!read_info(bttp, &info)) {
			/*
			 * Failed to find complete BTT metadata. Just
			 * calculate the narena and nlba values that will
			 * result when write_layout() gets called. This
			 * allows checks against nlba to work correctly
			 * even before the layout is written.
			 */
			return write_layout(bttp, lane, 0);
		}
		if (info.external_lbasize != bttp->lbasize) {
			/* can't read it assuming the wrong block size */
			ERR("inconsistent lbasize");
			errno = EINVAL;
			return -1;
		}

		if (info.nfree == 0) {
			ERR("invalid nfree");
			errno = EINVAL;
			return -1;
		}

		if (info.external_nlba == 0) {
			ERR("invalid external_nlba");
			errno = EINVAL;
			return -1;
		}

		/*
		 * Every arena except the last must be exactly
		 * BTT_MAX_ARENA in size, so nextoff is either 0
		 * (last arena) or BTT_MAX_ARENA.
		 */
		if (info.nextoff && (info.nextoff != BTT_MAX_ARENA)) {
			ERR("invalid arena size");
			errno = EINVAL;
			return -1;
		}

		if (info.nfree < smallest_nfree)
			smallest_nfree = info.nfree;

		total_nlba += info.external_nlba;
		arena_off += info.nextoff;
		if (info.nextoff == 0)
			break;
		if (info.nextoff > rawsize) {
			ERR("invalid next arena offset");
			errno = EINVAL;
			return -1;
		}
		rawsize -= info.nextoff;
	}

	ASSERT(narena);

	bttp->narena = narena;
	bttp->nlba = total_nlba;

	/*
	 * All arenas were valid. nfree should be the smallest value found
	 * among different arenas.
	 */
	if (smallest_nfree < bttp->nfree)
		bttp->nfree = smallest_nfree;

	/*
	 * Load up arenas.
	 */
	return read_arenas(bttp, lane, narena);
}

/*
 * zero_block -- (internal) satisfy a read with a block of zeros
 *
 * Returns 0 on success, otherwise -1/errno.
 */
static int
zero_block(struct btt *bttp, void *buf)
{
	LOG(3, "bttp %p", bttp);

	/* caller guarantees buf holds at least lbasize bytes */
	memset(buf, '\0', bttp->lbasize);
	return 0;
}

/*
 * lba_to_arena_lba -- (internal) calculate the arena & pre-map LBA
 *
 * This routine takes the external LBA and matches it to the
 * appropriate arena, adjusting the lba for use within that arena.
 *
 * If successful, zero is returned, *arenapp is a pointer to the appropriate
 * arena struct in the run-time state, and *premap_lbap is the LBA adjusted
 * to an arena-internal LBA (also known as the pre-map LBA). Otherwise
 * -1/errno.
 */
static int
lba_to_arena_lba(struct btt *bttp, uint64_t lba,
	struct arena **arenapp, uint32_t *premap_lbap)
{
	LOG(3, "bttp %p lba %" PRIu64, bttp, lba);

	ASSERT(bttp->laidout);

	/* walk the arenas, peeling off each one's share of the LBA space */
	unsigned arena;
	for (arena = 0; arena < bttp->narena; arena++)
		if (lba < bttp->arenas[arena].external_nlba)
			break;
		else
			lba -= bttp->arenas[arena].external_nlba;

	/* caller already range-checked lba, so a match must exist */
	ASSERT(arena < bttp->narena);

	*arenapp = &bttp->arenas[arena];
	ASSERT(lba <= UINT32_MAX);
	*premap_lbap = (uint32_t)lba;

	LOG(3, "arenap %p pre-map LBA %u", *arenapp, *premap_lbap);
	return 0;
}

/*
 * btt_init -- prepare a btt namespace for use, returning an opaque handle
 *
 * Returns handle on success, otherwise NULL/errno.
 *
 * When submitted a pristine namespace it will be formatted implicitly when
 * touched for the first time.
 *
 * If arenas have different nfree values, we will be using the lowest one
 * found as limiting to the overall "bandwidth".
 */
struct btt *
btt_init(uint64_t rawsize, uint32_t lbasize, uint8_t parent_uuid[],
	unsigned maxlane, void *ns, const struct ns_callback *ns_cbp)
{
	LOG(3, "rawsize %" PRIu64 " lbasize %u", rawsize, lbasize);

	if (rawsize < BTT_MIN_SIZE) {
		ERR("rawsize smaller than BTT_MIN_SIZE %u", BTT_MIN_SIZE);
		errno = EINVAL;
		return NULL;
	}

	/* zero-filled, so all "not yet laid out" fields start at 0/NULL */
	struct btt *bttp = Zalloc(sizeof(*bttp));

	if (bttp == NULL) {
		ERR("!Malloc %zu bytes", sizeof(*bttp));
		return NULL;
	}

	util_mutex_init(&bttp->layout_write_mutex);
	memcpy(bttp->parent_uuid, parent_uuid, BTTINFO_UUID_LEN);
	bttp->rawsize = rawsize;
	bttp->lbasize = lbasize;
	bttp->ns = ns;
	bttp->ns_cbp = ns_cbp;

	/*
	 * Load up layout, if it exists.
	 *
	 * Whether read_layout() finds a valid layout or not, it finishes
	 * updating these layout-related fields:
	 *	bttp->nfree
	 *	bttp->nlba
	 *	bttp->narena
	 * since these fields are used even before a valid layout it written.
	 */
	if (read_layout(bttp, 0) < 0) {
		btt_fini(bttp);		/* free up any allocations */
		return NULL;
	}

	bttp->nlane = bttp->nfree;

	/* maxlane, if provided, is an upper bound on nlane */
	if (maxlane && bttp->nlane > maxlane)
		bttp->nlane = maxlane;

	LOG(3, "success, bttp %p nlane %u", bttp, bttp->nlane);
	return bttp;
}

/*
 * btt_nlane -- return the number of "lanes" for this btt namespace
 *
 * The number of lanes is the number of threads allowed in this module
 * concurrently for a given btt. Each thread executing this code must
 * have a unique "lane" number assigned to it between 0 and btt_nlane() - 1.
 */
unsigned
btt_nlane(struct btt *bttp)
{
	LOG(3, "bttp %p", bttp);

	return bttp->nlane;
}

/*
 * btt_nlba -- return the number of usable blocks in a btt namespace
 *
 * Valid LBAs to pass to btt_read() and btt_write() are 0 through
 * btt_nlba() - 1.
 */
size_t
btt_nlba(struct btt *bttp)
{
	LOG(3, "bttp %p", bttp);

	return bttp->nlba;
}

/*
 * btt_read -- read a block from a btt namespace
 *
 * Returns 0 on success, otherwise -1/errno.
 */
int
btt_read(struct btt *bttp, unsigned lane, uint64_t lba, void *buf)
{
	LOG(3, "bttp %p lane %u lba %" PRIu64, bttp, lane, lba);

	if (invalid_lba(bttp, lba))
		return -1;

	/* if there's no layout written yet, all reads come back as zeros */
	if (!bttp->laidout)
		return zero_block(bttp, buf);

	/* find which arena LBA lives in, and the offset to the map entry */
	struct arena *arenap;
	uint32_t premap_lba;
	uint64_t map_entry_off;
	if (lba_to_arena_lba(bttp, lba, &arenap, &premap_lba) < 0)
		return -1;

	/* convert pre-map LBA into an offset into the map */
	map_entry_off = arenap->mapoff + BTT_MAP_ENTRY_SIZE * premap_lba;

	/*
	 * Read the current map entry to get the post-map LBA for the data
	 * block read.
	 */
	uint32_t entry;

	if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &entry,
				sizeof(entry), map_entry_off) < 0)
		return -1;

	entry = le32toh(entry);

	/*
	 * Retries come back to the top of this loop (for a rare case where
	 * the map is changed by another thread doing writes to the same LBA).
	 */
	while (1) {
		if (map_entry_is_error(entry)) {
			ERR("EIO due to map entry error flag");
			errno = EIO;
			return -1;
		}

		if (map_entry_is_zero_or_initial(entry))
			return zero_block(bttp, buf);

		/*
		 * Record the post-map LBA in the read tracking table during
		 * the read. The write will check entries in the read tracking
		 * table before allocating a block for a write, waiting for
		 * outstanding reads on that block to complete.
		 *
		 * Since we already checked for error, zero, and initial
		 * states above, the entry must have both error and zero
		 * bits set at this point (BTT_MAP_ENTRY_NORMAL). We store
		 * the entry that way, with those bits set, in the rtt and
		 * btt_write() will check for it the same way, with the bits
		 * both set.
		 */
		arenap->rtt[lane] = entry;
		util_synchronize();

		/*
		 * In case this thread was preempted between reading entry and
		 * storing it in the rtt, check to see if the map changed. If
		 * it changed, the block about to be read is at least free now
		 * (in the flog, but that's okay since the data will still be
		 * undisturbed) and potentially allocated and being used for
		 * another write (data disturbed, so not okay to continue).
		 */
		uint32_t latest_entry;
		if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &latest_entry,
				sizeof(latest_entry), map_entry_off) < 0) {
			/* BTT_MAP_ENTRY_ERROR doubles as the "no read" mark */
			arenap->rtt[lane] = BTT_MAP_ENTRY_ERROR;
			return -1;
		}

		latest_entry = le32toh(latest_entry);

		if (entry == latest_entry)
			break;			/* map stayed the same */
		else
			entry = latest_entry;	/* try again */
	}

	/*
	 * It is safe to read the block now, since the rtt protects the
	 * block from getting re-allocated to something else by a write.
	 */
	uint64_t data_block_off =
		arenap->dataoff + (uint64_t)(entry & BTT_MAP_ENTRY_LBA_MASK) *
		arenap->internal_lbasize;
	int readret = (*bttp->ns_cbp->nsread)(bttp->ns, lane, buf,
		bttp->lbasize, data_block_off);

	/* done with read, so clear out rtt entry */
	arenap->rtt[lane] = BTT_MAP_ENTRY_ERROR;

	return readret;
}

/*
 * map_lock -- (internal) grab the map_lock and read a map entry
 *
 * On success *entryp holds the on-media (little-endian) entry; callers
 * convert with le32toh().  On failure the lock is released before return.
 */
static int
map_lock(struct btt *bttp, unsigned lane, struct arena *arenap,
	uint32_t *entryp, uint32_t premap_lba)
{
	LOG(3, "bttp %p lane %u arenap %p premap_lba %u",
			bttp, lane, arenap, premap_lba);

	uint64_t map_entry_off =
		arenap->mapoff + BTT_MAP_ENTRY_SIZE * premap_lba;
	/* entries hash onto nfree locks; see get_map_lock_num() */
	uint32_t map_lock_num = get_map_lock_num(premap_lba, bttp->nfree);

	util_mutex_lock(&arenap->map_locks[map_lock_num]);

	/* read the old map entry */
	if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, entryp,
				sizeof(uint32_t), map_entry_off) < 0) {
		util_mutex_unlock(&arenap->map_locks[map_lock_num]);
		return -1;
	}

	/* if map entry is in its initial state return premap_lba */
	if (map_entry_is_initial(*entryp))
		*entryp = htole32(premap_lba | BTT_MAP_ENTRY_NORMAL);

	LOG(9, "locked map[%d]: %u%s%s", premap_lba,
			*entryp & BTT_MAP_ENTRY_LBA_MASK,
			(map_entry_is_error(*entryp)) ? " ERROR" : "",
			(map_entry_is_zero(*entryp)) ? " ZERO" : "");

	return 0;
}

/*
 * map_abort -- (internal) drop the map_lock without updating the entry
 */
static void
map_abort(struct btt *bttp, unsigned lane, struct arena *arenap,
	uint32_t premap_lba)
{
	LOG(3, "bttp %p lane %u arenap %p premap_lba %u",
			bttp, lane, arenap, premap_lba);

	util_mutex_unlock(&arenap->map_locks[get_map_lock_num(premap_lba,
			bttp->nfree)]);
}

/*
 * map_unlock -- (internal) update the map and drop the map_lock
 *
 * The entry argument must already be in little-endian (on-media) form.
 */
static int
map_unlock(struct btt *bttp, unsigned lane, struct arena *arenap,
	uint32_t entry, uint32_t premap_lba)
{
	LOG(3, "bttp %p lane %u arenap %p entry %u premap_lba %u",
			bttp, lane, arenap, entry, premap_lba);

	uint64_t map_entry_off =
		arenap->mapoff + BTT_MAP_ENTRY_SIZE * premap_lba;

	/* write the new map entry */
	int err = (*bttp->ns_cbp->nswrite)(bttp->ns, lane, &entry,
			sizeof(uint32_t), map_entry_off);

	util_mutex_unlock(&arenap->map_locks[get_map_lock_num(premap_lba,
			bttp->nfree)]);

	LOG(9, "unlocked map[%d]: %u%s%s", premap_lba,
			entry & BTT_MAP_ENTRY_LBA_MASK,
			(map_entry_is_error(entry)) ? " ERROR" : "",
			(map_entry_is_zero(entry)) ? " ZERO" : "");

	return err;
}

/*
 * btt_write -- write a block to a btt namespace
 *
 * Returns 0 on success, otherwise -1/errno.
 */
int
btt_write(struct btt *bttp, unsigned lane, uint64_t lba, const void *buf)
{
	LOG(3, "bttp %p lane %u lba %" PRIu64, bttp, lane, lba);

	if (invalid_lba(bttp, lba))
		return -1;

	/* first write through here will initialize the metadata layout */
	if (!bttp->laidout) {
		int err = 0;

		/* double-checked under the lock to avoid racing writers */
		util_mutex_lock(&bttp->layout_write_mutex);

		if (!bttp->laidout)
			err = write_layout(bttp, lane, 1);

		util_mutex_unlock(&bttp->layout_write_mutex);

		if (err < 0)
			return err;
	}

	/* find which arena LBA lives in, and the offset to the map entry */
	struct arena *arenap;
	uint32_t premap_lba;
	if (lba_to_arena_lba(bttp, lba, &arenap, &premap_lba) < 0)
		return -1;

	/* if the arena is in an error state, writing is not allowed */
	if (arenap->flags & BTTINFO_FLAG_ERROR_MASK) {
		ERR("EIO due to btt_info error flags 0x%x",
			arenap->flags & BTTINFO_FLAG_ERROR_MASK);
		errno = EIO;
		return -1;
	}

	/*
	 * This routine was passed a unique "lane" which is an index
	 * into the flog. That means the free block held by flog[lane]
	 * is assigned to this thread and to no other threads (no additional
	 * locking required). So start by performing the write to the
	 * free block. It is only safe to write to a free block if it
	 * doesn't appear in the read tracking table, so scan that first
	 * and if found, wait for the thread reading from it to finish.
	 */
	uint32_t free_entry = (arenap->flogs[lane].flog.old_map &
			BTT_MAP_ENTRY_LBA_MASK) | BTT_MAP_ENTRY_NORMAL;

	LOG(3, "free_entry %u (before mask %u)", free_entry,
				arenap->flogs[lane].flog.old_map);

	/*
	 * Wait for other threads to finish any reads on free block
	 * (busy-wait; reads clear their rtt slot when done).
	 */
	for (unsigned i = 0; i < bttp->nlane; i++)
		while (arenap->rtt[i] == free_entry)
			;

	/* it is now safe to perform write to the free block */
	uint64_t data_block_off = arenap->dataoff +
		(uint64_t)(free_entry & BTT_MAP_ENTRY_LBA_MASK) *
		arenap->internal_lbasize;
	if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, buf,
				bttp->lbasize, data_block_off) < 0)
		return -1;

	/*
	 * Make the new block active atomically by updating the on-media flog
	 * and then updating the map.
	 */
	uint32_t old_entry;
	if (map_lock(bttp, lane, arenap, &old_entry, premap_lba) < 0)
		return -1;

	old_entry = le32toh(old_entry);

	/* update the flog (defined earlier in this file) */
	if (flog_update(bttp, lane, arenap, premap_lba,
					old_entry, free_entry) < 0) {
		map_abort(bttp, lane, arenap, premap_lba);
		return -1;
	}

	if (map_unlock(bttp, lane, arenap, htole32(free_entry),
					premap_lba) < 0) {
		/*
		 * A critical write error occurred, set the arena's
		 * info block error bit.
		 */
		set_arena_error(bttp, arenap, lane);
		errno = EIO;
		return -1;
	}

	return 0;
}

/*
 * map_entry_setf -- (internal) set a given flag on a map entry
 *
 * Returns 0 on success, otherwise -1/errno.
 */
static int
map_entry_setf(struct btt *bttp, unsigned lane, uint64_t lba, uint32_t setf)
{
	LOG(3, "bttp %p lane %u lba %" PRIu64 " setf 0x%x",
			bttp, lane, lba, setf);

	if (invalid_lba(bttp, lba))
		return -1;

	if (!bttp->laidout) {
		/*
		 * No layout is written yet. If the flag being set
		 * is the zero flag, it is superfluous since all blocks
		 * read as zero at this point.
		 */
		if (setf == BTT_MAP_ENTRY_ZERO)
			return 0;

		/*
		 * Treat this like the first write and write out
		 * the metadata layout at this point.
		 */
		int err = 0;
		util_mutex_lock(&bttp->layout_write_mutex);

		if (!bttp->laidout)
			err = write_layout(bttp, lane, 1);

		util_mutex_unlock(&bttp->layout_write_mutex);

		if (err < 0)
			return err;
	}

	/* find which arena LBA lives in, and the offset to the map entry */
	struct arena *arenap;
	uint32_t premap_lba;
	if (lba_to_arena_lba(bttp, lba, &arenap, &premap_lba) < 0)
		return -1;

	/* if the arena is in an error state, writing is not allowed */
	if (arenap->flags & BTTINFO_FLAG_ERROR_MASK) {
		ERR("EIO due to btt_info error flags 0x%x",
			arenap->flags & BTTINFO_FLAG_ERROR_MASK);
		errno = EIO;
		return -1;
	}

	/*
	 * Set the flags in the map entry. To do this, read the
	 * current map entry, set the flags, and write out the update.
	 */
	uint32_t old_entry;
	uint32_t new_entry;

	if (map_lock(bttp, lane, arenap, &old_entry, premap_lba) < 0)
		return -1;

	old_entry = le32toh(old_entry);

	if (setf == BTT_MAP_ENTRY_ZERO &&
			map_entry_is_zero_or_initial(old_entry)) {
		map_abort(bttp, lane, arenap, premap_lba);
		return 0;	/* block already zero, nothing to do */
	}

	/* create the new map entry */
	new_entry = (old_entry & BTT_MAP_ENTRY_LBA_MASK) | setf;

	/*
	 * NOTE(review): unlike btt_write(), a map_unlock() failure here
	 * does not call set_arena_error() -- confirm whether that
	 * asymmetry is intentional.
	 */
	if (map_unlock(bttp, lane, arenap, htole32(new_entry), premap_lba) < 0)
		return -1;

	return 0;
}

/*
 * btt_set_zero -- mark a block as zeroed in a btt namespace
 *
 * Returns 0 on success, otherwise -1/errno.
 */
int
btt_set_zero(struct btt *bttp, unsigned lane, uint64_t lba)
{
	LOG(3, "bttp %p lane %u lba %" PRIu64, bttp, lane, lba);

	return map_entry_setf(bttp, lane, lba, BTT_MAP_ENTRY_ZERO);
}

/*
 * btt_set_error -- mark a block as in an error state in a btt namespace
 *
 * Returns 0 on success, otherwise -1/errno.
+ */ +int +btt_set_error(struct btt *bttp, unsigned lane, uint64_t lba) +{ + LOG(3, "bttp %p lane %u lba %" PRIu64, bttp, lane, lba); + + return map_entry_setf(bttp, lane, lba, BTT_MAP_ENTRY_ERROR); +} + +/* + * check_arena -- (internal) perform a consistency check on an arena + */ +static int +check_arena(struct btt *bttp, struct arena *arenap) +{ + LOG(3, "bttp %p arenap %p", bttp, arenap); + + int consistent = 1; + + uint64_t map_entry_off = arenap->mapoff; + uint32_t bitmapsize = howmany(arenap->internal_nlba, 8); + uint8_t *bitmap = Zalloc(bitmapsize); + if (bitmap == NULL) { + ERR("!Malloc for bitmap"); + return -1; + } + + /* + * Go through every post-map LBA mentioned in the map and make sure + * there are no duplicates. bitmap is used to track which LBAs have + * been seen so far. + */ + uint32_t *mapp = NULL; + ssize_t mlen; + int next_index = 0; + size_t remaining = 0; + for (uint32_t i = 0; i < arenap->external_nlba; i++) { + uint32_t entry; + + if (remaining == 0) { + /* request a mapping of remaining map area */ + size_t req_len = + (arenap->external_nlba - i) * sizeof(uint32_t); + mlen = (*bttp->ns_cbp->nsmap)(bttp->ns, 0, + (void **)&mapp, req_len, map_entry_off); + + if (mlen < 0) + return -1; + + remaining = (size_t)mlen; + next_index = 0; + } + entry = le32toh(mapp[next_index]); + + /* for debug, dump non-zero map entries at log level 11 */ + if (map_entry_is_zero_or_initial(entry) == 0) + LOG(11, "map[%d]: %u%s", i, + entry & BTT_MAP_ENTRY_LBA_MASK, + (map_entry_is_error(entry)) ? 
" ERROR" : ""); + + /* this is an uninitialized map entry, set the default value */ + if (map_entry_is_initial(entry)) + entry = i; + else + entry &= BTT_MAP_ENTRY_LBA_MASK; + + /* check if entry is valid */ + if (entry >= arenap->internal_nlba) { + ERR("map[%d] entry out of bounds: %u", i, entry); + errno = EINVAL; + return -1; + } + + if (util_isset(bitmap, entry)) { + ERR("map[%d] duplicate entry: %u", i, entry); + consistent = 0; + } else + util_setbit(bitmap, entry); + + map_entry_off += sizeof(uint32_t); + next_index++; + ASSERT(remaining >= sizeof(uint32_t)); + remaining -= sizeof(uint32_t); + } + + /* + * Go through the free blocks in the flog, adding them to bitmap + * and checking for duplications. It is sufficient to read the + * run-time flog here, avoiding more calls to nsread. + */ + for (uint32_t i = 0; i < bttp->nfree; i++) { + uint32_t entry = arenap->flogs[i].flog.old_map; + entry &= BTT_MAP_ENTRY_LBA_MASK; + + if (util_isset(bitmap, entry)) { + ERR("flog[%u] duplicate entry: %u", i, entry); + consistent = 0; + } else + util_setbit(bitmap, entry); + } + + /* + * Make sure every possible post-map LBA was accounted for + * in the two loops above. + */ + for (uint32_t i = 0; i < arenap->internal_nlba; i++) + if (util_isclr(bitmap, i)) { + ERR("unreferenced lba: %d", i); + consistent = 0; + } + + Free(bitmap); + + return consistent; +} + +/* + * btt_check -- perform a consistency check on a btt namespace + * + * This routine contains a fairly high-impact set of consistency checks. + * It may use a good amount of dynamic memory and CPU time performing + * the checks. Any lightweight, quick consistency checks are included + * in read_layout() so they happen every time the BTT area is opened + * for use. + * + * Returns true if consistent, zero if inconsistent, -1/error if checking + * cannot happen due to other errors. 
+ * + * No lane number required here because only one thread is allowed -- all + * other threads must be locked out of all btt routines for this btt + * namespace while this is running. + */ +int +btt_check(struct btt *bttp) +{ + LOG(3, "bttp %p", bttp); + + int consistent = 1; + + if (!bttp->laidout) { + /* consistent by definition */ + LOG(3, "no layout yet"); + return consistent; + } + + /* XXX report issues found during read_layout (from flags) */ + + /* for each arena... */ + struct arena *arenap = bttp->arenas; + for (unsigned i = 0; i < bttp->narena; i++, arenap++) { + /* + * Perform the consistency checks for the arena. + */ + int retval = check_arena(bttp, arenap); + if (retval < 0) + return retval; + else if (retval == 0) + consistent = 0; + } + + /* XXX stub */ + return consistent; +} + +/* + * btt_fini -- delete opaque btt info, done using btt namespace + */ +void +btt_fini(struct btt *bttp) +{ + LOG(3, "bttp %p", bttp); + + if (bttp->arenas) { + for (unsigned i = 0; i < bttp->narena; i++) { + if (bttp->arenas[i].flogs) + Free(bttp->arenas[i].flogs); + if (bttp->arenas[i].rtt) + Free((void *)bttp->arenas[i].rtt); + if (bttp->arenas[i].rtt) + Free((void *)bttp->arenas[i].map_locks); + } + Free(bttp->arenas); + } + Free(bttp); +} |