diff options
Diffstat (limited to '')
-rw-r--r-- | drivers/md/bcache/btree.c | 2800 |
1 files changed, 2800 insertions, 0 deletions
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c new file mode 100644 index 000000000..6a2f57ae0 --- /dev/null +++ b/drivers/md/bcache/btree.c @@ -0,0 +1,2800 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> + * + * Uses a block device as cache for other block devices; optimized for SSDs. + * All allocation is done in buckets, which should match the erase block size + * of the device. + * + * Buckets containing cached data are kept on a heap sorted by priority; + * bucket priority is increased on cache hit, and periodically all the buckets + * on the heap have their priority scaled down. This currently is just used as + * an LRU but in the future should allow for more intelligent heuristics. + * + * Buckets have an 8 bit counter; freeing is accomplished by incrementing the + * counter. Garbage collection is used to remove stale pointers. + * + * Indexing is done via a btree; nodes are not necessarily fully sorted, rather + * as keys are inserted we only sort the pages that have not yet been written. + * When garbage collection is run, we resort the entire node. + * + * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "extents.h" + +#include <linux/slab.h> +#include <linux/bitops.h> +#include <linux/hash.h> +#include <linux/kthread.h> +#include <linux/prefetch.h> +#include <linux/random.h> +#include <linux/rcupdate.h> +#include <linux/sched/clock.h> +#include <linux/rculist.h> +#include <linux/delay.h> +#include <trace/events/bcache.h> + +/* + * Todo: + * register_bcache: Return errors out to userspace correctly + * + * Writeback: don't undirty key until after a cache flush + * + * Create an iterator for key pointers + * + * On btree write error, mark bucket such that it won't be freed from the cache + * + * Journalling: + * Check for bad keys in replay + * Propagate barriers + * Refcount journal entries in journal_replay + * + * Garbage collection: + * Finish incremental gc + * Gc should free old UUIDs, data for invalid UUIDs + * + * Provide a way to list backing device UUIDs we have data cached for, and + * probably how long it's been since we've seen them, and a way to invalidate + * dirty data for devices that will never be attached again + * + * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so + * that based on that and how much dirty data we have we can keep writeback + * from being starved + * + * Add a tracepoint or somesuch to watch for writeback starvation + * + * When btree depth > 1 and splitting an interior node, we have to make sure + * alloc_bucket() cannot fail. This should be true but is not completely + * obvious. + * + * Plugging? + * + * If data write is less than hard sector size of ssd, round up offset in open + * bucket to the next whole sector + * + * Superblock needs to be fleshed out for multiple cache devices + * + * Add a sysfs tunable for the number of writeback IOs in flight + * + * Add a sysfs tunable for the number of open data buckets + * + * IO tracking: Can we track when one process is doing io on behalf of another? + * IO tracking: Don't use just an average, weigh more recent stuff higher + * + * Test module load/unload + */ + +#define MAX_NEED_GC 64 +#define MAX_SAVE_PRIO 72 +#define MAX_GC_TIMES 100 +#define MIN_GC_NODES 100 +#define GC_SLEEP_MS 100 + +#define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) + +#define PTR_HASH(c, k) \ + (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) + +static struct workqueue_struct *btree_io_wq; + +#define insert_lock(s, b) ((b)->level <= (s)->lock) + + +static inline struct bset *write_block(struct btree *b) +{ + return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c->cache); +} + +static void bch_btree_init_next(struct btree *b) +{ + /* If not a leaf node, always sort */ + if (b->level && b->keys.nsets) + bch_btree_sort(&b->keys, &b->c->sort); + else + bch_btree_sort_lazy(&b->keys, &b->c->sort); + + if (b->written < btree_blocks(b)) + bch_bset_init_next(&b->keys, write_block(b), + bset_magic(&b->c->cache->sb)); + +} + +/* Btree key manipulation */ + +void bkey_put(struct cache_set *c, struct bkey *k) +{ + unsigned int i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) + atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); +} + +/* Btree IO */ + +static uint64_t btree_csum_set(struct btree *b, struct bset *i) +{ + uint64_t crc = b->key.ptr[0]; + void *data = (void *) i + 8, *end = bset_bkey_last(i); + + crc = crc64_be(crc, data, end - data); + return crc ^ 0xffffffffffffffffULL; +} + +void bch_btree_node_read_done(struct btree *b) +{ + const char *err = "bad btree header"; + struct bset *i = btree_bset_first(b); + struct btree_iter *iter; + + /* + * c->fill_iter can allocate an iterator with more memory space + * than static MAX_BSETS. + * See the comment arount cache_set->fill_iter. + */ + iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); + iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; + iter->used = 0; + +#ifdef CONFIG_BCACHE_DEBUG + iter->b = &b->keys; +#endif + + if (!i->seq) + goto err; + + for (; + b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq; + i = write_block(b)) { + err = "unsupported bset version"; + if (i->version > BCACHE_BSET_VERSION) + goto err; + + err = "bad btree header"; + if (b->written + set_blocks(i, block_bytes(b->c->cache)) > + btree_blocks(b)) + goto err; + + err = "bad magic"; + if (i->magic != bset_magic(&b->c->cache->sb)) + goto err; + + err = "bad checksum"; + switch (i->version) { + case 0: + if (i->csum != csum_set(i)) + goto err; + break; + case BCACHE_BSET_VERSION: + if (i->csum != btree_csum_set(b, i)) + goto err; + break; + } + + err = "empty set"; + if (i != b->keys.set[0].data && !i->keys) + goto err; + + bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); + + b->written += set_blocks(i, block_bytes(b->c->cache)); + } + + err = "corrupted btree"; + for (i = write_block(b); + bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); + i = ((void *) i) + block_bytes(b->c->cache)) + if (i->seq == b->keys.set[0].data->seq) + goto err; + + bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); + + i = b->keys.set[0].data; + err = "short btree key"; + if (b->keys.set[0].size && + bkey_cmp(&b->key, &b->keys.set[0].end) < 0) + goto err; + + if (b->written < btree_blocks(b)) + bch_bset_init_next(&b->keys, write_block(b), + bset_magic(&b->c->cache->sb)); +out: + mempool_free(iter, &b->c->fill_iter); + return; +err: + set_btree_node_io_error(b); + bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys", + err, PTR_BUCKET_NR(b->c, &b->key, 0), + bset_block_offset(b, i), i->keys); + goto out; +} + +static void btree_node_read_endio(struct bio *bio) +{ + struct closure *cl = bio->bi_private; + + closure_put(cl); +} + +static void bch_btree_node_read(struct btree *b) +{ + uint64_t start_time = local_clock(); + struct closure cl; + struct bio *bio; + + trace_bcache_btree_read(b); + + closure_init_stack(&cl); + + bio = bch_bbio_alloc(b->c); + bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9; + bio->bi_end_io = btree_node_read_endio; + bio->bi_private = &cl; + bio->bi_opf = REQ_OP_READ | REQ_META; + + bch_bio_map(bio, b->keys.set[0].data); + + bch_submit_bbio(bio, b->c, &b->key, 0); + closure_sync(&cl); + + if (bio->bi_status) + set_btree_node_io_error(b); + + bch_bbio_free(bio, b->c); + + if (btree_node_io_error(b)) + goto err; + + bch_btree_node_read_done(b); + bch_time_stats_update(&b->c->btree_read_time, start_time); + + return; +err: + bch_cache_set_error(b->c, "io error reading bucket %zu", + PTR_BUCKET_NR(b->c, &b->key, 0)); +} + +static void btree_complete_write(struct btree *b, struct btree_write *w) +{ + if (w->prio_blocked && + !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) + wake_up_allocators(b->c); + + if (w->journal) { + atomic_dec_bug(w->journal); + __closure_wake_up(&b->c->journal.wait); + } + + w->prio_blocked = 0; + w->journal = NULL; +} + +static void btree_node_write_unlock(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io); + + up(&b->io_mutex); +} + +static void __btree_node_write_done(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io); + struct btree_write *w = btree_prev_write(b); + + bch_bbio_free(b->bio, b->c); + b->bio = NULL; + btree_complete_write(b, w); + + if (btree_node_dirty(b)) + queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); + + closure_return_with_destructor(cl, btree_node_write_unlock); +} + +static void btree_node_write_done(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io); + + bio_free_pages(b->bio); + __btree_node_write_done(cl); +} + +static void btree_node_write_endio(struct bio *bio) +{ + struct closure *cl = bio->bi_private; + struct btree *b = container_of(cl, struct btree, io); + + if (bio->bi_status) + set_btree_node_io_error(b); + + bch_bbio_count_io_errors(b->c, bio, bio->bi_status, "writing btree"); + closure_put(cl); +} + +static void do_btree_node_write(struct btree *b) +{ + struct closure *cl = &b->io; + struct bset *i = btree_bset_last(b); + BKEY_PADDED(key) k; + + i->version = BCACHE_BSET_VERSION; + i->csum = btree_csum_set(b, i); + + BUG_ON(b->bio); + b->bio = bch_bbio_alloc(b->c); + + b->bio->bi_end_io = btree_node_write_endio; + b->bio->bi_private = cl; + b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c->cache)); + b->bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_FUA; + bch_bio_map(b->bio, i); + + /* + * If we're appending to a leaf node, we don't technically need FUA - + * this write just needs to be persisted before the next journal write, + * which will be marked FLUSH|FUA. + * + * Similarly if we're writing a new btree root - the pointer is going to + * be in the next journal entry. + * + * But if we're writing a new btree node (that isn't a root) or + * appending to a non leaf btree node, we need either FUA or a flush + * when we write the parent with the new pointer. FUA is cheaper than a + * flush, and writes appending to leaf nodes aren't blocking anything so + * just make all btree node writes FUA to keep things sane. + */ + + bkey_copy(&k.key, &b->key); + SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + + bset_sector_offset(&b->keys, i)); + + if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { + struct bio_vec *bv; + void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bv, b->bio, iter_all) { + memcpy(page_address(bv->bv_page), addr, PAGE_SIZE); + addr += PAGE_SIZE; + } + + bch_submit_bbio(b->bio, b->c, &k.key, 0); + + continue_at(cl, btree_node_write_done, NULL); + } else { + /* + * No problem for multipage bvec since the bio is + * just allocated + */ + b->bio->bi_vcnt = 0; + bch_bio_map(b->bio, i); + + bch_submit_bbio(b->bio, b->c, &k.key, 0); + + closure_sync(cl); + continue_at_nobarrier(cl, __btree_node_write_done, NULL); + } +} + +void __bch_btree_node_write(struct btree *b, struct closure *parent) +{ + struct bset *i = btree_bset_last(b); + + lockdep_assert_held(&b->write_lock); + + trace_bcache_btree_write(b); + + BUG_ON(current->bio_list); + BUG_ON(b->written >= btree_blocks(b)); + BUG_ON(b->written && !i->keys); + BUG_ON(btree_bset_first(b)->seq != i->seq); + bch_check_keys(&b->keys, "writing"); + + cancel_delayed_work(&b->work); + + /* If caller isn't waiting for write, parent refcount is cache set */ + down(&b->io_mutex); + closure_init(&b->io, parent ?: &b->c->cl); + + clear_bit(BTREE_NODE_dirty, &b->flags); + change_bit(BTREE_NODE_write_idx, &b->flags); + + do_btree_node_write(b); + + atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size, + &b->c->cache->btree_sectors_written); + + b->written += set_blocks(i, block_bytes(b->c->cache)); +} + +void bch_btree_node_write(struct btree *b, struct closure *parent) +{ + unsigned int nsets = b->keys.nsets; + + lockdep_assert_held(&b->lock); + + __bch_btree_node_write(b, parent); + + /* + * do verify if there was more than one set initially (i.e. we did a + * sort) and we sorted down to a single set: + */ + if (nsets && !b->keys.nsets) + bch_btree_verify(b); + + bch_btree_init_next(b); +} + +static void bch_btree_node_write_sync(struct btree *b) +{ + struct closure cl; + + closure_init_stack(&cl); + + mutex_lock(&b->write_lock); + bch_btree_node_write(b, &cl); + mutex_unlock(&b->write_lock); + + closure_sync(&cl); +} + +static void btree_node_write_work(struct work_struct *w) +{ + struct btree *b = container_of(to_delayed_work(w), struct btree, work); + + mutex_lock(&b->write_lock); + if (btree_node_dirty(b)) + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); +} + +static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) +{ + struct bset *i = btree_bset_last(b); + struct btree_write *w = btree_current_write(b); + + lockdep_assert_held(&b->write_lock); + + BUG_ON(!b->written); + BUG_ON(!i->keys); + + if (!btree_node_dirty(b)) + queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); + + set_btree_node_dirty(b); + + /* + * w->journal is always the oldest journal pin of all bkeys + * in the leaf node, to make sure the oldest jset seq won't + * be increased before this btree node is flushed. + */ + if (journal_ref) { + if (w->journal && + journal_pin_cmp(b->c, w->journal, journal_ref)) { + atomic_dec_bug(w->journal); + w->journal = NULL; + } + + if (!w->journal) { + w->journal = journal_ref; + atomic_inc(w->journal); + } + } + + /* Force write if set is too big */ + if (set_bytes(i) > PAGE_SIZE - 48 && + !current->bio_list) + bch_btree_node_write(b, NULL); +} + +/* + * Btree in memory cache - allocation/freeing + * mca -> memory cache + */ + +#define mca_reserve(c) (((!IS_ERR_OR_NULL(c->root) && c->root->level) \ + ? c->root->level : 1) * 8 + 16) +#define mca_can_free(c) \ + max_t(int, 0, c->btree_cache_used - mca_reserve(c)) + +static void mca_data_free(struct btree *b) +{ + BUG_ON(b->io_mutex.count != 1); + + bch_btree_keys_free(&b->keys); + + b->c->btree_cache_used--; + list_move(&b->list, &b->c->btree_cache_freed); +} + +static void mca_bucket_free(struct btree *b) +{ + BUG_ON(btree_node_dirty(b)); + + b->key.ptr[0] = 0; + hlist_del_init_rcu(&b->hash); + list_move(&b->list, &b->c->btree_cache_freeable); +} + +static unsigned int btree_order(struct bkey *k) +{ + return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); +} + +static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) +{ + if (!bch_btree_keys_alloc(&b->keys, + max_t(unsigned int, + ilog2(b->c->btree_pages), + btree_order(k)), + gfp)) { + b->c->btree_cache_used++; + list_move(&b->list, &b->c->btree_cache); + } else { + list_move(&b->list, &b->c->btree_cache_freed); + } +} + +static struct btree *mca_bucket_alloc(struct cache_set *c, + struct bkey *k, gfp_t gfp) +{ + /* + * kzalloc() is necessary here for initialization, + * see code comments in bch_btree_keys_init(). + */ + struct btree *b = kzalloc(sizeof(struct btree), gfp); + + if (!b) + return NULL; + + init_rwsem(&b->lock); + lockdep_set_novalidate_class(&b->lock); + mutex_init(&b->write_lock); + lockdep_set_novalidate_class(&b->write_lock); + INIT_LIST_HEAD(&b->list); + INIT_DELAYED_WORK(&b->work, btree_node_write_work); + b->c = c; + sema_init(&b->io_mutex, 1); + + mca_data_alloc(b, k, gfp); + return b; +} + +static int mca_reap(struct btree *b, unsigned int min_order, bool flush) +{ + struct closure cl; + + closure_init_stack(&cl); + lockdep_assert_held(&b->c->bucket_lock); + + if (!down_write_trylock(&b->lock)) + return -ENOMEM; + + BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data); + + if (b->keys.page_order < min_order) + goto out_unlock; + + if (!flush) { + if (btree_node_dirty(b)) + goto out_unlock; + + if (down_trylock(&b->io_mutex)) + goto out_unlock; + up(&b->io_mutex); + } + +retry: + /* + * BTREE_NODE_dirty might be cleared in btree_flush_btree() by + * __bch_btree_node_write(). To avoid an extra flush, acquire + * b->write_lock before checking BTREE_NODE_dirty bit. + */ + mutex_lock(&b->write_lock); + /* + * If this btree node is selected in btree_flush_write() by journal + * code, delay and retry until the node is flushed by journal code + * and BTREE_NODE_journal_flush bit cleared by btree_flush_write(). + */ + if (btree_node_journal_flush(b)) { + pr_debug("bnode %p is flushing by journal, retry\n", b); + mutex_unlock(&b->write_lock); + udelay(1); + goto retry; + } + + if (btree_node_dirty(b)) + __bch_btree_node_write(b, &cl); + mutex_unlock(&b->write_lock); + + closure_sync(&cl); + + /* wait for any in flight btree write */ + down(&b->io_mutex); + up(&b->io_mutex); + + return 0; +out_unlock: + rw_unlock(true, b); + return -ENOMEM; +} + +static unsigned long bch_mca_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct cache_set *c = container_of(shrink, struct cache_set, shrink); + struct btree *b, *t; + unsigned long i, nr = sc->nr_to_scan; + unsigned long freed = 0; + unsigned int btree_cache_used; + + if (c->shrinker_disabled) + return SHRINK_STOP; + + if (c->btree_cache_alloc_lock) + return SHRINK_STOP; + + /* Return -1 if we can't do anything right now */ + if (sc->gfp_mask & __GFP_IO) + mutex_lock(&c->bucket_lock); + else if (!mutex_trylock(&c->bucket_lock)) + return -1; + + /* + * It's _really_ critical that we don't free too many btree nodes - we + * have to always leave ourselves a reserve. The reserve is how we + * guarantee that allocating memory for a new btree node can always + * succeed, so that inserting keys into the btree can always succeed and + * IO can always make forward progress: + */ + nr /= c->btree_pages; + if (nr == 0) + nr = 1; + nr = min_t(unsigned long, nr, mca_can_free(c)); + + i = 0; + btree_cache_used = c->btree_cache_used; + list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) { + if (nr <= 0) + goto out; + + if (!mca_reap(b, 0, false)) { + mca_data_free(b); + rw_unlock(true, b); + freed++; + } + nr--; + i++; + } + + list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { + if (nr <= 0 || i >= btree_cache_used) + goto out; + + if (!mca_reap(b, 0, false)) { + mca_bucket_free(b); + mca_data_free(b); + rw_unlock(true, b); + freed++; + } + + nr--; + i++; + } +out: + mutex_unlock(&c->bucket_lock); + return freed * c->btree_pages; +} + +static unsigned long bch_mca_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct cache_set *c = container_of(shrink, struct cache_set, shrink); + + if (c->shrinker_disabled) + return 0; + + if (c->btree_cache_alloc_lock) + return 0; + + return mca_can_free(c) * c->btree_pages; +} + +void bch_btree_cache_free(struct cache_set *c) +{ + struct btree *b; + struct closure cl; + + closure_init_stack(&cl); + + if (c->shrink.list.next) + unregister_shrinker(&c->shrink); + + mutex_lock(&c->bucket_lock); + +#ifdef CONFIG_BCACHE_DEBUG + if (c->verify_data) + list_move(&c->verify_data->list, &c->btree_cache); + + free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->cache->sb))); +#endif + + list_splice(&c->btree_cache_freeable, + &c->btree_cache); + + while (!list_empty(&c->btree_cache)) { + b = list_first_entry(&c->btree_cache, struct btree, list); + + /* + * This function is called by cache_set_free(), no I/O + * request on cache now, it is unnecessary to acquire + * b->write_lock before clearing BTREE_NODE_dirty anymore. + */ + if (btree_node_dirty(b)) { + btree_complete_write(b, btree_current_write(b)); + clear_bit(BTREE_NODE_dirty, &b->flags); + } + mca_data_free(b); + } + + while (!list_empty(&c->btree_cache_freed)) { + b = list_first_entry(&c->btree_cache_freed, + struct btree, list); + list_del(&b->list); + cancel_delayed_work_sync(&b->work); + kfree(b); + } + + mutex_unlock(&c->bucket_lock); +} + +int bch_btree_cache_alloc(struct cache_set *c) +{ + unsigned int i; + + for (i = 0; i < mca_reserve(c); i++) + if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL)) + return -ENOMEM; + + list_splice_init(&c->btree_cache, + &c->btree_cache_freeable); + +#ifdef CONFIG_BCACHE_DEBUG + mutex_init(&c->verify_lock); + + c->verify_ondisk = (void *) + __get_free_pages(GFP_KERNEL|__GFP_COMP, + ilog2(meta_bucket_pages(&c->cache->sb))); + if (!c->verify_ondisk) { + /* + * Don't worry about the mca_rereserve buckets + * allocated in previous for-loop, they will be + * handled properly in bch_cache_set_unregister(). + */ + return -ENOMEM; + } + + c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); + + if (c->verify_data && + c->verify_data->keys.set->data) + list_del_init(&c->verify_data->list); + else + c->verify_data = NULL; +#endif + + c->shrink.count_objects = bch_mca_count; + c->shrink.scan_objects = bch_mca_scan; + c->shrink.seeks = 4; + c->shrink.batch = c->btree_pages * 2; + + if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid)) + pr_warn("bcache: %s: could not register shrinker\n", + __func__); + + return 0; +} + +/* Btree in memory cache - hash table */ + +static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) +{ + return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; +} + +static struct btree *mca_find(struct cache_set *c, struct bkey *k) +{ + struct btree *b; + + rcu_read_lock(); + hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) + if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) + goto out; + b = NULL; +out: + rcu_read_unlock(); + return b; +} + +static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) +{ + spin_lock(&c->btree_cannibalize_lock); + if (likely(c->btree_cache_alloc_lock == NULL)) { + c->btree_cache_alloc_lock = current; + } else if (c->btree_cache_alloc_lock != current) { + if (op) + prepare_to_wait(&c->btree_cache_wait, &op->wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&c->btree_cannibalize_lock); + return -EINTR; + } + spin_unlock(&c->btree_cannibalize_lock); + + return 0; +} + +static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, + struct bkey *k) +{ + struct btree *b; + + trace_bcache_btree_cache_cannibalize(c); + + if (mca_cannibalize_lock(c, op)) + return ERR_PTR(-EINTR); + + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), false)) + return b; + + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), true)) + return b; + + WARN(1, "btree cache cannibalize failed\n"); + return ERR_PTR(-ENOMEM); +} + +/* + * We can only have one thread cannibalizing other cached btree nodes at a time, + * or we'll deadlock. We use an open coded mutex to ensure that, which a + * cannibalize_bucket() will take. This means every time we unlock the root of + * the btree, we need to release this lock if we have it held. + */ +void bch_cannibalize_unlock(struct cache_set *c) +{ + spin_lock(&c->btree_cannibalize_lock); + if (c->btree_cache_alloc_lock == current) { + c->btree_cache_alloc_lock = NULL; + wake_up(&c->btree_cache_wait); + } + spin_unlock(&c->btree_cannibalize_lock); +} + +static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, + struct bkey *k, int level) +{ + struct btree *b; + + BUG_ON(current->bio_list); + + lockdep_assert_held(&c->bucket_lock); + + if (mca_find(c, k)) + return NULL; + + /* btree_free() doesn't free memory; it sticks the node on the end of + * the list. Check if there's any freed nodes there: + */ + list_for_each_entry(b, &c->btree_cache_freeable, list) + if (!mca_reap(b, btree_order(k), false)) + goto out; + + /* We never free struct btree itself, just the memory that holds the on + * disk node. Check the freed list before allocating a new one: + */ + list_for_each_entry(b, &c->btree_cache_freed, list) + if (!mca_reap(b, 0, false)) { + mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); + if (!b->keys.set[0].data) + goto err; + else + goto out; + } + + b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); + if (!b) + goto err; + + BUG_ON(!down_write_trylock(&b->lock)); + if (!b->keys.set->data) + goto err; +out: + BUG_ON(b->io_mutex.count != 1); + + bkey_copy(&b->key, k); + list_move(&b->list, &c->btree_cache); + hlist_del_init_rcu(&b->hash); + hlist_add_head_rcu(&b->hash, mca_hash(c, k)); + + lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); + b->parent = (void *) ~0UL; + b->flags = 0; + b->written = 0; + b->level = level; + + if (!b->level) + bch_btree_keys_init(&b->keys, &bch_extent_keys_ops, + &b->c->expensive_debug_checks); + else + bch_btree_keys_init(&b->keys, &bch_btree_keys_ops, + &b->c->expensive_debug_checks); + + return b; +err: + if (b) + rw_unlock(true, b); + + b = mca_cannibalize(c, op, k); + if (!IS_ERR(b)) + goto out; + + return b; +} + +/* + * bch_btree_node_get - find a btree node in the cache and lock it, reading it + * in from disk if necessary. + * + * If IO is necessary and running under submit_bio_noacct, returns -EAGAIN. + * + * The btree node will have either a read or a write lock held, depending on + * level and op->lock. + * + * Note: Only error code or btree pointer will be returned, it is unncessary + * for callers to check NULL pointer. + */ +struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, + struct bkey *k, int level, bool write, + struct btree *parent) +{ + int i = 0; + struct btree *b; + + BUG_ON(level < 0); +retry: + b = mca_find(c, k); + + if (!b) { + if (current->bio_list) + return ERR_PTR(-EAGAIN); + + mutex_lock(&c->bucket_lock); + b = mca_alloc(c, op, k, level); + mutex_unlock(&c->bucket_lock); + + if (!b) + goto retry; + if (IS_ERR(b)) + return b; + + bch_btree_node_read(b); + + if (!write) + downgrade_write(&b->lock); + } else { + rw_lock(write, b, level); + if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { + rw_unlock(write, b); + goto retry; + } + BUG_ON(b->level != level); + } + + if (btree_node_io_error(b)) { + rw_unlock(write, b); + return ERR_PTR(-EIO); + } + + BUG_ON(!b->written); + + b->parent = parent; + + for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { + prefetch(b->keys.set[i].tree); + prefetch(b->keys.set[i].data); + } + + for (; i <= b->keys.nsets; i++) + prefetch(b->keys.set[i].data); + + return b; +} + +static void btree_node_prefetch(struct btree *parent, struct bkey *k) +{ + struct btree *b; + + mutex_lock(&parent->c->bucket_lock); + b = mca_alloc(parent->c, NULL, k, parent->level - 1); + mutex_unlock(&parent->c->bucket_lock); + + if (!IS_ERR_OR_NULL(b)) { + b->parent = parent; + bch_btree_node_read(b); + rw_unlock(true, b); + } +} + +/* Btree alloc */ + +static void btree_node_free(struct btree *b) +{ + trace_bcache_btree_node_free(b); + + BUG_ON(b == b->c->root); + +retry: + mutex_lock(&b->write_lock); + /* + * If the btree node is selected and flushing in btree_flush_write(), + * delay and retry until the BTREE_NODE_journal_flush bit cleared, + * then it is safe to free the btree node here. Otherwise this btree + * node will be in race condition. + */ + if (btree_node_journal_flush(b)) { + mutex_unlock(&b->write_lock); + pr_debug("bnode %p journal_flush set, retry\n", b); + udelay(1); + goto retry; + } + + if (btree_node_dirty(b)) { + btree_complete_write(b, btree_current_write(b)); + clear_bit(BTREE_NODE_dirty, &b->flags); + } + + mutex_unlock(&b->write_lock); + + cancel_delayed_work(&b->work); + + mutex_lock(&b->c->bucket_lock); + bch_bucket_free(b->c, &b->key); + mca_bucket_free(b); + mutex_unlock(&b->c->bucket_lock); +} + +/* + * Only error code or btree pointer will be returned, it is unncessary for + * callers to check NULL pointer. + */ +struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + int level, bool wait, + struct btree *parent) +{ + BKEY_PADDED(key) k; + struct btree *b; + + mutex_lock(&c->bucket_lock); +retry: + /* return ERR_PTR(-EAGAIN) when it fails */ + b = ERR_PTR(-EAGAIN); + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) + goto err; + + bkey_put(c, &k.key); + SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); + + b = mca_alloc(c, op, &k.key, level); + if (IS_ERR(b)) + goto err_free; + + if (!b) { + cache_bug(c, + "Tried to allocate bucket that was in btree cache"); + goto retry; + } + + b->parent = parent; + bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->cache->sb)); + + mutex_unlock(&c->bucket_lock); + + trace_bcache_btree_node_alloc(b); + return b; +err_free: + bch_bucket_free(c, &k.key); +err: + mutex_unlock(&c->bucket_lock); + + trace_bcache_btree_node_alloc_fail(c); + return b; +} + +static struct btree *bch_btree_node_alloc(struct cache_set *c, + struct btree_op *op, int level, + struct btree *parent) +{ + return __bch_btree_node_alloc(c, op, level, op != NULL, parent); +} + +static struct btree *btree_node_alloc_replacement(struct btree *b, + struct btree_op *op) +{ + struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent); + + if (!IS_ERR(n)) { + mutex_lock(&n->write_lock); + bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); + bkey_copy_key(&n->key, &b->key); + mutex_unlock(&n->write_lock); + } + + return n; +} + +static void make_btree_freeing_key(struct btree *b, struct bkey *k) +{ + unsigned int i; + + mutex_lock(&b->c->bucket_lock); + + atomic_inc(&b->c->prio_blocked); + + bkey_copy(k, &b->key); + bkey_copy_key(k, &ZERO_KEY); + + for (i = 0; i < KEY_PTRS(k); i++) + SET_PTR_GEN(k, i, + bch_inc_gen(b->c->cache, + PTR_BUCKET(b->c, &b->key, i))); + + mutex_unlock(&b->c->bucket_lock); +} + +static int btree_check_reserve(struct btree *b, struct btree_op *op) +{ + struct cache_set *c = b->c; + struct cache *ca = c->cache; + unsigned int reserve = (c->root->level - b->level) * 2 + 1; + + mutex_lock(&c->bucket_lock); + + if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { + if (op) + prepare_to_wait(&c->btree_cache_wait, &op->wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&c->bucket_lock); + return -EINTR; + } + + mutex_unlock(&c->bucket_lock); + + return mca_cannibalize_lock(b->c, op); +} + +/* Garbage collection */ + +static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, + struct bkey *k) +{ + uint8_t stale = 0; + unsigned int i; + struct bucket *g; + + /* + * ptr_invalid() can't return true for the keys that mark btree nodes as + * freed, but since ptr_bad() returns true we'll never actually use them + * for anything and thus we don't want mark their pointers here + */ + if (!bkey_cmp(k, &ZERO_KEY)) + return stale; + + for (i = 0; i < KEY_PTRS(k); i++) { + if (!ptr_available(c, k, i)) + continue; + + g = PTR_BUCKET(c, k, i); + + if (gen_after(g->last_gc, PTR_GEN(k, i))) + g->last_gc = PTR_GEN(k, i); + + if (ptr_stale(c, k, i)) { + stale = max(stale, ptr_stale(c, k, i)); + continue; + } + + cache_bug_on(GC_MARK(g) && + (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), + c, "inconsistent ptrs: mark = %llu, level = %i", + GC_MARK(g), level); + + if (level) + SET_GC_MARK(g, GC_MARK_METADATA); + else if (KEY_DIRTY(k)) + SET_GC_MARK(g, GC_MARK_DIRTY); + else if (!GC_MARK(g)) + SET_GC_MARK(g, GC_MARK_RECLAIMABLE); + + /* guard against overflow */ + SET_GC_SECTORS_USED(g, min_t(unsigned int, + GC_SECTORS_USED(g) + KEY_SIZE(k), + MAX_GC_SECTORS_USED)); + + BUG_ON(!GC_SECTORS_USED(g)); + } + + return stale; +} + +#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) + +void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) +{ + unsigned int i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i) && + !ptr_stale(c, k, i)) { + struct bucket *b = PTR_BUCKET(c, k, i); + + b->gen = PTR_GEN(k, i); + + if (level && bkey_cmp(k, &ZERO_KEY)) + b->prio = BTREE_PRIO; + else if (!level && b->prio == BTREE_PRIO) + b->prio = INITIAL_PRIO; + } + + __bch_btree_mark_key(c, level, k); +} + +void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats) +{ + stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets; +} + +static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) +{ + uint8_t stale = 0; + unsigned int keys = 0, good_keys = 0; + struct bkey *k; + struct btree_iter iter; + struct bset_tree *t; + + gc->nodes++; + + for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { + stale = max(stale, btree_mark_key(b, k)); + keys++; + + if (bch_ptr_bad(&b->keys, k)) + continue; + + gc->key_bytes += bkey_u64s(k); + gc->nkeys++; + good_keys++; + + gc->data += KEY_SIZE(k); + } + + for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++) + btree_bug_on(t->size && + bset_written(&b->keys, t) && + bkey_cmp(&b->key, &t->end) < 0, + b, "found short btree key in gc"); + + if (b->c->gc_always_rewrite) + return true; + + if (stale > 10) + return true; + + if ((keys - good_keys) * 2 > keys) + return true; + + return false; +} + +#define GC_MERGE_NODES 4U + +struct gc_merge_info { + struct btree *b; + unsigned int keys; +}; + +static int bch_btree_insert_node(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + atomic_t *journal_ref, + struct bkey *replace_key); + +static int btree_gc_coalesce(struct btree *b, struct btree_op *op, + struct gc_stat *gc, struct gc_merge_info *r) +{ + unsigned int i, nodes = 0, keys = 0, blocks; + struct btree *new_nodes[GC_MERGE_NODES]; + struct keylist keylist; + struct closure cl; + struct bkey *k; + + bch_keylist_init(&keylist); + + if (btree_check_reserve(b, NULL)) + return 0; + + memset(new_nodes, 0, sizeof(new_nodes)); + closure_init_stack(&cl); + + while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) + keys += r[nodes++].keys; + + blocks = btree_default_blocks(b->c) * 2 / 3; + + if (nodes < 2 || + __set_blocks(b->keys.set[0].data, keys, + block_bytes(b->c->cache)) > blocks * (nodes - 1)) + return 0; + + for (i = 0; i < nodes; i++) { + new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); + if (IS_ERR(new_nodes[i])) + goto out_nocoalesce; + } + + /* + * We have to check the reserve here, after we've allocated our new + * nodes, to make sure the insert below will succeed - we also check + * before as an optimization to potentially avoid a bunch of expensive + * allocs/sorts + */ + if (btree_check_reserve(b, NULL)) + goto out_nocoalesce; + + for (i = 0; i < nodes; i++) + mutex_lock(&new_nodes[i]->write_lock); + + for (i = nodes - 1; i > 0; --i) { + struct bset *n1 = btree_bset_first(new_nodes[i]); + struct bset *n2 = btree_bset_first(new_nodes[i - 1]); + struct bkey *k, *last = NULL; + + keys = 0; + + if (i > 1) { + for (k = n2->start; + k < bset_bkey_last(n2); + k = bkey_next(k)) { + if (__set_blocks(n1, n1->keys + keys + + bkey_u64s(k), + block_bytes(b->c->cache)) > blocks) + break; + + last = k; + keys += bkey_u64s(k); + } + } else { + /* + * Last node we're not getting rid of - we're getting + * rid of the node at r[0]. Have to try and fit all of + * the remaining keys into this node; we can't ensure + * they will always fit due to rounding and variable + * length keys (shouldn't be possible in practice, + * though) + */ + if (__set_blocks(n1, n1->keys + n2->keys, + block_bytes(b->c->cache)) > + btree_blocks(new_nodes[i])) + goto out_unlock_nocoalesce; + + keys = n2->keys; + /* Take the key of the node we're getting rid of */ + last = &r->b->key; + } + + BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c->cache)) > + btree_blocks(new_nodes[i])); + + if (last) + bkey_copy_key(&new_nodes[i]->key, last); + + memcpy(bset_bkey_last(n1), + n2->start, + (void *) bset_bkey_idx(n2, keys) - (void *) n2->start); + + n1->keys += keys; + r[i].keys = n1->keys; + + memmove(n2->start, + bset_bkey_idx(n2, keys), + (void *) bset_bkey_last(n2) - + (void *) bset_bkey_idx(n2, keys)); + + n2->keys -= keys; + + if (__bch_keylist_realloc(&keylist, + bkey_u64s(&new_nodes[i]->key))) + goto out_unlock_nocoalesce; + + bch_btree_node_write(new_nodes[i], &cl); + bch_keylist_add(&keylist, &new_nodes[i]->key); + } + + for (i = 0; i < nodes; i++) + mutex_unlock(&new_nodes[i]->write_lock); + + closure_sync(&cl); + + /* We emptied out this node */ + BUG_ON(btree_bset_first(new_nodes[0])->keys); + btree_node_free(new_nodes[0]); + rw_unlock(true, new_nodes[0]); + new_nodes[0] = NULL; + + for (i = 0; i < nodes; i++) { + if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key))) + goto out_nocoalesce; + + make_btree_freeing_key(r[i].b, keylist.top); + bch_keylist_push(&keylist); + } + + bch_btree_insert_node(b, op, &keylist, NULL, NULL); + BUG_ON(!bch_keylist_empty(&keylist)); + + for (i = 0; i < nodes; i++) { + btree_node_free(r[i].b); + rw_unlock(true, r[i].b); + + r[i].b = new_nodes[i]; + } + + memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); + r[nodes - 1].b = ERR_PTR(-EINTR); + + trace_bcache_btree_gc_coalesce(nodes); + gc->nodes--; + + bch_keylist_free(&keylist); + + /* Invalidated our iterator */ + return -EINTR; + +out_unlock_nocoalesce: + for (i = 0; i < nodes; i++) + mutex_unlock(&new_nodes[i]->write_lock); + +out_nocoalesce: + closure_sync(&cl); + + while ((k = bch_keylist_pop(&keylist))) + if (!bkey_cmp(k, &ZERO_KEY)) + atomic_dec(&b->c->prio_blocked); + bch_keylist_free(&keylist); + + for (i = 0; i < nodes; i++) + if (!IS_ERR_OR_NULL(new_nodes[i])) { + btree_node_free(new_nodes[i]); + rw_unlock(true, new_nodes[i]); + } + return 0; +} + +static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, + struct btree *replace) +{ + struct keylist keys; + struct btree *n; + + if (btree_check_reserve(b, NULL)) + return 0; + + n = btree_node_alloc_replacement(replace, NULL); + if (IS_ERR(n)) + return 0; + + /* recheck reserve after allocating replacement node */ + if (btree_check_reserve(b, NULL)) { + btree_node_free(n); + rw_unlock(true, n); + return 0; + } + + bch_btree_node_write_sync(n); + + bch_keylist_init(&keys); + bch_keylist_add(&keys, &n->key); + + make_btree_freeing_key(replace, keys.top); + bch_keylist_push(&keys); + + bch_btree_insert_node(b, op, &keys, NULL, NULL); + BUG_ON(!bch_keylist_empty(&keys)); + + btree_node_free(replace); + rw_unlock(true, n); + + /* Invalidated our iterator */ + return -EINTR; +} + +static unsigned int btree_gc_count_keys(struct btree *b) +{ + struct bkey *k; + struct btree_iter iter; + unsigned int ret = 0; + + for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) + ret += bkey_u64s(k); + + return ret; +} + +static size_t btree_gc_min_nodes(struct cache_set *c) +{ + size_t min_nodes; + + /* + * Since incremental GC would stop 100ms when front + * side I/O comes, so when there are many btree nodes, + * if GC only processes constant (100) nodes each time, + * GC would last a long time, and the front side I/Os + * would run out of the buckets (since no new bucket + * can be allocated during GC), and be blocked again. + * So GC should not process constant nodes, but varied + * nodes according to the number of btree nodes, which + * realized by dividing GC into constant(100) times, + * so when there are many btree nodes, GC can process + * more nodes each time, otherwise, GC will process less + * nodes each time (but no less than MIN_GC_NODES) + */ + min_nodes = c->gc_stats.nodes / MAX_GC_TIMES; + if (min_nodes < MIN_GC_NODES) + min_nodes = MIN_GC_NODES; + + return min_nodes; +} + + +static int btree_gc_recurse(struct btree *b, struct btree_op *op, + struct closure *writes, struct gc_stat *gc) +{ + int ret = 0; + bool should_rewrite; + struct bkey *k; + struct btree_iter iter; + struct gc_merge_info r[GC_MERGE_NODES]; + struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; + + bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); + + for (i = r; i < r + ARRAY_SIZE(r); i++) + i->b = ERR_PTR(-EINTR); + + while (1) { + k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); + if (k) { + r->b = bch_btree_node_get(b->c, op, k, b->level - 1, + true, b); + if (IS_ERR(r->b)) { + ret = PTR_ERR(r->b); + break; + } + + r->keys = btree_gc_count_keys(r->b); + + ret = btree_gc_coalesce(b, op, gc, r); + if (ret) + break; + } + + if (!last->b) + break; + + if (!IS_ERR(last->b)) { + should_rewrite = btree_gc_mark_node(last->b, gc); + if (should_rewrite) { + ret = btree_gc_rewrite_node(b, op, last->b); + if (ret) + break; + } + + if (last->b->level) { + ret = btree_gc_recurse(last->b, op, writes, gc); + if (ret) + break; + } + + bkey_copy_key(&b->c->gc_done, &last->b->key); + + /* + * Must flush leaf nodes before gc ends, since replace + * operations aren't journalled + */ + mutex_lock(&last->b->write_lock); + if (btree_node_dirty(last->b)) + bch_btree_node_write(last->b, writes); + mutex_unlock(&last->b->write_lock); + rw_unlock(true, last->b); + } + + memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); + r->b = NULL; + + if (atomic_read(&b->c->search_inflight) && + gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) { + gc->nodes_pre = gc->nodes; + ret = -EAGAIN; + break; + } + + if (need_resched()) { + ret = -EAGAIN; + break; + } + } + + for (i = r; i < r + ARRAY_SIZE(r); i++) + if (!IS_ERR_OR_NULL(i->b)) { + mutex_lock(&i->b->write_lock); + if (btree_node_dirty(i->b)) + bch_btree_node_write(i->b, writes); + mutex_unlock(&i->b->write_lock); + rw_unlock(true, i->b); + } + + return ret; +} + +static int bch_btree_gc_root(struct btree *b, struct btree_op *op, + struct closure *writes, struct gc_stat *gc) +{ + struct btree *n = NULL; + int ret = 0; + bool should_rewrite; + + should_rewrite = btree_gc_mark_node(b, gc); + if (should_rewrite) { + n = btree_node_alloc_replacement(b, NULL); + + if (!IS_ERR(n)) { + bch_btree_node_write_sync(n); + + bch_btree_set_root(n); + btree_node_free(b); + rw_unlock(true, n); + + return -EINTR; + } + } + + __bch_btree_mark_key(b->c, b->level + 1, &b->key); + + if (b->level) { + ret = btree_gc_recurse(b, op, writes, gc); + if (ret) + return ret; + } + + bkey_copy_key(&b->c->gc_done, &b->key); + + return ret; +} + +static void btree_gc_start(struct cache_set *c) +{ + struct cache *ca; + struct bucket *b; + + if (!c->gc_mark_valid) + return; + + mutex_lock(&c->bucket_lock); + + c->gc_mark_valid = 0; + c->gc_done = ZERO_KEY; + + ca = c->cache; + for_each_bucket(b, ca) { + b->last_gc = b->gen; + if (!atomic_read(&b->pin)) { + SET_GC_MARK(b, 0); + SET_GC_SECTORS_USED(b, 0); + } + } + + mutex_unlock(&c->bucket_lock); +} + +static void bch_btree_gc_finish(struct cache_set *c) +{ + struct bucket *b; + struct cache *ca; + unsigned int i, j; + uint64_t *k; + + mutex_lock(&c->bucket_lock); + + set_gc_sectors(c); + c->gc_mark_valid = 1; + c->need_gc = 0; + + for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) + SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), + GC_MARK_METADATA); + + /* don't reclaim buckets to which writeback keys point */ + rcu_read_lock(); + for (i = 0; i < c->devices_max_used; i++) { + struct bcache_device *d = c->devices[i]; + struct cached_dev *dc; + struct keybuf_key *w, *n; + + if (!d || UUID_FLASH_ONLY(&c->uuids[i])) + continue; + dc = container_of(d, struct cached_dev, disk); + + spin_lock(&dc->writeback_keys.lock); + rbtree_postorder_for_each_entry_safe(w, n, + &dc->writeback_keys.keys, node) + for (j = 0; j < KEY_PTRS(&w->key); j++) + SET_GC_MARK(PTR_BUCKET(c, &w->key, j), + GC_MARK_DIRTY); + spin_unlock(&dc->writeback_keys.lock); + } + rcu_read_unlock(); + + c->avail_nbuckets = 0; + + ca = c->cache; + ca->invalidate_needs_gc = 0; + + for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++) + SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); + + for (k = ca->prio_buckets; + k < ca->prio_buckets + prio_buckets(ca) * 2; k++) + SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); + + for_each_bucket(b, ca) { + c->need_gc = max(c->need_gc, bucket_gc_gen(b)); + + if (atomic_read(&b->pin)) + continue; + + BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); + + if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) + c->avail_nbuckets++; + } + + mutex_unlock(&c->bucket_lock); +} + +static void bch_btree_gc(struct cache_set *c) +{ + int ret; + struct gc_stat stats; + struct closure writes; + struct btree_op op; + uint64_t start_time = local_clock(); + + trace_bcache_gc_start(c); + + memset(&stats, 0, sizeof(struct gc_stat)); + closure_init_stack(&writes); + bch_btree_op_init(&op, SHRT_MAX); + + btree_gc_start(c); + + /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ + do { + ret = bcache_btree_root(gc_root, c, &op, &writes, &stats); + closure_sync(&writes); + cond_resched(); + + if (ret == -EAGAIN) + schedule_timeout_interruptible(msecs_to_jiffies + (GC_SLEEP_MS)); + else if (ret) + pr_warn("gc failed!\n"); + } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); + + bch_btree_gc_finish(c); + wake_up_allocators(c); + + bch_time_stats_update(&c->btree_gc_time, start_time); + + stats.key_bytes *= sizeof(uint64_t); + stats.data <<= 9; + bch_update_bucket_in_use(c, &stats); + memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); + + trace_bcache_gc_end(c); + + bch_moving_gc(c); +} + +static bool gc_should_run(struct cache_set *c) +{ + struct cache *ca = c->cache; + + if (ca->invalidate_needs_gc) + return true; + + if (atomic_read(&c->sectors_to_gc) < 0) + return true; + + return false; +} + +static int bch_gc_thread(void *arg) +{ + struct cache_set *c = arg; + + while (1) { + wait_event_interruptible(c->gc_wait, + kthread_should_stop() || + test_bit(CACHE_SET_IO_DISABLE, &c->flags) || + gc_should_run(c)); + + if (kthread_should_stop() || + test_bit(CACHE_SET_IO_DISABLE, &c->flags)) + break; + + set_gc_sectors(c); + bch_btree_gc(c); + } + + wait_for_kthread_stop(); + return 0; +} + +int bch_gc_thread_start(struct cache_set *c) +{ + c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); + return PTR_ERR_OR_ZERO(c->gc_thread); +} + +/* Initial partial gc */ + +static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) +{ + int ret = 0; + struct bkey *k, *p = NULL; + struct btree_iter iter; + + for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) + bch_initial_mark_key(b->c, b->level, k); + + bch_initial_mark_key(b->c, b->level + 1, &b->key); + + if (b->level) { + bch_btree_iter_init(&b->keys, &iter, NULL); + + do { + k = bch_btree_iter_next_filter(&iter, &b->keys, + bch_ptr_bad); + if (k) { + btree_node_prefetch(b, k); + /* + * initiallize c->gc_stats.nodes + * for incremental GC + */ + b->c->gc_stats.nodes++; + } + + if (p) + ret = bcache_btree(check_recurse, p, b, op); + + p = k; + } while (p && !ret); + } + + return ret; +} + + +static int bch_btree_check_thread(void *arg) +{ + int ret; + struct btree_check_info *info = arg; + struct btree_check_state *check_state = info->state; + struct cache_set *c = check_state->c; + struct btree_iter iter; + struct bkey *k, *p; + int cur_idx, prev_idx, skip_nr; + + k = p = NULL; + cur_idx = prev_idx = 0; + ret = 0; + + /* root node keys are checked before thread created */ + bch_btree_iter_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + BUG_ON(!k); + + p = k; + while (k) { + /* + * Fetch a root node key index, skip the keys which + * should be fetched by other threads, then check the + * sub-tree indexed by the fetched key. + */ + spin_lock(&check_state->idx_lock); + cur_idx = check_state->key_idx; + check_state->key_idx++; + spin_unlock(&check_state->idx_lock); + + skip_nr = cur_idx - prev_idx; + + while (skip_nr) { + k = bch_btree_iter_next_filter(&iter, + &c->root->keys, + bch_ptr_bad); + if (k) + p = k; + else { + /* + * No more keys to check in root node, + * current checking threads are enough, + * stop creating more. + */ + atomic_set(&check_state->enough, 1); + /* Update check_state->enough earlier */ + smp_mb__after_atomic(); + goto out; + } + skip_nr--; + cond_resched(); + } + + if (p) { + struct btree_op op; + + btree_node_prefetch(c->root, p); + c->gc_stats.nodes++; + bch_btree_op_init(&op, 0); + ret = bcache_btree(check_recurse, p, c->root, &op); + /* + * The op may be added to cache_set's btree_cache_wait + * in mca_cannibalize(), must ensure it is removed from + * the list and release btree_cache_alloc_lock before + * free op memory. + * Otherwise, the btree_cache_wait will be damaged. + */ + bch_cannibalize_unlock(c); + finish_wait(&c->btree_cache_wait, &(&op)->wait); + if (ret) + goto out; + } + p = NULL; + prev_idx = cur_idx; + cond_resched(); + } + +out: + info->result = ret; + /* update check_state->started among all CPUs */ + smp_mb__before_atomic(); + if (atomic_dec_and_test(&check_state->started)) + wake_up(&check_state->wait); + + return ret; +} + + + +static int bch_btree_chkthread_nr(void) +{ + int n = num_online_cpus()/2; + + if (n == 0) + n = 1; + else if (n > BCH_BTR_CHKTHREAD_MAX) + n = BCH_BTR_CHKTHREAD_MAX; + + return n; +} + +int bch_btree_check(struct cache_set *c) +{ + int ret = 0; + int i; + struct bkey *k = NULL; + struct btree_iter iter; + struct btree_check_state check_state; + + /* check and mark root node keys */ + for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) + bch_initial_mark_key(c, c->root->level, k); + + bch_initial_mark_key(c, c->root->level + 1, &c->root->key); + + if (c->root->level == 0) + return 0; + + memset(&check_state, 0, sizeof(struct btree_check_state)); + check_state.c = c; + check_state.total_threads = bch_btree_chkthread_nr(); + check_state.key_idx = 0; + spin_lock_init(&check_state.idx_lock); + atomic_set(&check_state.started, 0); + atomic_set(&check_state.enough, 0); + init_waitqueue_head(&check_state.wait); + + rw_lock(0, c->root, c->root->level); + /* + * Run multiple threads to check btree nodes in parallel, + * if check_state.enough is non-zero, it means current + * running check threads are enough, unncessary to create + * more. + */ + for (i = 0; i < check_state.total_threads; i++) { + /* fetch latest check_state.enough earlier */ + smp_mb__before_atomic(); + if (atomic_read(&check_state.enough)) + break; + + check_state.infos[i].result = 0; + check_state.infos[i].state = &check_state; + + check_state.infos[i].thread = + kthread_run(bch_btree_check_thread, + &check_state.infos[i], + "bch_btrchk[%d]", i); + if (IS_ERR(check_state.infos[i].thread)) { + pr_err("fails to run thread bch_btrchk[%d]\n", i); + for (--i; i >= 0; i--) + kthread_stop(check_state.infos[i].thread); + ret = -ENOMEM; + goto out; + } + atomic_inc(&check_state.started); + } + + /* + * Must wait for all threads to stop. + */ + wait_event(check_state.wait, atomic_read(&check_state.started) == 0); + + for (i = 0; i < check_state.total_threads; i++) { + if (check_state.infos[i].result) { + ret = check_state.infos[i].result; + goto out; + } + } + +out: + rw_unlock(0, c->root); + return ret; +} + +void bch_initial_gc_finish(struct cache_set *c) +{ + struct cache *ca = c->cache; + struct bucket *b; + + bch_btree_gc_finish(c); + + mutex_lock(&c->bucket_lock); + + /* + * We need to put some unused buckets directly on the prio freelist in + * order to get the allocator thread started - it needs freed buckets in + * order to rewrite the prios and gens, and it needs to rewrite prios + * and gens in order to free buckets. + * + * This is only safe for buckets that have no live data in them, which + * there should always be some of. + */ + for_each_bucket(b, ca) { + if (fifo_full(&ca->free[RESERVE_PRIO]) && + fifo_full(&ca->free[RESERVE_BTREE])) + break; + + if (bch_can_invalidate_bucket(ca, b) && + !GC_MARK(b)) { + __bch_invalidate_one_bucket(ca, b); + if (!fifo_push(&ca->free[RESERVE_PRIO], + b - ca->buckets)) + fifo_push(&ca->free[RESERVE_BTREE], + b - ca->buckets); + } + } + + mutex_unlock(&c->bucket_lock); +} + +/* Btree insertion */ + +static bool btree_insert_key(struct btree *b, struct bkey *k, + struct bkey *replace_key) +{ + unsigned int status; + + BUG_ON(bkey_cmp(k, &b->key) > 0); + + status = bch_btree_insert_key(&b->keys, k, replace_key); + if (status != BTREE_INSERT_STATUS_NO_INSERT) { + bch_check_keys(&b->keys, "%u for %s", status, + replace_key ? "replace" : "insert"); + + trace_bcache_btree_insert_key(b, k, replace_key != NULL, + status); + return true; + } else + return false; +} + +static size_t insert_u64s_remaining(struct btree *b) +{ + long ret = bch_btree_keys_u64s_remaining(&b->keys); + + /* + * Might land in the middle of an existing extent and have to split it + */ + if (b->keys.ops->is_extents) + ret -= KEY_MAX_U64S; + + return max(ret, 0L); +} + +static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + struct bkey *replace_key) +{ + bool ret = false; + int oldsize = bch_count_data(&b->keys); + + while (!bch_keylist_empty(insert_keys)) { + struct bkey *k = insert_keys->keys; + + if (bkey_u64s(k) > insert_u64s_remaining(b)) + break; + + if (bkey_cmp(k, &b->key) <= 0) { + if (!b->level) + bkey_put(b->c, k); + + ret |= btree_insert_key(b, k, replace_key); + bch_keylist_pop_front(insert_keys); + } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { + BKEY_PADDED(key) temp; + bkey_copy(&temp.key, insert_keys->keys); + + bch_cut_back(&b->key, &temp.key); + bch_cut_front(&b->key, insert_keys->keys); + + ret |= btree_insert_key(b, &temp.key, replace_key); + break; + } else { + break; + } + } + + if (!ret) + op->insert_collision = true; + + BUG_ON(!bch_keylist_empty(insert_keys) && b->level); + + BUG_ON(bch_count_data(&b->keys) < oldsize); + return ret; +} + +static int btree_split(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + struct bkey *replace_key) +{ + bool split; + struct btree *n1, *n2 = NULL, *n3 = NULL; + uint64_t start_time = local_clock(); + struct closure cl; + struct keylist parent_keys; + + closure_init_stack(&cl); + bch_keylist_init(&parent_keys); + + if (btree_check_reserve(b, op)) { + if (!b->level) + return -EINTR; + else + WARN(1, "insufficient reserve for split\n"); + } + + n1 = btree_node_alloc_replacement(b, op); + if (IS_ERR(n1)) + goto err; + + split = set_blocks(btree_bset_first(n1), + block_bytes(n1->c->cache)) > (btree_blocks(b) * 4) / 5; + + if (split) { + unsigned int keys = 0; + + trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); + + n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent); + if (IS_ERR(n2)) + goto err_free1; + + if (!b->parent) { + n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL); + if (IS_ERR(n3)) + goto err_free2; + } + + mutex_lock(&n1->write_lock); + mutex_lock(&n2->write_lock); + + bch_btree_insert_keys(n1, op, insert_keys, replace_key); + + /* + * Has to be a linear search because we don't have an auxiliary + * search tree yet + */ + + while (keys < (btree_bset_first(n1)->keys * 3) / 5) + keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), + keys)); + + bkey_copy_key(&n1->key, + bset_bkey_idx(btree_bset_first(n1), keys)); + keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); + + btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys; + btree_bset_first(n1)->keys = keys; + + memcpy(btree_bset_first(n2)->start, + bset_bkey_last(btree_bset_first(n1)), + btree_bset_first(n2)->keys * sizeof(uint64_t)); + + bkey_copy_key(&n2->key, &b->key); + + bch_keylist_add(&parent_keys, &n2->key); + bch_btree_node_write(n2, &cl); + mutex_unlock(&n2->write_lock); + rw_unlock(true, n2); + } else { + trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); + + mutex_lock(&n1->write_lock); + bch_btree_insert_keys(n1, op, insert_keys, replace_key); + } + + bch_keylist_add(&parent_keys, &n1->key); + bch_btree_node_write(n1, &cl); + mutex_unlock(&n1->write_lock); + + if (n3) { + /* Depth increases, make a new root */ + mutex_lock(&n3->write_lock); + bkey_copy_key(&n3->key, &MAX_KEY); + bch_btree_insert_keys(n3, op, &parent_keys, NULL); + bch_btree_node_write(n3, &cl); + mutex_unlock(&n3->write_lock); + + closure_sync(&cl); + bch_btree_set_root(n3); + rw_unlock(true, n3); + } else if (!b->parent) { + /* Root filled up but didn't need to be split */ + closure_sync(&cl); + bch_btree_set_root(n1); + } else { + /* Split a non root node */ + closure_sync(&cl); + make_btree_freeing_key(b, parent_keys.top); + bch_keylist_push(&parent_keys); + + bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); + BUG_ON(!bch_keylist_empty(&parent_keys)); + } + + btree_node_free(b); + rw_unlock(true, n1); + + bch_time_stats_update(&b->c->btree_split_time, start_time); + + return 0; +err_free2: + bkey_put(b->c, &n2->key); + btree_node_free(n2); + rw_unlock(true, n2); +err_free1: + bkey_put(b->c, &n1->key); + btree_node_free(n1); + rw_unlock(true, n1); +err: + WARN(1, "bcache: btree split failed (level %u)", b->level); + + if (n3 == ERR_PTR(-EAGAIN) || + n2 == ERR_PTR(-EAGAIN) || + n1 == ERR_PTR(-EAGAIN)) + return -EAGAIN; + + return -ENOMEM; +} + +static int bch_btree_insert_node(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + atomic_t *journal_ref, + struct bkey *replace_key) +{ + struct closure cl; + + BUG_ON(b->level && replace_key); + + closure_init_stack(&cl); + + mutex_lock(&b->write_lock); + + if (write_block(b) != btree_bset_last(b) && + b->keys.last_set_unwritten) + bch_btree_init_next(b); /* just wrote a set */ + + if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { + mutex_unlock(&b->write_lock); + goto split; + } + + BUG_ON(write_block(b) != btree_bset_last(b)); + + if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { + if (!b->level) + bch_btree_leaf_dirty(b, journal_ref); + else + bch_btree_node_write(b, &cl); + } + + mutex_unlock(&b->write_lock); + + /* wait for btree node write if necessary, after unlock */ + closure_sync(&cl); + + return 0; +split: + if (current->bio_list) { + op->lock = b->c->root->level + 1; + return -EAGAIN; + } else if (op->lock <= b->c->root->level) { + op->lock = b->c->root->level + 1; + return -EINTR; + } else { + /* Invalidated all iterators */ + int ret = btree_split(b, op, insert_keys, replace_key); + + if (bch_keylist_empty(insert_keys)) + return 0; + else if (!ret) + return -EINTR; + return ret; + } +} + +int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, + struct bkey *check_key) +{ + int ret = -EINTR; + uint64_t btree_ptr = b->key.ptr[0]; + unsigned long seq = b->seq; + struct keylist insert; + bool upgrade = op->lock == -1; + + bch_keylist_init(&insert); + + if (upgrade) { + rw_unlock(false, b); + rw_lock(true, b, b->level); + + if (b->key.ptr[0] != btree_ptr || + b->seq != seq + 1) { + op->lock = b->level; + goto out; + } + } + + SET_KEY_PTRS(check_key, 1); + get_random_bytes(&check_key->ptr[0], sizeof(uint64_t)); + + SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV); + + bch_keylist_add(&insert, check_key); + + ret = bch_btree_insert_node(b, op, &insert, NULL, NULL); + + BUG_ON(!ret && !bch_keylist_empty(&insert)); +out: + if (upgrade) + downgrade_write(&b->lock); + return ret; +} + +struct btree_insert_op { + struct btree_op op; + struct keylist *keys; + atomic_t *journal_ref; + struct bkey *replace_key; +}; + +static int btree_insert_fn(struct btree_op *b_op, struct btree *b) +{ + struct btree_insert_op *op = container_of(b_op, + struct btree_insert_op, op); + + int ret = bch_btree_insert_node(b, &op->op, op->keys, + op->journal_ref, op->replace_key); + if (ret && !bch_keylist_empty(op->keys)) + return ret; + else + return MAP_DONE; +} + +int bch_btree_insert(struct cache_set *c, struct keylist *keys, + atomic_t *journal_ref, struct bkey *replace_key) +{ + struct btree_insert_op op; + int ret = 0; + + BUG_ON(current->bio_list); + BUG_ON(bch_keylist_empty(keys)); + + bch_btree_op_init(&op.op, 0); + op.keys = keys; + op.journal_ref = journal_ref; + op.replace_key = replace_key; + + while (!ret && !bch_keylist_empty(keys)) { + op.op.lock = 0; + ret = bch_btree_map_leaf_nodes(&op.op, c, + &START_KEY(keys->keys), + btree_insert_fn); + } + + if (ret) { + struct bkey *k; + + pr_err("error %i\n", ret); + + while ((k = bch_keylist_pop(keys))) + bkey_put(c, k); + } else if (op.op.insert_collision) + ret = -ESRCH; + + return ret; +} + +void bch_btree_set_root(struct btree *b) +{ + unsigned int i; + struct closure cl; + + closure_init_stack(&cl); + + trace_bcache_btree_set_root(b); + + BUG_ON(!b->written); + + for (i = 0; i < KEY_PTRS(&b->key); i++) + BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); + + mutex_lock(&b->c->bucket_lock); + list_del_init(&b->list); + mutex_unlock(&b->c->bucket_lock); + + b->c->root = b; + + bch_journal_meta(b->c, &cl); + closure_sync(&cl); +} + +/* Map across nodes or keys */ + +static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, + btree_map_nodes_fn *fn, int flags) +{ + int ret = MAP_CONTINUE; + + if (b->level) { + struct bkey *k; + struct btree_iter iter; + + bch_btree_iter_init(&b->keys, &iter, from); + + while ((k = bch_btree_iter_next_filter(&iter, &b->keys, + bch_ptr_bad))) { + ret = bcache_btree(map_nodes_recurse, k, b, + op, from, fn, flags); + from = NULL; + + if (ret != MAP_CONTINUE) + return ret; + } + } + + if (!b->level || flags == MAP_ALL_NODES) + ret = fn(op, b); + + return ret; +} + +int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn, int flags) +{ + return bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags); +} + +int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, btree_map_keys_fn *fn, + int flags) +{ + int ret = MAP_CONTINUE; + struct bkey *k; + struct btree_iter iter; + + bch_btree_iter_init(&b->keys, &iter, from); + + while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { + ret = !b->level + ? fn(op, b, k) + : bcache_btree(map_keys_recurse, k, + b, op, from, fn, flags); + from = NULL; + + if (ret != MAP_CONTINUE) + return ret; + } + + if (!b->level && (flags & MAP_END_KEY)) + ret = fn(op, b, &KEY(KEY_INODE(&b->key), + KEY_OFFSET(&b->key), 0)); + + return ret; +} + +int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_keys_fn *fn, int flags) +{ + return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags); +} + +/* Keybuf code */ + +static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) +{ + /* Overlapping keys compare equal */ + if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) + return -1; + if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) + return 1; + return 0; +} + +static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, + struct keybuf_key *r) +{ + return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); +} + +struct refill { + struct btree_op op; + unsigned int nr_found; + struct keybuf *buf; + struct bkey *end; + keybuf_pred_fn *pred; +}; + +static int refill_keybuf_fn(struct btree_op *op, struct btree *b, + struct bkey *k) +{ + struct refill *refill = container_of(op, struct refill, op); + struct keybuf *buf = refill->buf; + int ret = MAP_CONTINUE; + + if (bkey_cmp(k, refill->end) > 0) { + ret = MAP_DONE; + goto out; + } + + if (!KEY_SIZE(k)) /* end key */ + goto out; + + if (refill->pred(buf, k)) { + struct keybuf_key *w; + + spin_lock(&buf->lock); + + w = array_alloc(&buf->freelist); + if (!w) { + spin_unlock(&buf->lock); + return MAP_DONE; + } + + w->private = NULL; + bkey_copy(&w->key, k); + + if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) + array_free(&buf->freelist, w); + else + refill->nr_found++; + + if (array_freelist_empty(&buf->freelist)) + ret = MAP_DONE; + + spin_unlock(&buf->lock); + } +out: + buf->last_scanned = *k; + return ret; +} + +void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, + struct bkey *end, keybuf_pred_fn *pred) +{ + struct bkey start = buf->last_scanned; + struct refill refill; + + cond_resched(); + + bch_btree_op_init(&refill.op, -1); + refill.nr_found = 0; + refill.buf = buf; + refill.end = end; + refill.pred = pred; + + bch_btree_map_keys(&refill.op, c, &buf->last_scanned, + refill_keybuf_fn, MAP_END_KEY); + + trace_bcache_keyscan(refill.nr_found, + KEY_INODE(&start), KEY_OFFSET(&start), + KEY_INODE(&buf->last_scanned), + KEY_OFFSET(&buf->last_scanned)); + + spin_lock(&buf->lock); + + if (!RB_EMPTY_ROOT(&buf->keys)) { + struct keybuf_key *w; + + w = RB_FIRST(&buf->keys, struct keybuf_key, node); + buf->start = START_KEY(&w->key); + + w = RB_LAST(&buf->keys, struct keybuf_key, node); + buf->end = w->key; + } else { + buf->start = MAX_KEY; + buf->end = MAX_KEY; + } + + spin_unlock(&buf->lock); +} + +static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) +{ + rb_erase(&w->node, &buf->keys); + array_free(&buf->freelist, w); +} + +void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) +{ + spin_lock(&buf->lock); + __bch_keybuf_del(buf, w); + spin_unlock(&buf->lock); +} + +bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, + struct bkey *end) +{ + bool ret = false; + struct keybuf_key *p, *w, s; + + s.key = *start; + + if (bkey_cmp(end, &buf->start) <= 0 || + bkey_cmp(start, &buf->end) >= 0) + return false; + + spin_lock(&buf->lock); + w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); + + while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { + p = w; + w = RB_NEXT(w, node); + + if (p->private) + ret = true; + else + __bch_keybuf_del(buf, p); + } + + spin_unlock(&buf->lock); + return ret; +} + +struct keybuf_key *bch_keybuf_next(struct keybuf *buf) +{ + struct keybuf_key *w; + + spin_lock(&buf->lock); + + w = RB_FIRST(&buf->keys, struct keybuf_key, node); + + while (w && w->private) + w = RB_NEXT(w, node); + + if (w) + w->private = ERR_PTR(-EINTR); + + spin_unlock(&buf->lock); + return w; +} + +struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, + struct keybuf *buf, + struct bkey *end, + keybuf_pred_fn *pred) +{ + struct keybuf_key *ret; + + while (1) { + ret = bch_keybuf_next(buf); + if (ret) + break; + + if (bkey_cmp(&buf->last_scanned, end) >= 0) { + pr_debug("scan finished\n"); + break; + } + + bch_refill_keybuf(c, buf, end, pred); + } + + return ret; +} + +void bch_keybuf_init(struct keybuf *buf) +{ + buf->last_scanned = MAX_KEY; + buf->keys = RB_ROOT; + + spin_lock_init(&buf->lock); + array_allocator_init(&buf->freelist); +} + +void bch_btree_exit(void) +{ + if (btree_io_wq) + destroy_workqueue(btree_io_wq); +} + +int __init bch_btree_init(void) +{ + btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0); + if (!btree_io_wq) + return -ENOMEM; + + return 0; +} |