diff options
Diffstat (limited to 'drivers/block/aoe/aoedev.c')
-rw-r--r-- | drivers/block/aoe/aoedev.c | 532 |
1 files changed, 532 insertions, 0 deletions
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c new file mode 100644 index 000000000..e2ea2356d --- /dev/null +++ b/drivers/block/aoe/aoedev.c @@ -0,0 +1,532 @@ +/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ +/* + * aoedev.c + * AoE device utility functions; maintains device list. + */ + +#include <linux/hdreg.h> +#include <linux/blk-mq.h> +#include <linux/netdevice.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/bitmap.h> +#include <linux/kdev_t.h> +#include <linux/moduleparam.h> +#include <linux/string.h> +#include "aoe.h" + +static void freetgt(struct aoedev *d, struct aoetgt *t); +static void skbpoolfree(struct aoedev *d); + +static int aoe_dyndevs = 1; +module_param(aoe_dyndevs, int, 0644); +MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices."); + +static struct aoedev *devlist; +static DEFINE_SPINLOCK(devlist_lock); + +/* Because some systems will have one, many, or no + * - partitions, + * - slots per shelf, + * - or shelves, + * we need some flexibility in the way the minor numbers + * are allocated. So they are dynamic. + */ +#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS) + +static DEFINE_SPINLOCK(used_minors_lock); +static DECLARE_BITMAP(used_minors, N_DEVS); + +static int +minor_get_dyn(ulong *sysminor) +{ + ulong flags; + ulong n; + int error = 0; + + spin_lock_irqsave(&used_minors_lock, flags); + n = find_first_zero_bit(used_minors, N_DEVS); + if (n < N_DEVS) + set_bit(n, used_minors); + else + error = -1; + spin_unlock_irqrestore(&used_minors_lock, flags); + + *sysminor = n * AOE_PARTITIONS; + return error; +} + +static int +minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin) +{ + ulong flags; + ulong n; + int error = 0; + enum { + /* for backwards compatibility when !aoe_dyndevs, + * a static number of supported slots per shelf */ + NPERSHELF = 16, + }; + + if (aoemin >= NPERSHELF) { + pr_err("aoe: %s %d slots per shelf\n", + "static minor device numbers support only", + NPERSHELF); + error = -1; + goto out; + } + + n = aoemaj * NPERSHELF + aoemin; + if (n >= N_DEVS) { + pr_err("aoe: %s with e%ld.%d\n", + "cannot use static minor device numbers", + aoemaj, aoemin); + error = -1; + goto out; + } + + spin_lock_irqsave(&used_minors_lock, flags); + if (test_bit(n, used_minors)) { + pr_err("aoe: %s %lu\n", + "existing device already has static minor number", + n); + error = -1; + } else + set_bit(n, used_minors); + spin_unlock_irqrestore(&used_minors_lock, flags); + *sysminor = n * AOE_PARTITIONS; +out: + return error; +} + +static int +minor_get(ulong *sysminor, ulong aoemaj, int aoemin) +{ + if (aoe_dyndevs) + return minor_get_dyn(sysminor); + else + return minor_get_static(sysminor, aoemaj, aoemin); +} + +static void +minor_free(ulong minor) +{ + ulong flags; + + minor /= AOE_PARTITIONS; + BUG_ON(minor >= N_DEVS); + + spin_lock_irqsave(&used_minors_lock, flags); + BUG_ON(!test_bit(minor, used_minors)); + clear_bit(minor, used_minors); + spin_unlock_irqrestore(&used_minors_lock, flags); +} + +/* + * Users who grab a pointer to the device with aoedev_by_aoeaddr + * automatically get a reference count and must be responsible + * for performing a aoedev_put. With the addition of async + * kthread processing I'm no longer confident that we can + * guarantee consistency in the face of device flushes. + * + * For the time being, we only bother to add extra references for + * frames sitting on the iocq. When the kthreads finish processing + * these frames, they will aoedev_put the device. + */ + +void +aoedev_put(struct aoedev *d) +{ + ulong flags; + + spin_lock_irqsave(&devlist_lock, flags); + d->ref--; + spin_unlock_irqrestore(&devlist_lock, flags); +} + +static void +dummy_timer(struct timer_list *t) +{ + struct aoedev *d; + + d = from_timer(d, t, timer); + if (d->flags & DEVFL_TKILL) + return; + d->timer.expires = jiffies + HZ; + add_timer(&d->timer); +} + +static void +aoe_failip(struct aoedev *d) +{ + struct request *rq; + struct aoe_req *req; + struct bio *bio; + + aoe_failbuf(d, d->ip.buf); + rq = d->ip.rq; + if (rq == NULL) + return; + + req = blk_mq_rq_to_pdu(rq); + while ((bio = d->ip.nxbio)) { + bio->bi_status = BLK_STS_IOERR; + d->ip.nxbio = bio->bi_next; + req->nr_bios--; + } + + if (!req->nr_bios) + aoe_end_request(d, rq, 0); +} + +static void +downdev_frame(struct list_head *pos) +{ + struct frame *f; + + f = list_entry(pos, struct frame, head); + list_del(pos); + if (f->buf) { + f->buf->nframesout--; + aoe_failbuf(f->t->d, f->buf); + } + aoe_freetframe(f); +} + +void +aoedev_downdev(struct aoedev *d) +{ + struct aoetgt *t, **tt, **te; + struct list_head *head, *pos, *nx; + int i; + + d->flags &= ~DEVFL_UP; + + /* clean out active and to-be-retransmitted buffers */ + for (i = 0; i < NFACTIVE; i++) { + head = &d->factive[i]; + list_for_each_safe(pos, nx, head) + downdev_frame(pos); + } + head = &d->rexmitq; + list_for_each_safe(pos, nx, head) + downdev_frame(pos); + + /* reset window dressings */ + tt = d->targets; + te = tt + d->ntargets; + for (; tt < te && (t = *tt); tt++) { + aoecmd_wreset(t); + t->nout = 0; + } + + /* clean out the in-process request (if any) */ + aoe_failip(d); + + /* fast fail all pending I/O */ + if (d->blkq) { + /* UP is cleared, freeze+quiesce to insure all are errored */ + blk_mq_freeze_queue(d->blkq); + blk_mq_quiesce_queue(d->blkq); + blk_mq_unquiesce_queue(d->blkq); + blk_mq_unfreeze_queue(d->blkq); + } + + if (d->gd) + set_capacity(d->gd, 0); +} + +/* return whether the user asked for this particular + * device to be flushed + */ +static int +user_req(char *s, size_t slen, struct aoedev *d) +{ + const char *p; + size_t lim; + + if (!d->gd) + return 0; + p = kbasename(d->gd->disk_name); + lim = sizeof(d->gd->disk_name); + lim -= p - d->gd->disk_name; + if (slen < lim) + lim = slen; + + return !strncmp(s, p, lim); +} + +static void +freedev(struct aoedev *d) +{ + struct aoetgt **t, **e; + int freeing = 0; + unsigned long flags; + + spin_lock_irqsave(&d->lock, flags); + if (d->flags & DEVFL_TKILL + && !(d->flags & DEVFL_FREEING)) { + d->flags |= DEVFL_FREEING; + freeing = 1; + } + spin_unlock_irqrestore(&d->lock, flags); + if (!freeing) + return; + + del_timer_sync(&d->timer); + if (d->gd) { + aoedisk_rm_debugfs(d); + del_gendisk(d->gd); + put_disk(d->gd); + blk_mq_free_tag_set(&d->tag_set); + blk_cleanup_queue(d->blkq); + } + t = d->targets; + e = t + d->ntargets; + for (; t < e && *t; t++) + freetgt(d, *t); + + mempool_destroy(d->bufpool); + skbpoolfree(d); + minor_free(d->sysminor); + + spin_lock_irqsave(&d->lock, flags); + d->flags |= DEVFL_FREED; + spin_unlock_irqrestore(&d->lock, flags); +} + +enum flush_parms { + NOT_EXITING = 0, + EXITING = 1, +}; + +static int +flush(const char __user *str, size_t cnt, int exiting) +{ + ulong flags; + struct aoedev *d, **dd; + char buf[16]; + int all = 0; + int specified = 0; /* flush a specific device */ + unsigned int skipflags; + + skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL; + + if (!exiting && cnt >= 3) { + if (cnt > sizeof buf) + cnt = sizeof buf; + if (copy_from_user(buf, str, cnt)) + return -EFAULT; + all = !strncmp(buf, "all", 3); + if (!all) + specified = 1; + } + + flush_scheduled_work(); + /* pass one: do aoedev_downdev, which might sleep */ +restart1: + spin_lock_irqsave(&devlist_lock, flags); + for (d = devlist; d; d = d->next) { + spin_lock(&d->lock); + if (d->flags & DEVFL_TKILL) + goto cont; + + if (exiting) { + /* unconditionally take each device down */ + } else if (specified) { + if (!user_req(buf, cnt, d)) + goto cont; + } else if ((!all && (d->flags & DEVFL_UP)) + || d->flags & skipflags + || d->nopen + || d->ref) + goto cont; + + spin_unlock(&d->lock); + spin_unlock_irqrestore(&devlist_lock, flags); + aoedev_downdev(d); + d->flags |= DEVFL_TKILL; + goto restart1; +cont: + spin_unlock(&d->lock); + } + spin_unlock_irqrestore(&devlist_lock, flags); + + /* pass two: call freedev, which might sleep, + * for aoedevs marked with DEVFL_TKILL + */ +restart2: + spin_lock_irqsave(&devlist_lock, flags); + for (d = devlist; d; d = d->next) { + spin_lock(&d->lock); + if (d->flags & DEVFL_TKILL + && !(d->flags & DEVFL_FREEING)) { + spin_unlock(&d->lock); + spin_unlock_irqrestore(&devlist_lock, flags); + freedev(d); + goto restart2; + } + spin_unlock(&d->lock); + } + + /* pass three: remove aoedevs marked with DEVFL_FREED */ + for (dd = &devlist, d = *dd; d; d = *dd) { + struct aoedev *doomed = NULL; + + spin_lock(&d->lock); + if (d->flags & DEVFL_FREED) { + *dd = d->next; + doomed = d; + } else { + dd = &d->next; + } + spin_unlock(&d->lock); + if (doomed) + kfree(doomed->targets); + kfree(doomed); + } + spin_unlock_irqrestore(&devlist_lock, flags); + + return 0; +} + +int +aoedev_flush(const char __user *str, size_t cnt) +{ + return flush(str, cnt, NOT_EXITING); +} + +/* This has been confirmed to occur once with Tms=3*1000 due to the + * driver changing link and not processing its transmit ring. The + * problem is hard enough to solve by returning an error that I'm + * still punting on "solving" this. + */ +static void +skbfree(struct sk_buff *skb) +{ + enum { Sms = 250, Tms = 30 * 1000}; + int i = Tms / Sms; + + if (skb == NULL) + return; + while (atomic_read(&skb_shinfo(skb)->dataref) != 1 && i-- > 0) + msleep(Sms); + if (i < 0) { + printk(KERN_ERR + "aoe: %s holds ref: %s\n", + skb->dev ? skb->dev->name : "netif", + "cannot free skb -- memory leaked."); + return; + } + skb->truesize -= skb->data_len; + skb_shinfo(skb)->nr_frags = skb->data_len = 0; + skb_trim(skb, 0); + dev_kfree_skb(skb); +} + +static void +skbpoolfree(struct aoedev *d) +{ + struct sk_buff *skb, *tmp; + + skb_queue_walk_safe(&d->skbpool, skb, tmp) + skbfree(skb); + + __skb_queue_head_init(&d->skbpool); +} + +/* find it or allocate it */ +struct aoedev * +aoedev_by_aoeaddr(ulong maj, int min, int do_alloc) +{ + struct aoedev *d; + int i; + ulong flags; + ulong sysminor = 0; + + spin_lock_irqsave(&devlist_lock, flags); + + for (d=devlist; d; d=d->next) + if (d->aoemajor == maj && d->aoeminor == min) { + spin_lock(&d->lock); + if (d->flags & DEVFL_TKILL) { + spin_unlock(&d->lock); + d = NULL; + goto out; + } + d->ref++; + spin_unlock(&d->lock); + break; + } + if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0) + goto out; + d = kcalloc(1, sizeof *d, GFP_ATOMIC); + if (!d) + goto out; + d->targets = kcalloc(NTARGETS, sizeof(*d->targets), GFP_ATOMIC); + if (!d->targets) { + kfree(d); + d = NULL; + goto out; + } + d->ntargets = NTARGETS; + INIT_WORK(&d->work, aoecmd_sleepwork); + spin_lock_init(&d->lock); + INIT_LIST_HEAD(&d->rq_list); + skb_queue_head_init(&d->skbpool); + timer_setup(&d->timer, dummy_timer, 0); + d->timer.expires = jiffies + HZ; + add_timer(&d->timer); + d->bufpool = NULL; /* defer to aoeblk_gdalloc */ + d->tgt = d->targets; + d->ref = 1; + for (i = 0; i < NFACTIVE; i++) + INIT_LIST_HEAD(&d->factive[i]); + INIT_LIST_HEAD(&d->rexmitq); + d->sysminor = sysminor; + d->aoemajor = maj; + d->aoeminor = min; + d->rttavg = RTTAVG_INIT; + d->rttdev = RTTDEV_INIT; + d->next = devlist; + devlist = d; + out: + spin_unlock_irqrestore(&devlist_lock, flags); + return d; +} + +static void +freetgt(struct aoedev *d, struct aoetgt *t) +{ + struct frame *f; + struct list_head *pos, *nx, *head; + struct aoeif *ifp; + + for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) { + if (!ifp->nd) + break; + dev_put(ifp->nd); + } + + head = &t->ffree; + list_for_each_safe(pos, nx, head) { + list_del(pos); + f = list_entry(pos, struct frame, head); + skbfree(f->skb); + kfree(f); + } + kfree(t); +} + +void +aoedev_exit(void) +{ + flush_scheduled_work(); + flush(NULL, 0, EXITING); +} + +int __init +aoedev_init(void) +{ + return 0; +} |