summaryrefslogtreecommitdiffstats
path: root/block/blk-mq.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--block/blk-mq.c234
1 files changed, 148 insertions, 86 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5ac2087b05..1c85a5c79e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -28,6 +28,7 @@
#include <linux/prefetch.h>
#include <linux/blk-crypto.h>
#include <linux/part_stat.h>
+#include <linux/sched/isolation.h>
#include <trace/events/block.h>
@@ -92,7 +93,7 @@ static bool blk_mq_check_inflight(struct request *rq, void *priv)
struct mq_inflight *mi = priv;
if (rq->part && blk_do_io_stat(rq) &&
- (!mi->part->bd_partno || rq->part == mi->part) &&
+ (!bdev_is_partition(mi->part) || rq->part == mi->part) &&
blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
mi->inflight[rq_data_dir(rq)]++;
@@ -447,6 +448,10 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT;
+retry:
+ data->ctx = blk_mq_get_ctx(q);
+ data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
+
if (q->elevator) {
/*
* All requests use scheduler tags when an I/O scheduler is
@@ -468,13 +473,9 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
if (ops->limit_depth)
ops->limit_depth(data->cmd_flags, data);
}
- }
-
-retry:
- data->ctx = blk_mq_get_ctx(q);
- data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
- if (!(data->rq_flags & RQF_SCHED_TAGS))
+ } else {
blk_mq_tag_busy(data->hctx);
+ }
if (data->flags & BLK_MQ_REQ_RESERVED)
data->rq_flags |= RQF_RESV;
@@ -690,6 +691,8 @@ static void blk_mq_finish_request(struct request *rq)
{
struct request_queue *q = rq->q;
+ blk_zone_finish_request(rq);
+
if (rq->rq_flags & RQF_USE_SCHED) {
q->elevator->type->ops.finish_request(rq);
/*
@@ -761,31 +764,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
}
EXPORT_SYMBOL(blk_dump_rq_flags);
-static void req_bio_endio(struct request *rq, struct bio *bio,
- unsigned int nbytes, blk_status_t error)
-{
- if (unlikely(error)) {
- bio->bi_status = error;
- } else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
- /*
- * Partial zone append completions cannot be supported as the
- * BIO fragments may end up not being written sequentially.
- */
- if (bio->bi_iter.bi_size != nbytes)
- bio->bi_status = BLK_STS_IOERR;
- else
- bio->bi_iter.bi_sector = rq->__sector;
- }
-
- bio_advance(bio, nbytes);
-
- if (unlikely(rq->rq_flags & RQF_QUIET))
- bio_set_flag(bio, BIO_QUIET);
- /* don't actually finish bio if it's part of flush sequence */
- if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
- bio_endio(bio);
-}
-
static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (req->part && blk_do_io_stat(req)) {
@@ -845,8 +823,7 @@ static void blk_complete_request(struct request *req)
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- if (req_op(req) == REQ_OP_ZONE_APPEND)
- bio->bi_iter.bi_sector = req->__sector;
+ blk_zone_update_request_bio(req, bio);
if (!is_flush)
bio_endio(bio);
@@ -889,6 +866,8 @@ static void blk_complete_request(struct request *req)
bool blk_update_request(struct request *req, blk_status_t error,
unsigned int nr_bytes)
{
+ bool is_flush = req->rq_flags & RQF_FLUSH_SEQ;
+ bool quiet = req->rq_flags & RQF_QUIET;
int total_bytes;
trace_block_rq_complete(req, error, nr_bytes);
@@ -909,9 +888,8 @@ bool blk_update_request(struct request *req, blk_status_t error,
if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
__blk_crypto_rq_put_keyslot(req);
- if (unlikely(error && !blk_rq_is_passthrough(req) &&
- !(req->rq_flags & RQF_QUIET)) &&
- !test_bit(GD_DEAD, &req->q->disk->state)) {
+ if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) &&
+ !test_bit(GD_DEAD, &req->q->disk->state)) {
blk_print_req_error(req, error);
trace_block_rq_error(req, error, nr_bytes);
}
@@ -923,12 +901,33 @@ bool blk_update_request(struct request *req, blk_status_t error,
struct bio *bio = req->bio;
unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
- if (bio_bytes == bio->bi_iter.bi_size)
+ if (unlikely(error))
+ bio->bi_status = error;
+
+ if (bio_bytes == bio->bi_iter.bi_size) {
req->bio = bio->bi_next;
+ } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) {
+ /*
+ * Partial zone append completions cannot be supported
+ * as the BIO fragments may end up not being written
+ * sequentially.
+ */
+ bio->bi_status = BLK_STS_IOERR;
+ }
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- req_bio_endio(req, bio, bio_bytes, error);
+ if (unlikely(quiet))
+ bio_set_flag(bio, BIO_QUIET);
+
+ bio_advance(bio, bio_bytes);
+
+ /* Don't actually finish bio if it's part of flush sequence */
+ if (!bio->bi_iter.bi_size) {
+ blk_zone_update_request_bio(req, bio);
+ if (!is_flush)
+ bio_endio(bio);
+ }
total_bytes += bio_bytes;
nr_bytes -= bio_bytes;
@@ -1334,11 +1333,6 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head)
blk_account_io_start(rq);
- /*
- * As plugging can be enabled for passthrough requests on a zoned
- * device, directly accessing the plug instead of using blk_mq_plug()
- * should not have any consequences.
- */
if (current->plug && !at_head) {
blk_add_rq_to_plug(current->plug, rq);
return;
@@ -1925,19 +1919,6 @@ static void blk_mq_handle_dev_resource(struct request *rq,
__blk_mq_requeue_request(rq);
}
-static void blk_mq_handle_zone_resource(struct request *rq,
- struct list_head *zone_list)
-{
- /*
- * If we end up here it is because we cannot dispatch a request to a
- * specific zone due to LLD level zone-write locking or other zone
- * related resource not being available. In this case, set the request
- * aside in zone_list for retrying it later.
- */
- list_add(&rq->queuelist, zone_list);
- __blk_mq_requeue_request(rq);
-}
-
enum prep_dispatch {
PREP_DISPATCH_OK,
PREP_DISPATCH_NO_TAG,
@@ -2023,7 +2004,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
struct request *rq;
int queued;
blk_status_t ret = BLK_STS_OK;
- LIST_HEAD(zone_list);
bool needs_resource = false;
if (list_empty(list))
@@ -2065,23 +2045,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
case BLK_STS_DEV_RESOURCE:
blk_mq_handle_dev_resource(rq, list);
goto out;
- case BLK_STS_ZONE_RESOURCE:
- /*
- * Move the request to zone_list and keep going through
- * the dispatch list to find more requests the drive can
- * accept.
- */
- blk_mq_handle_zone_resource(rq, &zone_list);
- needs_resource = true;
- break;
default:
blk_mq_end_request(rq, ret);
}
} while (!list_empty(list));
out:
- if (!list_empty(&zone_list))
- list_splice_tail_init(&zone_list, list);
-
/* If we didn't flush the entire list, we could have told the driver
* there was more coming, but that turned out to be a lie.
*/
@@ -2168,6 +2136,15 @@ static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
}
/*
+ * ->next_cpu is always calculated from hctx->cpumask, so simply use
+ * it for speeding up the check
+ */
+static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx)
+{
+ return hctx->next_cpu >= nr_cpu_ids;
+}
+
+/*
* It'd be great if the workqueue API had a way to pass
* in a mask and had some smarts for more clever placement.
* For now we just round-robin here, switching for every
@@ -2178,7 +2155,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
bool tried = false;
int next_cpu = hctx->next_cpu;
- if (hctx->queue->nr_hw_queues == 1)
+ /* Switch to unbound if no allowable CPUs in this hctx */
+ if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx))
return WORK_CPU_UNBOUND;
if (--hctx->next_cpu_batch <= 0) {
@@ -2936,6 +2914,17 @@ static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
INIT_LIST_HEAD(&rq->queuelist);
}
+static bool bio_unaligned(const struct bio *bio, struct request_queue *q)
+{
+ unsigned int bs_mask = queue_logical_block_size(q) - 1;
+
+ /* .bi_sector of any zero sized bio need to be initialized */
+ if ((bio->bi_iter.bi_size & bs_mask) ||
+ ((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask))
+ return true;
+ return false;
+}
+
/**
* blk_mq_submit_bio - Create and send a request to block device.
* @bio: Bio pointer.
@@ -2952,27 +2941,51 @@ static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
void blk_mq_submit_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- struct blk_plug *plug = blk_mq_plug(bio);
+ struct blk_plug *plug = current->plug;
const int is_sync = op_is_sync(bio->bi_opf);
struct blk_mq_hw_ctx *hctx;
unsigned int nr_segs = 1;
struct request *rq;
blk_status_t ret;
+ /*
+ * If the plug has a cached request for this queue, try to use it.
+ */
+ rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
+
+ /*
+ * A BIO that was released from a zone write plug has already been
+ * through the preparation in this function, already holds a reference
+ * on the queue usage counter, and is the only write BIO in-flight for
+ * the target zone. Go straight to preparing a request for it.
+ */
+ if (bio_zone_write_plugging(bio)) {
+ nr_segs = bio->__bi_nr_segments;
+ if (rq)
+ blk_queue_exit(q);
+ goto new_request;
+ }
+
bio = blk_queue_bounce(bio, q);
/*
- * If the plug has a cached request for this queue, try use it.
- *
* The cached request already holds a q_usage_counter reference and we
* don't have to acquire a new one if we use it.
*/
- rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
if (!rq) {
if (unlikely(bio_queue_enter(bio)))
return;
}
+ /*
+ * Device reconfiguration may change logical block size, so alignment
+ * check has to be done with queue usage counter held
+ */
+ if (unlikely(bio_unaligned(bio, q))) {
+ bio_io_error(bio);
+ goto queue_exit;
+ }
+
if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
if (!bio)
@@ -2984,6 +2997,10 @@ void blk_mq_submit_bio(struct bio *bio)
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
goto queue_exit;
+ if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
+ goto queue_exit;
+
+new_request:
if (!rq) {
rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
if (unlikely(!rq))
@@ -3006,6 +3023,9 @@ void blk_mq_submit_bio(struct bio *bio)
return;
}
+ if (bio_zone_write_plugging(bio))
+ blk_zone_write_plug_init_request(rq);
+
if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
return;
@@ -3487,14 +3507,30 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
return data.has_rq;
}
-static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
- struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx,
+ unsigned int this_cpu)
{
- if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu)
- return false;
- if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
- return false;
- return true;
+ enum hctx_type type = hctx->type;
+ int cpu;
+
+ /*
+ * hctx->cpumask has to rule out isolated CPUs, but userspace still
+ * might submit IOs on these isolated CPUs, so use the queue map to
+ * check if all CPUs mapped to this hctx are offline
+ */
+ for_each_online_cpu(cpu) {
+ struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue,
+ type, cpu);
+
+ if (h != hctx)
+ continue;
+
+ /* this hctx has at least one online CPU */
+ if (this_cpu != cpu)
+ return true;
+ }
+
+ return false;
}
static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
@@ -3502,8 +3538,7 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
struct blk_mq_hw_ctx, cpuhp_online);
- if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
- !blk_mq_last_cpu_in_hctx(cpu, hctx))
+ if (blk_mq_hctx_has_online_cpu(hctx, cpu))
return 0;
/*
@@ -3530,12 +3565,28 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
return 0;
}
+/*
+ * Check if one CPU is mapped to the specified hctx
+ *
+ * Isolated CPUs have been ruled out from hctx->cpumask, which is supposed
+ * to be used for scheduling kworker only. For other usage, please call this
+ * helper for checking if one CPU belongs to the specified hctx
+ */
+static bool blk_mq_cpu_mapped_to_hctx(unsigned int cpu,
+ const struct blk_mq_hw_ctx *hctx)
+{
+ struct blk_mq_hw_ctx *mapped_hctx = blk_mq_map_queue_type(hctx->queue,
+ hctx->type, cpu);
+
+ return mapped_hctx == hctx;
+}
+
static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
{
struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
struct blk_mq_hw_ctx, cpuhp_online);
- if (cpumask_test_cpu(cpu, hctx->cpumask))
+ if (blk_mq_cpu_mapped_to_hctx(cpu, hctx))
clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
return 0;
}
@@ -3553,7 +3604,7 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
enum hctx_type type;
hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
- if (!cpumask_test_cpu(cpu, hctx->cpumask))
+ if (!blk_mq_cpu_mapped_to_hctx(cpu, hctx))
return 0;
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
@@ -3911,6 +3962,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
}
queue_for_each_hw_ctx(q, hctx, i) {
+ int cpu;
+
/*
* If no software queues are mapped to this hardware queue,
* disable it and free the request entries.
@@ -3938,6 +3991,15 @@ static void blk_mq_map_swqueue(struct request_queue *q)
sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
/*
+ * Rule out isolated CPUs from hctx->cpumask to avoid
+ * running block kworker on isolated CPUs
+ */
+ for_each_cpu(cpu, hctx->cpumask) {
+ if (cpu_is_isolated(cpu))
+ cpumask_clear_cpu(cpu, hctx->cpumask);
+ }
+
+ /*
* Initialize batch roundrobin counts
*/
hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);