summary | refs | log | tree | commit | diff | stats
path: root/drivers/nvme/host
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r-- drivers/nvme/host/apple.c 8
-rw-r--r-- drivers/nvme/host/core.c 507
-rw-r--r-- drivers/nvme/host/fabrics.c 22
-rw-r--r-- drivers/nvme/host/fc.c 4
-rw-r--r-- drivers/nvme/host/multipath.c 20
-rw-r--r-- drivers/nvme/host/nvme.h 21
-rw-r--r-- drivers/nvme/host/pr.c 5
-rw-r--r-- drivers/nvme/host/rdma.c 14
-rw-r--r-- drivers/nvme/host/sysfs.c 3
-rw-r--r-- drivers/nvme/host/tcp.c 28
-rw-r--r-- drivers/nvme/host/trace.c 105
-rw-r--r-- drivers/nvme/host/zns.c 41
12 files changed, 472 insertions, 306 deletions
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index c727cd1f26..dd6ec08651 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -1516,7 +1516,7 @@ static int apple_nvme_probe(struct platform_device *pdev)
goto put_dev;
}
- anv->ctrl.admin_q = blk_mq_init_queue(&anv->admin_tagset);
+ anv->ctrl.admin_q = blk_mq_alloc_queue(&anv->admin_tagset, NULL, NULL);
if (IS_ERR(anv->ctrl.admin_q)) {
ret = -ENOMEM;
goto put_dev;
@@ -1532,7 +1532,7 @@ put_dev:
return ret;
}
-static int apple_nvme_remove(struct platform_device *pdev)
+static void apple_nvme_remove(struct platform_device *pdev)
{
struct apple_nvme *anv = platform_get_drvdata(pdev);
@@ -1547,8 +1547,6 @@ static int apple_nvme_remove(struct platform_device *pdev)
apple_rtkit_shutdown(anv->rtk);
apple_nvme_detach_genpd(anv);
-
- return 0;
}
static void apple_nvme_shutdown(struct platform_device *pdev)
@@ -1598,7 +1596,7 @@ static struct platform_driver apple_nvme_driver = {
.pm = pm_sleep_ptr(&apple_nvme_pm_ops),
},
.probe = apple_nvme_probe,
- .remove = apple_nvme_remove,
+ .remove_new = apple_nvme_remove,
.shutdown = apple_nvme_shutdown,
};
module_platform_driver(apple_nvme_driver);
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 3cc79817e4..d513fd2758 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -114,12 +114,21 @@ static DEFINE_MUTEX(nvme_subsystems_lock);
static DEFINE_IDA(nvme_instance_ida);
static dev_t nvme_ctrl_base_chr_devt;
-static struct class *nvme_class;
-static struct class *nvme_subsys_class;
+static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env);
+static const struct class nvme_class = {
+ .name = "nvme",
+ .dev_uevent = nvme_class_uevent,
+};
+
+static const struct class nvme_subsys_class = {
+ .name = "nvme-subsystem",
+};
static DEFINE_IDA(nvme_ns_chr_minor_ida);
static dev_t nvme_ns_chr_devt;
-static struct class *nvme_ns_chr_class;
+static const struct class nvme_ns_chr_class = {
+ .name = "nvme-generic",
+};
static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
@@ -405,7 +414,15 @@ static inline void nvme_end_req_zoned(struct request *req)
}
}
-static inline void nvme_end_req(struct request *req)
+static inline void __nvme_end_req(struct request *req)
+{
+ nvme_end_req_zoned(req);
+ nvme_trace_bio_complete(req);
+ if (req->cmd_flags & REQ_NVME_MPATH)
+ nvme_mpath_end_request(req);
+}
+
+void nvme_end_req(struct request *req)
{
blk_status_t status = nvme_error_status(nvme_req(req)->status);
@@ -415,10 +432,7 @@ static inline void nvme_end_req(struct request *req)
else
nvme_log_error(req);
}
- nvme_end_req_zoned(req);
- nvme_trace_bio_complete(req);
- if (req->cmd_flags & REQ_NVME_MPATH)
- nvme_mpath_end_request(req);
+ __nvme_end_req(req);
blk_mq_end_request(req, status);
}
@@ -467,7 +481,7 @@ void nvme_complete_batch_req(struct request *req)
{
trace_nvme_complete_rq(req);
nvme_cleanup_cmd(req);
- nvme_end_req_zoned(req);
+ __nvme_end_req(req);
}
EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
@@ -1377,8 +1391,10 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
sizeof(struct nvme_id_ctrl));
- if (error)
+ if (error) {
kfree(*id);
+ *id = NULL;
+ }
return error;
}
@@ -1507,6 +1523,7 @@ int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
if (error) {
dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
kfree(*id);
+ *id = NULL;
}
return error;
}
@@ -1706,12 +1723,23 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return 0;
}
-#ifdef CONFIG_BLK_DEV_INTEGRITY
-static void nvme_init_integrity(struct gendisk *disk,
- struct nvme_ns_head *head, u32 max_integrity_segments)
+static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head)
{
struct blk_integrity integrity = { };
+ blk_integrity_unregister(disk);
+
+ if (!head->ms)
+ return true;
+
+ /*
+ * PI can always be supported as we can ask the controller to simply
+ * insert/strip it, which is not possible for other kinds of metadata.
+ */
+ if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ||
+ !(head->features & NVME_NS_METADATA_SUPPORTED))
+ return nvme_ns_has_pi(head);
+
switch (head->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
switch (head->guard_type) {
@@ -1754,53 +1782,29 @@ static void nvme_init_integrity(struct gendisk *disk,
}
integrity.tuple_size = head->ms;
+ integrity.pi_offset = head->pi_offset;
blk_integrity_register(disk, &integrity);
- blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
-}
-#else
-static void nvme_init_integrity(struct gendisk *disk,
- struct nvme_ns_head *head, u32 max_integrity_segments)
-{
+ return true;
}
-#endif /* CONFIG_BLK_DEV_INTEGRITY */
-static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
- struct nvme_ns_head *head)
+static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim)
{
- struct request_queue *queue = disk->queue;
- u32 max_discard_sectors;
-
- if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) {
- max_discard_sectors = nvme_lba_to_sect(head, ctrl->dmrsl);
- } else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
- max_discard_sectors = UINT_MAX;
- } else {
- blk_queue_max_discard_sectors(queue, 0);
- return;
- }
+ struct nvme_ctrl *ctrl = ns->ctrl;
- BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
- NVME_DSM_MAX_RANGES);
+ if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX))
+ lim->max_hw_discard_sectors =
+ nvme_lba_to_sect(ns->head, ctrl->dmrsl);
+ else if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
+ lim->max_hw_discard_sectors = UINT_MAX;
+ else
+ lim->max_hw_discard_sectors = 0;
- /*
- * If discard is already enabled, don't reset queue limits.
- *
- * This works around the fact that the block layer can't cope well with
- * updating the hardware limits when overridden through sysfs. This is
- * harmless because discard limits in NVMe are purely advisory.
- */
- if (queue->limits.max_discard_sectors)
- return;
+ lim->discard_granularity = lim->logical_block_size;
- blk_queue_max_discard_sectors(queue, max_discard_sectors);
if (ctrl->dmrl)
- blk_queue_max_discard_segments(queue, ctrl->dmrl);
+ lim->max_discard_segments = ctrl->dmrl;
else
- blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
- queue->limits.discard_granularity = queue_logical_block_size(queue);
-
- if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
- blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
+ lim->max_discard_segments = NVME_DSM_MAX_RANGES;
}
static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
@@ -1811,42 +1815,38 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
a->csi == b->csi;
}
-static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head,
- struct nvme_id_ns *id)
+static int nvme_identify_ns_nvm(struct nvme_ctrl *ctrl, unsigned int nsid,
+ struct nvme_id_ns_nvm **nvmp)
{
- bool first = id->dps & NVME_NS_DPS_PI_FIRST;
- unsigned lbaf = nvme_lbaf_index(id->flbas);
- struct nvme_command c = { };
+ struct nvme_command c = {
+ .identify.opcode = nvme_admin_identify,
+ .identify.nsid = cpu_to_le32(nsid),
+ .identify.cns = NVME_ID_CNS_CS_NS,
+ .identify.csi = NVME_CSI_NVM,
+ };
struct nvme_id_ns_nvm *nvm;
- int ret = 0;
- u32 elbaf;
-
- head->pi_size = 0;
- head->ms = le16_to_cpu(id->lbaf[lbaf].ms);
- if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
- head->pi_size = sizeof(struct t10_pi_tuple);
- head->guard_type = NVME_NVM_NS_16B_GUARD;
- goto set_pi;
- }
+ int ret;
nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
if (!nvm)
return -ENOMEM;
- c.identify.opcode = nvme_admin_identify;
- c.identify.nsid = cpu_to_le32(head->ns_id);
- c.identify.cns = NVME_ID_CNS_CS_NS;
- c.identify.csi = NVME_CSI_NVM;
-
ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
if (ret)
- goto free_data;
+ kfree(nvm);
+ else
+ *nvmp = nvm;
+ return ret;
+}
- elbaf = le32_to_cpu(nvm->elbaf[lbaf]);
+static void nvme_configure_pi_elbas(struct nvme_ns_head *head,
+ struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm)
+{
+ u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]);
/* no support for storage tag formats right now */
if (nvme_elbaf_sts(elbaf))
- goto free_data;
+ return;
head->guard_type = nvme_elbaf_guard_type(elbaf);
switch (head->guard_type) {
@@ -1859,30 +1859,31 @@ static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head,
default:
break;
}
-
-free_data:
- kfree(nvm);
-set_pi:
- if (head->pi_size && (first || head->ms == head->pi_size))
- head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
- else
- head->pi_type = 0;
-
- return ret;
}
-static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
- struct nvme_ns_head *head, struct nvme_id_ns *id)
+static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
+ struct nvme_ns_head *head, struct nvme_id_ns *id,
+ struct nvme_id_ns_nvm *nvm)
{
- int ret;
-
- ret = nvme_init_ms(ctrl, head, id);
- if (ret)
- return ret;
-
head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+ head->pi_type = 0;
+ head->pi_size = 0;
+ head->pi_offset = 0;
+ head->ms = le16_to_cpu(id->lbaf[nvme_lbaf_index(id->flbas)].ms);
if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
- return 0;
+ return;
+
+ if (nvm && (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
+ nvme_configure_pi_elbas(head, id, nvm);
+ } else {
+ head->pi_size = sizeof(struct t10_pi_tuple);
+ head->guard_type = NVME_NVM_NS_16B_GUARD;
+ }
+
+ if (head->pi_size && head->ms >= head->pi_size)
+ head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+ if (!(id->dps & NVME_NS_DPS_PI_FIRST))
+ head->pi_offset = head->ms - head->pi_size;
if (ctrl->ops->flags & NVME_F_FABRICS) {
/*
@@ -1891,7 +1892,7 @@ static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
* remap the separate metadata buffer from the block layer.
*/
if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
- return 0;
+ return;
head->features |= NVME_NS_EXT_LBAS;
@@ -1918,33 +1919,32 @@ static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
else
head->features |= NVME_NS_METADATA_SUPPORTED;
}
- return 0;
}
-static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
- struct request_queue *q)
+static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
{
- bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
-
- if (ctrl->max_hw_sectors) {
- u32 max_segments =
- (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
+ return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1;
+}
- max_segments = min_not_zero(max_segments, ctrl->max_segments);
- blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
- blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
- }
- blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
- blk_queue_dma_alignment(q, 3);
- blk_queue_write_cache(q, vwc, vwc);
+static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl,
+ struct queue_limits *lim)
+{
+ lim->max_hw_sectors = ctrl->max_hw_sectors;
+ lim->max_segments = min_t(u32, USHRT_MAX,
+ min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments));
+ lim->max_integrity_segments = ctrl->max_integrity_segments;
+ lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1;
+ lim->max_segment_size = UINT_MAX;
+ lim->dma_alignment = 3;
}
-static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
- struct nvme_ns_head *head, struct nvme_id_ns *id)
+static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
+ struct queue_limits *lim)
{
- sector_t capacity = nvme_lba_to_sect(head, le64_to_cpu(id->nsze));
+ struct nvme_ns_head *head = ns->head;
u32 bs = 1U << head->lba_shift;
u32 atomic_bs, phys_bs, io_opt = 0;
+ bool valid = true;
/*
* The block layer can't support LBA sizes larger than the page size
@@ -1952,12 +1952,10 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
* allow block I/O.
*/
if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
- capacity = 0;
bs = (1 << 9);
+ valid = false;
}
- blk_integrity_unregister(disk);
-
atomic_bs = phys_bs = bs;
if (id->nabo == 0) {
/*
@@ -1968,7 +1966,7 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
else
- atomic_bs = (1 + ctrl->subsys->awupf) * bs;
+ atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
}
if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
@@ -1978,36 +1976,20 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
io_opt = bs * (1 + le16_to_cpu(id->nows));
}
- blk_queue_logical_block_size(disk->queue, bs);
/*
* Linux filesystems assume writing a single physical block is
* an atomic operation. Hence limit the physical block size to the
* value of the Atomic Write Unit Power Fail parameter.
*/
- blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
- blk_queue_io_min(disk->queue, phys_bs);
- blk_queue_io_opt(disk->queue, io_opt);
-
- /*
- * Register a metadata profile for PI, or the plain non-integrity NVMe
- * metadata masquerading as Type 0 if supported, otherwise reject block
- * I/O to namespaces with metadata except when the namespace supports
- * PI, as it can strip/insert in that case.
- */
- if (head->ms) {
- if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
- (head->features & NVME_NS_METADATA_SUPPORTED))
- nvme_init_integrity(disk, head,
- ctrl->max_integrity_segments);
- else if (!nvme_ns_has_pi(head))
- capacity = 0;
- }
-
- set_capacity_and_notify(disk, capacity);
-
- nvme_config_discard(ctrl, disk, head);
- blk_queue_max_write_zeroes_sectors(disk->queue,
- ctrl->max_zeroes_sectors);
+ lim->logical_block_size = bs;
+ lim->physical_block_size = min(phys_bs, atomic_bs);
+ lim->io_min = phys_bs;
+ lim->io_opt = io_opt;
+ if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
+ lim->max_write_zeroes_sectors = UINT_MAX;
+ else
+ lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors;
+ return valid;
}
static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
@@ -2021,7 +2003,8 @@ static inline bool nvme_first_scan(struct gendisk *disk)
return !disk_live(disk);
}
-static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
+static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id,
+ struct queue_limits *lim)
{
struct nvme_ctrl *ctrl = ns->ctrl;
u32 iob;
@@ -2049,38 +2032,37 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
return;
}
- blk_queue_chunk_sectors(ns->queue, iob);
+ lim->chunk_sectors = iob;
}
static int nvme_update_ns_info_generic(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
+ struct queue_limits lim;
+ int ret;
+
blk_mq_freeze_queue(ns->disk->queue);
- nvme_set_queue_limits(ns->ctrl, ns->queue);
+ lim = queue_limits_start_update(ns->disk->queue);
+ nvme_set_ctrl_limits(ns->ctrl, &lim);
+ ret = queue_limits_commit_update(ns->disk->queue, &lim);
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
blk_mq_unfreeze_queue(ns->disk->queue);
- if (nvme_ns_head_multipath(ns->head)) {
- blk_mq_freeze_queue(ns->head->disk->queue);
- set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
- nvme_mpath_revalidate_paths(ns);
- blk_stack_limits(&ns->head->disk->queue->limits,
- &ns->queue->limits, 0);
- ns->head->disk->flags |= GENHD_FL_HIDDEN;
- blk_mq_unfreeze_queue(ns->head->disk->queue);
- }
-
/* Hide the block-interface for these devices */
- ns->disk->flags |= GENHD_FL_HIDDEN;
- set_bit(NVME_NS_READY, &ns->flags);
-
- return 0;
+ if (!ret)
+ ret = -ENODEV;
+ return ret;
}
static int nvme_update_ns_info_block(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
+ bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT;
+ struct queue_limits lim;
+ struct nvme_id_ns_nvm *nvm = NULL;
+ struct nvme_zone_info zi = {};
struct nvme_id_ns *id;
+ sector_t capacity;
unsigned lbaf;
int ret;
@@ -2091,31 +2073,55 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if (id->ncap == 0) {
/* namespace not allocated or attached */
info->is_removed = true;
- ret = -ENODEV;
- goto error;
+ ret = -ENXIO;
+ goto out;
+ }
+ lbaf = nvme_lbaf_index(id->flbas);
+
+ if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
+ ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm);
+ if (ret < 0)
+ goto out;
+ }
+
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ ns->head->ids.csi == NVME_CSI_ZNS) {
+ ret = nvme_query_zone_info(ns, lbaf, &zi);
+ if (ret < 0)
+ goto out;
}
blk_mq_freeze_queue(ns->disk->queue);
- lbaf = nvme_lbaf_index(id->flbas);
ns->head->lba_shift = id->lbaf[lbaf].ds;
ns->head->nuse = le64_to_cpu(id->nuse);
- nvme_set_queue_limits(ns->ctrl, ns->queue);
+ capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
- ret = nvme_configure_metadata(ns->ctrl, ns->head, id);
- if (ret < 0) {
+ lim = queue_limits_start_update(ns->disk->queue);
+ nvme_set_ctrl_limits(ns->ctrl, &lim);
+ nvme_configure_metadata(ns->ctrl, ns->head, id, nvm);
+ nvme_set_chunk_sectors(ns, id, &lim);
+ if (!nvme_update_disk_info(ns, id, &lim))
+ capacity = 0;
+ nvme_config_discard(ns, &lim);
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ ns->head->ids.csi == NVME_CSI_ZNS)
+ nvme_update_zone_info(ns, &lim, &zi);
+ ret = queue_limits_commit_update(ns->disk->queue, &lim);
+ if (ret) {
blk_mq_unfreeze_queue(ns->disk->queue);
goto out;
}
- nvme_set_chunk_sectors(ns, id);
- nvme_update_disk_info(ns->ctrl, ns->disk, ns->head, id);
- if (ns->head->ids.csi == NVME_CSI_ZNS) {
- ret = nvme_update_zone_info(ns, lbaf);
- if (ret) {
- blk_mq_unfreeze_queue(ns->disk->queue);
- goto out;
- }
- }
+ /*
+ * Register a metadata profile for PI, or the plain non-integrity NVMe
+ * metadata masquerading as Type 0 if supported, otherwise reject block
+ * I/O to namespaces with metadata except when the namespace supports
+ * PI, as it can strip/insert in that case.
+ */
+ if (!nvme_init_integrity(ns->disk, ns->head))
+ capacity = 0;
+
+ set_capacity_and_notify(ns->disk, capacity);
/*
* Only set the DEAC bit if the device guarantees that reads from
@@ -2126,62 +2132,101 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
ns->head->features |= NVME_NS_DEAC;
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
+ blk_queue_write_cache(ns->disk->queue, vwc, vwc);
set_bit(NVME_NS_READY, &ns->flags);
blk_mq_unfreeze_queue(ns->disk->queue);
if (blk_queue_is_zoned(ns->queue)) {
- ret = nvme_revalidate_zones(ns);
+ ret = blk_revalidate_disk_zones(ns->disk, NULL);
if (ret && !nvme_first_scan(ns->disk))
goto out;
}
- if (nvme_ns_head_multipath(ns->head)) {
- blk_mq_freeze_queue(ns->head->disk->queue);
- nvme_update_disk_info(ns->ctrl, ns->head->disk, ns->head, id);
- set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
- nvme_mpath_revalidate_paths(ns);
- blk_stack_limits(&ns->head->disk->queue->limits,
- &ns->queue->limits, 0);
- disk_update_readahead(ns->head->disk);
- blk_mq_unfreeze_queue(ns->head->disk->queue);
- }
-
ret = 0;
out:
- /*
- * If probing fails due an unsupported feature, hide the block device,
- * but still allow other access.
- */
- if (ret == -ENODEV) {
- ns->disk->flags |= GENHD_FL_HIDDEN;
- set_bit(NVME_NS_READY, &ns->flags);
- ret = 0;
- }
-
-error:
+ kfree(nvm);
kfree(id);
return ret;
}
static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
{
+ bool unsupported = false;
+ int ret;
+
switch (info->ids.csi) {
case NVME_CSI_ZNS:
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
dev_info(ns->ctrl->device,
"block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
info->nsid);
- return nvme_update_ns_info_generic(ns, info);
+ ret = nvme_update_ns_info_generic(ns, info);
+ break;
}
- return nvme_update_ns_info_block(ns, info);
+ ret = nvme_update_ns_info_block(ns, info);
+ break;
case NVME_CSI_NVM:
- return nvme_update_ns_info_block(ns, info);
+ ret = nvme_update_ns_info_block(ns, info);
+ break;
default:
dev_info(ns->ctrl->device,
"block device for nsid %u not supported (csi %u)\n",
info->nsid, info->ids.csi);
- return nvme_update_ns_info_generic(ns, info);
+ ret = nvme_update_ns_info_generic(ns, info);
+ break;
}
+
+ /*
+ * If probing fails due an unsupported feature, hide the block device,
+ * but still allow other access.
+ */
+ if (ret == -ENODEV) {
+ ns->disk->flags |= GENHD_FL_HIDDEN;
+ set_bit(NVME_NS_READY, &ns->flags);
+ unsupported = true;
+ ret = 0;
+ }
+
+ if (!ret && nvme_ns_head_multipath(ns->head)) {
+ struct queue_limits *ns_lim = &ns->disk->queue->limits;
+ struct queue_limits lim;
+
+ blk_mq_freeze_queue(ns->head->disk->queue);
+ if (unsupported)
+ ns->head->disk->flags |= GENHD_FL_HIDDEN;
+ else
+ nvme_init_integrity(ns->head->disk, ns->head);
+ set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
+ set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
+ nvme_mpath_revalidate_paths(ns);
+
+ /*
+ * queue_limits mixes values that are the hardware limitations
+ * for bio splitting with what is the device configuration.
+ *
+ * For NVMe the device configuration can change after e.g. a
+ * Format command, and we really want to pick up the new format
+ * value here. But we must still stack the queue limits to the
+ * least common denominator for multipathing to split the bios
+ * properly.
+ *
+ * To work around this, we explicitly set the device
+ * configuration to those that we just queried, but only stack
+ * the splitting limits in to make sure we still obey possibly
+ * lower limitations of other controllers.
+ */
+ lim = queue_limits_start_update(ns->head->disk->queue);
+ lim.logical_block_size = ns_lim->logical_block_size;
+ lim.physical_block_size = ns_lim->physical_block_size;
+ lim.io_min = ns_lim->io_min;
+ lim.io_opt = ns_lim->io_opt;
+ queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
+ ns->head->disk->disk_name);
+ ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
+ blk_mq_unfreeze_queue(ns->head->disk->queue);
+ }
+
+ return ret;
}
#ifdef CONFIG_BLK_SED_OPAL
@@ -2856,7 +2901,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
subsys->awupf = le16_to_cpu(id->awupf);
nvme_mpath_default_iopolicy(subsys);
- subsys->dev.class = nvme_subsys_class;
+ subsys->dev.class = &nvme_subsys_class;
subsys->dev.release = nvme_release_subsystem;
subsys->dev.groups = nvme_subsys_attrs_groups;
dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
@@ -3096,11 +3141,17 @@ static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ct
return -EINVAL;
}
+ if (!ctrl->maxcmd) {
+ dev_err(ctrl->device, "Maximum outstanding commands is 0\n");
+ return -EINVAL;
+ }
+
return 0;
}
static int nvme_init_identify(struct nvme_ctrl *ctrl)
{
+ struct queue_limits lim;
struct nvme_id_ctrl *id;
u32 max_hw_sectors;
bool prev_apst_enabled;
@@ -3167,7 +3218,12 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->max_hw_sectors =
min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
- nvme_set_queue_limits(ctrl, ctrl->admin_q);
+ lim = queue_limits_start_update(ctrl->admin_q);
+ nvme_set_ctrl_limits(ctrl, &lim);
+ ret = queue_limits_commit_update(ctrl->admin_q, &lim);
+ if (ret)
+ goto out_free;
+
ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas);
ctrl->max_namespaces = le32_to_cpu(id->mnan);
@@ -3185,7 +3241,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
if (ctrl->shutdown_timeout != shutdown_timeout)
dev_info(ctrl->device,
- "Shutdown timeout set to %u seconds\n",
+ "D3 entry latency set to %u seconds\n",
ctrl->shutdown_timeout);
} else
ctrl->shutdown_timeout = shutdown_timeout;
@@ -3399,7 +3455,7 @@ int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
if (minor < 0)
return minor;
cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
- cdev_device->class = nvme_ns_chr_class;
+ cdev_device->class = &nvme_ns_chr_class;
cdev_device->release = nvme_cdev_rel;
device_initialize(cdev_device);
cdev_init(cdev, fops);
@@ -3671,7 +3727,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
if (!ns)
return;
- disk = blk_mq_alloc_disk(ctrl->tagset, ns);
+ disk = blk_mq_alloc_disk(ctrl->tagset, NULL, ns);
if (IS_ERR(disk))
goto out_free_ns;
disk->fops = &nvme_bdev_ops;
@@ -4332,6 +4388,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int cmd_size)
{
+ struct queue_limits lim = {};
int ret;
memset(set, 0, sizeof(*set));
@@ -4352,14 +4409,14 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
if (ret)
return ret;
- ctrl->admin_q = blk_mq_init_queue(set);
+ ctrl->admin_q = blk_mq_alloc_queue(set, &lim, NULL);
if (IS_ERR(ctrl->admin_q)) {
ret = PTR_ERR(ctrl->admin_q);
goto out_free_tagset;
}
if (ctrl->ops->flags & NVME_F_FABRICS) {
- ctrl->fabrics_q = blk_mq_init_queue(set);
+ ctrl->fabrics_q = blk_mq_alloc_queue(set, NULL, NULL);
if (IS_ERR(ctrl->fabrics_q)) {
ret = PTR_ERR(ctrl->fabrics_q);
goto out_cleanup_admin_q;
@@ -4424,7 +4481,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
return ret;
if (ctrl->ops->flags & NVME_F_FABRICS) {
- ctrl->connect_q = blk_mq_init_queue(set);
+ ctrl->connect_q = blk_mq_alloc_queue(set, NULL, NULL);
if (IS_ERR(ctrl->connect_q)) {
ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set;
@@ -4594,7 +4651,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
ctrl->device = &ctrl->ctrl_device;
ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
ctrl->instance);
- ctrl->device->class = nvme_class;
+ ctrl->device->class = &nvme_class;
ctrl->device->parent = ctrl->dev;
if (ops->dev_attr_groups)
ctrl->device->groups = ops->dev_attr_groups;
@@ -4827,42 +4884,36 @@ static int __init nvme_core_init(void)
if (result < 0)
goto destroy_delete_wq;
- nvme_class = class_create("nvme");
- if (IS_ERR(nvme_class)) {
- result = PTR_ERR(nvme_class);
+ result = class_register(&nvme_class);
+ if (result)
goto unregister_chrdev;
- }
- nvme_class->dev_uevent = nvme_class_uevent;
- nvme_subsys_class = class_create("nvme-subsystem");
- if (IS_ERR(nvme_subsys_class)) {
- result = PTR_ERR(nvme_subsys_class);
+ result = class_register(&nvme_subsys_class);
+ if (result)
goto destroy_class;
- }
result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS,
"nvme-generic");
if (result < 0)
goto destroy_subsys_class;
- nvme_ns_chr_class = class_create("nvme-generic");
- if (IS_ERR(nvme_ns_chr_class)) {
- result = PTR_ERR(nvme_ns_chr_class);
+ result = class_register(&nvme_ns_chr_class);
+ if (result)
goto unregister_generic_ns;
- }
+
result = nvme_init_auth();
if (result)
goto destroy_ns_chr;
return 0;
destroy_ns_chr:
- class_destroy(nvme_ns_chr_class);
+ class_unregister(&nvme_ns_chr_class);
unregister_generic_ns:
unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
destroy_subsys_class:
- class_destroy(nvme_subsys_class);
+ class_unregister(&nvme_subsys_class);
destroy_class:
- class_destroy(nvme_class);
+ class_unregister(&nvme_class);
unregister_chrdev:
unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
destroy_delete_wq:
@@ -4878,9 +4929,9 @@ out:
static void __exit nvme_core_exit(void)
{
nvme_exit_auth();
- class_destroy(nvme_ns_chr_class);
- class_destroy(nvme_subsys_class);
- class_destroy(nvme_class);
+ class_unregister(&nvme_ns_chr_class);
+ class_unregister(&nvme_subsys_class);
+ class_unregister(&nvme_class);
unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
destroy_workqueue(nvme_delete_wq);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 495c171dae..1f0ea1f32d 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -638,7 +638,7 @@ static struct key *nvmf_parse_key(int key_id)
}
key = key_lookup(key_id);
- if (!IS_ERR(key))
+ if (IS_ERR(key))
pr_err("key id %08x not found\n", key_id);
else
pr_debug("Using key id %08x\n", key_id);
@@ -1319,7 +1319,10 @@ out_free_opts:
return ERR_PTR(ret);
}
-static struct class *nvmf_class;
+static const struct class nvmf_class = {
+ .name = "nvme-fabrics",
+};
+
static struct device *nvmf_device;
static DEFINE_MUTEX(nvmf_dev_mutex);
@@ -1439,15 +1442,14 @@ static int __init nvmf_init(void)
if (!nvmf_default_host)
return -ENOMEM;
- nvmf_class = class_create("nvme-fabrics");
- if (IS_ERR(nvmf_class)) {
+ ret = class_register(&nvmf_class);
+ if (ret) {
pr_err("couldn't register class nvme-fabrics\n");
- ret = PTR_ERR(nvmf_class);
goto out_free_host;
}
nvmf_device =
- device_create(nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl");
+ device_create(&nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl");
if (IS_ERR(nvmf_device)) {
pr_err("couldn't create nvme-fabrics device!\n");
ret = PTR_ERR(nvmf_device);
@@ -1463,9 +1465,9 @@ static int __init nvmf_init(void)
return 0;
out_destroy_device:
- device_destroy(nvmf_class, MKDEV(0, 0));
+ device_destroy(&nvmf_class, MKDEV(0, 0));
out_destroy_class:
- class_destroy(nvmf_class);
+ class_unregister(&nvmf_class);
out_free_host:
nvmf_host_put(nvmf_default_host);
return ret;
@@ -1474,8 +1476,8 @@ out_free_host:
static void __exit nvmf_exit(void)
{
misc_deregister(&nvmf_misc);
- device_destroy(nvmf_class, MKDEV(0, 0));
- class_destroy(nvmf_class);
+ device_destroy(&nvmf_class, MKDEV(0, 0));
+ class_unregister(&nvmf_class);
nvmf_host_put(nvmf_default_host);
BUILD_BUG_ON(sizeof(struct nvmf_common_command) != 64);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 68a5d97165..a5b29e9ad3 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2428,7 +2428,7 @@ nvme_fc_ctrl_get(struct nvme_fc_ctrl *ctrl)
* controller. Called after last nvme_put_ctrl() call
*/
static void
-nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl)
+nvme_fc_free_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
@@ -3384,7 +3384,7 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
.reg_read32 = nvmf_reg_read32,
.reg_read64 = nvmf_reg_read64,
.reg_write32 = nvmf_reg_write32,
- .free_ctrl = nvme_fc_nvme_ctrl_freed,
+ .free_ctrl = nvme_fc_free_ctrl,
.submit_async_event = nvme_fc_submit_async_event,
.delete_ctrl = nvme_fc_delete_ctrl,
.get_address = nvmf_get_address,
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 75386d3e0f..a4e46eb20b 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -118,7 +118,8 @@ void nvme_failover_req(struct request *req)
blk_steal_bios(&ns->head->requeue_list, req);
spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
- blk_mq_end_request(req, 0);
+ nvme_req(req)->status = 0;
+ nvme_end_req(req);
kblockd_schedule_work(&ns->head->requeue_work);
}
@@ -517,6 +518,7 @@ static void nvme_requeue_work(struct work_struct *work)
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
+ struct queue_limits lim;
bool vwc = false;
mutex_init(&head->lock);
@@ -533,9 +535,14 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
!nvme_is_unique_nsid(ctrl, head) || !multipath)
return 0;
- head->disk = blk_alloc_disk(ctrl->numa_node);
- if (!head->disk)
- return -ENOMEM;
+ blk_set_stacking_limits(&lim);
+ lim.dma_alignment = 3;
+ if (head->ids.csi != NVME_CSI_ZNS)
+ lim.max_zone_append_sectors = 0;
+
+ head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
+ if (IS_ERR(head->disk))
+ return PTR_ERR(head->disk);
head->disk->fops = &nvme_ns_head_ops;
head->disk->private_data = head;
sprintf(head->disk->disk_name, "nvme%dn%d",
@@ -554,11 +561,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);
- /* set to a default value of 512 until the disk is validated */
- blk_queue_logical_block_size(head->disk->queue, 512);
- blk_set_stacking_limits(&head->disk->queue->limits);
- blk_queue_dma_alignment(head->disk->queue, 3);
-
/* we need to propagate up the VMC settings */
if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
vwc = true;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 2a7bf57428..d7bcc6d51e 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -469,6 +469,7 @@ struct nvme_ns_head {
u16 ms;
u16 pi_size;
u8 pi_type;
+ u8 pi_offset;
u8 guard_type;
u16 sgs;
u32 sws;
@@ -766,6 +767,7 @@ static inline bool nvme_state_terminal(struct nvme_ctrl *ctrl)
}
}
+void nvme_end_req(struct request *req);
void nvme_complete_rq(struct request *req);
void nvme_complete_batch_req(struct request *req);
@@ -1061,11 +1063,19 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
}
#endif /* CONFIG_NVME_MULTIPATH */
-int nvme_revalidate_zones(struct nvme_ns *ns);
+struct nvme_zone_info {
+ u64 zone_size;
+ unsigned int max_open_zones;
+ unsigned int max_active_zones;
+};
+
int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
+int nvme_query_zone_info(struct nvme_ns *ns, unsigned lbaf,
+ struct nvme_zone_info *zi);
+void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim,
+ struct nvme_zone_info *zi);
#ifdef CONFIG_BLK_DEV_ZONED
-int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf);
blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmnd,
enum nvme_zone_mgmt_action action);
@@ -1076,13 +1086,6 @@ static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
{
return BLK_STS_NOTSUPP;
}
-
-static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
-{
- dev_warn(ns->ctrl->device,
- "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
- return -EPROTONOSUPPORT;
-}
#endif
static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
diff --git a/drivers/nvme/host/pr.c b/drivers/nvme/host/pr.c
index fc3eed00f9..8fa1ffcdae 100644
--- a/drivers/nvme/host/pr.c
+++ b/drivers/nvme/host/pr.c
@@ -77,7 +77,7 @@ static int nvme_sc_to_pr_err(int nvme_sc)
if (nvme_is_path_error(nvme_sc))
return PR_STS_PATH_FAILED;
- switch (nvme_sc) {
+ switch (nvme_sc & 0x7ff) {
case NVME_SC_SUCCESS:
return PR_STS_SUCCESS;
case NVME_SC_RESERVATION_CONFLICT:
@@ -97,8 +97,7 @@ static int nvme_sc_to_pr_err(int nvme_sc)
static int nvme_send_pr_command(struct block_device *bdev,
struct nvme_command *c, void *data, unsigned int data_len)
{
- if (IS_ENABLED(CONFIG_NVME_MULTIPATH) &&
- nvme_disk_is_ns_head(bdev->bd_disk))
+ if (nvme_disk_is_ns_head(bdev->bd_disk))
return nvme_send_ns_head_pr_command(bdev, c, data, data_len);
return nvme_send_ns_pr_command(bdev->bd_disk->private_data, c, data,
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 20fdd40b18..366f0bb4eb 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1006,6 +1006,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
{
int ret;
bool changed;
+ u16 max_queue_size;
ret = nvme_rdma_configure_admin_queue(ctrl, new);
if (ret)
@@ -1030,11 +1031,16 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
}
- if (ctrl->ctrl.sqsize + 1 > NVME_RDMA_MAX_QUEUE_SIZE) {
+ if (ctrl->ctrl.max_integrity_segments)
+ max_queue_size = NVME_RDMA_MAX_METADATA_QUEUE_SIZE;
+ else
+ max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE;
+
+ if (ctrl->ctrl.sqsize + 1 > max_queue_size) {
dev_warn(ctrl->ctrl.device,
- "ctrl sqsize %u > max queue size %u, clamping down\n",
- ctrl->ctrl.sqsize + 1, NVME_RDMA_MAX_QUEUE_SIZE);
- ctrl->ctrl.sqsize = NVME_RDMA_MAX_QUEUE_SIZE - 1;
+ "ctrl sqsize %u > max queue size %u, clamping down\n",
+ ctrl->ctrl.sqsize + 1, max_queue_size);
+ ctrl->ctrl.sqsize = max_queue_size - 1;
}
if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index 09fcaa519e..3c55f7edd1 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -236,8 +236,7 @@ static ssize_t nuse_show(struct device *dev, struct device_attribute *attr,
struct block_device *bdev = disk->part0;
int ret;
- if (IS_ENABLED(CONFIG_NVME_MULTIPATH) &&
- bdev->bd_disk->fops == &nvme_ns_head_ops)
+ if (nvme_disk_is_ns_head(bdev->bd_disk))
ret = ns_head_update_nuse(head);
else
ret = ns_update_nuse(bdev->bd_disk->private_data);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 6eeb96578d..28bc2f373c 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -37,6 +37,14 @@ module_param(so_priority, int, 0644);
MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
/*
+ * Use the unbound workqueue for nvme_tcp_wq, then we can set the cpu affinity
+ * from sysfs.
+ */
+static bool wq_unbound;
+module_param(wq_unbound, bool, 0644);
+MODULE_PARM_DESC(wq_unbound, "Use unbound workqueue for nvme-tcp IO context (default false)");
+
+/*
* TLS handshake timeout
*/
static int tls_handshake_timeout = 10;
@@ -1350,7 +1358,6 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
{
- struct page *page;
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
unsigned int noreclaim_flag;
@@ -1361,11 +1368,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
if (queue->hdr_digest || queue->data_digest)
nvme_tcp_free_crypto(queue);
- if (queue->pf_cache.va) {
- page = virt_to_head_page(queue->pf_cache.va);
- __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
- queue->pf_cache.va = NULL;
- }
+ page_frag_cache_drain(&queue->pf_cache);
noreclaim_flag = memalloc_noreclaim_save();
/* ->sock will be released by fput() */
@@ -1557,7 +1560,10 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
else if (nvme_tcp_poll_queue(queue))
n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
ctrl->io_queues[HCTX_TYPE_READ] - 1;
- queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+ if (wq_unbound)
+ queue->io_cpu = WORK_CPU_UNBOUND;
+ else
+ queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
}
static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
@@ -2796,6 +2802,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
static int __init nvme_tcp_init_module(void)
{
+ unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
+
BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24);
@@ -2805,8 +2813,10 @@ static int __init nvme_tcp_init_module(void)
BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128);
BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24);
- nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
- WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+ if (wq_unbound)
+ wq_flags |= WQ_UNBOUND;
+
+ nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0);
if (!nvme_tcp_wq)
return -ENOMEM;
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 1c36fcedea..0288315f00 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -119,7 +119,10 @@ static const char *nvme_trace_get_lba_status(struct trace_seq *p,
static const char *nvme_trace_admin_format_nvm(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
- u8 lbaf = cdw10[0] & 0xF;
+ /*
+ * lbafu(bit 13:12) is already in the upper 4 bits, lbafl: bit 03:00.
+ */
+ u8 lbaf = (cdw10[1] & 0x30) | (cdw10[0] & 0xF);
u8 mset = (cdw10[0] >> 4) & 0x1;
u8 pi = (cdw10[0] >> 5) & 0x7;
u8 pil = cdw10[1] & 0x1;
@@ -164,12 +167,27 @@ static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10)
static const char *nvme_trace_zone_mgmt_send(struct trace_seq *p, u8 *cdw10)
{
+ static const char * const zsa_strs[] = {
+ [0x01] = "close zone",
+ [0x02] = "finish zone",
+ [0x03] = "open zone",
+ [0x04] = "reset zone",
+ [0x05] = "offline zone",
+ [0x10] = "set zone descriptor extension"
+ };
const char *ret = trace_seq_buffer_ptr(p);
u64 slba = get_unaligned_le64(cdw10);
+ const char *zsa_str;
u8 zsa = cdw10[12];
u8 all = cdw10[13];
- trace_seq_printf(p, "slba=%llu, zsa=%u, all=%u", slba, zsa, all);
+ if (zsa < ARRAY_SIZE(zsa_strs) && zsa_strs[zsa])
+ zsa_str = zsa_strs[zsa];
+ else
+ zsa_str = "reserved";
+
+ trace_seq_printf(p, "slba=%llu, zsa=%u:%s, all=%u",
+ slba, zsa, zsa_str, all);
trace_seq_putc(p, 0);
return ret;
@@ -177,15 +195,86 @@ static const char *nvme_trace_zone_mgmt_send(struct trace_seq *p, u8 *cdw10)
static const char *nvme_trace_zone_mgmt_recv(struct trace_seq *p, u8 *cdw10)
{
+ static const char * const zrasf_strs[] = {
+ [0x00] = "list all zones",
+ [0x01] = "list the zones in the ZSE: Empty state",
+ [0x02] = "list the zones in the ZSIO: Implicitly Opened state",
+ [0x03] = "list the zones in the ZSEO: Explicitly Opened state",
+ [0x04] = "list the zones in the ZSC: Closed state",
+ [0x05] = "list the zones in the ZSF: Full state",
+ [0x06] = "list the zones in the ZSRO: Read Only state",
+ [0x07] = "list the zones in the ZSO: Offline state",
+ [0x09] = "list the zones that have the zone attribute"
+ };
const char *ret = trace_seq_buffer_ptr(p);
u64 slba = get_unaligned_le64(cdw10);
u32 numd = get_unaligned_le32(cdw10 + 8);
u8 zra = cdw10[12];
u8 zrasf = cdw10[13];
+ const char *zrasf_str;
u8 pr = cdw10[14];
- trace_seq_printf(p, "slba=%llu, numd=%u, zra=%u, zrasf=%u, pr=%u",
- slba, numd, zra, zrasf, pr);
+ if (zrasf < ARRAY_SIZE(zrasf_strs) && zrasf_strs[zrasf])
+ zrasf_str = zrasf_strs[zrasf];
+ else
+ zrasf_str = "reserved";
+
+ trace_seq_printf(p, "slba=%llu, numd=%u, zra=%u, zrasf=%u:%s, pr=%u",
+ slba, numd, zra, zrasf, zrasf_str, pr);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+static const char *nvme_trace_resv_reg(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u8 rrega = cdw10[0] & 0x7;
+ u8 iekey = (cdw10[0] >> 3) & 0x1;
+ u8 ptpl = (cdw10[3] >> 6) & 0x3;
+
+ trace_seq_printf(p, "rrega=%u, iekey=%u, ptpl=%u",
+ rrega, iekey, ptpl);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+static const char *nvme_trace_resv_acq(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u8 racqa = cdw10[0] & 0x7;
+ u8 iekey = (cdw10[0] >> 3) & 0x1;
+ u8 rtype = cdw10[1];
+
+ trace_seq_printf(p, "racqa=%u, iekey=%u, rtype=%u",
+ racqa, iekey, rtype);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+static const char *nvme_trace_resv_rel(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u8 rrela = cdw10[0] & 0x7;
+ u8 iekey = (cdw10[0] >> 3) & 0x1;
+ u8 rtype = cdw10[1];
+
+ trace_seq_printf(p, "rrela=%u, iekey=%u, rtype=%u",
+ rrela, iekey, rtype);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+static const char *nvme_trace_resv_report(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u32 numd = get_unaligned_le32(cdw10);
+ u8 eds = cdw10[4] & 0x1;
+
+ trace_seq_printf(p, "numd=%u, eds=%u", numd, eds);
trace_seq_putc(p, 0);
return ret;
@@ -243,6 +332,14 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p,
return nvme_trace_zone_mgmt_send(p, cdw10);
case nvme_cmd_zone_mgmt_recv:
return nvme_trace_zone_mgmt_recv(p, cdw10);
+ case nvme_cmd_resv_register:
+ return nvme_trace_resv_reg(p, cdw10);
+ case nvme_cmd_resv_acquire:
+ return nvme_trace_resv_acq(p, cdw10);
+ case nvme_cmd_resv_release:
+ return nvme_trace_resv_rel(p, cdw10);
+ case nvme_cmd_resv_report:
+ return nvme_trace_resv_report(p, cdw10);
default:
return nvme_trace_common(p, cdw10);
}
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 499bbb0eee..77aa0f440a 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -7,16 +7,6 @@
#include <linux/vmalloc.h>
#include "nvme.h"
-int nvme_revalidate_zones(struct nvme_ns *ns)
-{
- struct request_queue *q = ns->queue;
-
- blk_queue_chunk_sectors(q, ns->head->zsze);
- blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append);
-
- return blk_revalidate_disk_zones(ns->disk, NULL);
-}
-
static int nvme_set_max_append(struct nvme_ctrl *ctrl)
{
struct nvme_command c = { };
@@ -45,10 +35,10 @@ static int nvme_set_max_append(struct nvme_ctrl *ctrl)
return 0;
}
-int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
+int nvme_query_zone_info(struct nvme_ns *ns, unsigned lbaf,
+ struct nvme_zone_info *zi)
{
struct nvme_effects_log *log = ns->head->effects;
- struct request_queue *q = ns->queue;
struct nvme_command c = { };
struct nvme_id_ns_zns *id;
int status;
@@ -99,25 +89,34 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
goto free_data;
}
- ns->head->zsze =
- nvme_lba_to_sect(ns->head, le64_to_cpu(id->lbafe[lbaf].zsze));
- if (!is_power_of_2(ns->head->zsze)) {
+ zi->zone_size = le64_to_cpu(id->lbafe[lbaf].zsze);
+ if (!is_power_of_2(zi->zone_size)) {
dev_warn(ns->ctrl->device,
- "invalid zone size:%llu for namespace:%u\n",
- ns->head->zsze, ns->head->ns_id);
+ "invalid zone size: %llu for namespace: %u\n",
+ zi->zone_size, ns->head->ns_id);
status = -ENODEV;
goto free_data;
}
+ zi->max_open_zones = le32_to_cpu(id->mor) + 1;
+ zi->max_active_zones = le32_to_cpu(id->mar) + 1;
- disk_set_zoned(ns->disk);
- blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
- disk_set_max_open_zones(ns->disk, le32_to_cpu(id->mor) + 1);
- disk_set_max_active_zones(ns->disk, le32_to_cpu(id->mar) + 1);
free_data:
kfree(id);
return status;
}
+void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim,
+ struct nvme_zone_info *zi)
+{
+ lim->zoned = 1;
+ lim->max_open_zones = zi->max_open_zones;
+ lim->max_active_zones = zi->max_active_zones;
+ lim->max_zone_append_sectors = ns->ctrl->max_zone_append;
+ lim->chunk_sectors = ns->head->zsze =
+ nvme_lba_to_sect(ns->head, zi->zone_size);
+ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ns->queue);
+}
+
static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
unsigned int nr_zones, size_t *buflen)
{