cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

multipath.c (24375B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (c) 2017-2018 Christoph Hellwig.
      4 */
      5
      6#include <linux/backing-dev.h>
      7#include <linux/moduleparam.h>
      8#include <linux/vmalloc.h>
      9#include <trace/events/block.h>
     10#include "nvme.h"
     11
     12bool multipath = true;
     13module_param(multipath, bool, 0444);
     14MODULE_PARM_DESC(multipath,
     15	"turn on native support for multiple controllers per subsystem");
     16
     17static const char *nvme_iopolicy_names[] = {
     18	[NVME_IOPOLICY_NUMA]	= "numa",
     19	[NVME_IOPOLICY_RR]	= "round-robin",
     20};
     21
     22static int iopolicy = NVME_IOPOLICY_NUMA;
     23
     24static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
     25{
     26	if (!val)
     27		return -EINVAL;
     28	if (!strncmp(val, "numa", 4))
     29		iopolicy = NVME_IOPOLICY_NUMA;
     30	else if (!strncmp(val, "round-robin", 11))
     31		iopolicy = NVME_IOPOLICY_RR;
     32	else
     33		return -EINVAL;
     34
     35	return 0;
     36}
     37
     38static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
     39{
     40	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
     41}
     42
     43module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
     44	&iopolicy, 0644);
     45MODULE_PARM_DESC(iopolicy,
     46	"Default multipath I/O policy; 'numa' (default) or 'round-robin'");
     47
     48void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
     49{
     50	subsys->iopolicy = iopolicy;
     51}
     52
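/*
 * Freeze/unfreeze helpers for all multipath request queues in a subsystem.
 * Callers must hold subsys->lock while the nsheads list is walked.
 */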
     53void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
     54{
     55	struct nvme_ns_head *h;
     56
     57	lockdep_assert_held(&subsys->lock);
     58	list_for_each_entry(h, &subsys->nsheads, entry)
     59		if (h->disk)
     60			blk_mq_unfreeze_queue(h->disk->queue);
     61}
     62
     63void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
     64{
     65	struct nvme_ns_head *h;
     66
     67	lockdep_assert_held(&subsys->lock);
     68	list_for_each_entry(h, &subsys->nsheads, entry)
     69		if (h->disk)
     70			blk_mq_freeze_queue_wait(h->disk->queue);
     71}
     72
     73void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
     74{
     75	struct nvme_ns_head *h;
     76
     77	lockdep_assert_held(&subsys->lock);
     78	list_for_each_entry(h, &subsys->nsheads, entry)
     79		if (h->disk)
     80			blk_freeze_queue_start(h->disk->queue);
     81}
     82
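/*
 * Fail over a request that completed with a path or ANA error: clear the
 * cached path, kick an ANA log re-read if needed, move the bios back onto
 * the head's requeue list and let the requeue work pick another path.
 */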
     83void nvme_failover_req(struct request *req)
     84{
     85	struct nvme_ns *ns = req->q->queuedata;
     86	u16 status = nvme_req(req)->status & 0x7ff;
     87	unsigned long flags;
     88	struct bio *bio;
     89
     90	nvme_mpath_clear_current_path(ns);
     91
     92	/*
     93	 * If we got back an ANA error, we know the controller is alive but not
     94	 * ready to serve this namespace.  Kick off a re-read of the ANA
     95	 * information page, and just try any other available path for now.
     96	 */
     97	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
     98		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
     99		queue_work(nvme_wq, &ns->ctrl->ana_work);
    100	}
    101
    102	spin_lock_irqsave(&ns->head->requeue_lock, flags);
    103	for (bio = req->bio; bio; bio = bio->bi_next) {
    104		bio_set_dev(bio, ns->head->disk->part0);
    105		if (bio->bi_opf & REQ_POLLED) {
    106			bio->bi_opf &= ~REQ_POLLED;
    107			bio->bi_cookie = BLK_QC_T_NONE;
    108		}
    109	}
    110	blk_steal_bios(&ns->head->requeue_list, req);
    111	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
    112
    113	blk_mq_end_request(req, 0);
    114	kblockd_schedule_work(&ns->head->requeue_work);
    115}
    116
    117void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
    118{
    119	struct nvme_ns *ns;
    120
    121	down_read(&ctrl->namespaces_rwsem);
    122	list_for_each_entry(ns, &ctrl->namespaces, list) {
    123		if (!ns->head->disk)
    124			continue;
    125		kblockd_schedule_work(&ns->head->requeue_work);
    126		if (ctrl->state == NVME_CTRL_LIVE)
    127			disk_uevent(ns->head->disk, KOBJ_CHANGE);
    128	}
    129	up_read(&ctrl->namespaces_rwsem);
    130}
    131
    132static const char *nvme_ana_state_names[] = {
    133	[0]				= "invalid state",
    134	[NVME_ANA_OPTIMIZED]		= "optimized",
    135	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
    136	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
    137	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
    138	[NVME_ANA_CHANGE]		= "change",
    139};
    140
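/*
 * Drop @ns from the per-node current_path cache.  Returns true if the
 * namespace was cached for at least one node, i.e. a new path lookup will
 * be needed.
 */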
    141bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
    142{
    143	struct nvme_ns_head *head = ns->head;
    144	bool changed = false;
    145	int node;
    146
    147	if (!head)
    148		goto out;
    149
    150	for_each_node(node) {
    151		if (ns == rcu_access_pointer(head->current_path[node])) {
    152			rcu_assign_pointer(head->current_path[node], NULL);
    153			changed = true;
    154		}
    155	}
    156out:
    157	return changed;
    158}
    159
    160void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
    161{
    162	struct nvme_ns *ns;
    163
    164	down_read(&ctrl->namespaces_rwsem);
    165	list_for_each_entry(ns, &ctrl->namespaces, list) {
    166		nvme_mpath_clear_current_path(ns);
    167		kblockd_schedule_work(&ns->head->requeue_work);
    168	}
    169	up_read(&ctrl->namespaces_rwsem);
    170}
    171
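/*
 * After a namespace change, mark sibling paths whose capacity no longer
 * matches the multipath disk as not ready and invalidate all cached paths.
 */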
    172void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
    173{
    174	struct nvme_ns_head *head = ns->head;
    175	sector_t capacity = get_capacity(head->disk);
    176	int node;
    177
    178	list_for_each_entry_rcu(ns, &head->list, siblings) {
    179		if (capacity != get_capacity(ns->disk))
    180			clear_bit(NVME_NS_READY, &ns->flags);
    181	}
    182
    183	for_each_node(node)
    184		rcu_assign_pointer(head->current_path[node], NULL);
    185}
    186
    187static bool nvme_path_is_disabled(struct nvme_ns *ns)
    188{
    189	/*
    190	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
    191	 * still be able to complete assuming that the controller is connected.
    192	 * Otherwise it will fail immediately and return to the requeue list.
    193	 */
    194	if (ns->ctrl->state != NVME_CTRL_LIVE &&
    195	    ns->ctrl->state != NVME_CTRL_DELETING)
    196		return true;
    197	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
    198	    !test_bit(NVME_NS_READY, &ns->flags))
    199		return true;
    200	return false;
    201}
    202
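/*
 * Full path lookup for @node: prefer the ANA-optimized path with the
 * smallest NUMA distance, fall back to the closest non-optimized one, and
 * cache the result in current_path[node].
 */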
    203static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
    204{
    205	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
    206	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
    207
    208	list_for_each_entry_rcu(ns, &head->list, siblings) {
    209		if (nvme_path_is_disabled(ns))
    210			continue;
    211
    212		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
    213			distance = node_distance(node, ns->ctrl->numa_node);
    214		else
    215			distance = LOCAL_DISTANCE;
    216
    217		switch (ns->ana_state) {
    218		case NVME_ANA_OPTIMIZED:
    219			if (distance < found_distance) {
    220				found_distance = distance;
    221				found = ns;
    222			}
    223			break;
    224		case NVME_ANA_NONOPTIMIZED:
    225			if (distance < fallback_distance) {
    226				fallback_distance = distance;
    227				fallback = ns;
    228			}
    229			break;
    230		default:
    231			break;
    232		}
    233	}
    234
    235	if (!found)
    236		found = fallback;
    237	if (found)
    238		rcu_assign_pointer(head->current_path[node], found);
    239	return found;
    240}
    241
    242static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
    243		struct nvme_ns *ns)
    244{
    245	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
    246			siblings);
    247	if (ns)
    248		return ns;
    249	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
    250}
    251
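/*
 * Round-robin selection: start at the path after @old and pick the first
 * usable optimized path, or the last usable non-optimized one, falling
 * back to @old itself when nothing better is found.
 */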
    252static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
    253		int node, struct nvme_ns *old)
    254{
    255	struct nvme_ns *ns, *found = NULL;
    256
    257	if (list_is_singular(&head->list)) {
    258		if (nvme_path_is_disabled(old))
    259			return NULL;
    260		return old;
    261	}
    262
    263	for (ns = nvme_next_ns(head, old);
    264	     ns && ns != old;
    265	     ns = nvme_next_ns(head, ns)) {
    266		if (nvme_path_is_disabled(ns))
    267			continue;
    268
    269		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
    270			found = ns;
    271			goto out;
    272		}
    273		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
    274			found = ns;
    275	}
    276
    277	/*
    278	 * The loop above skips the current path for round-robin semantics.
    279	 * Fall back to the current path if either:
    280	 *  - no other optimized path found and current is optimized,
    281	 *  - no other usable path found and current is usable.
    282	 */
    283	if (!nvme_path_is_disabled(old) &&
    284	    (old->ana_state == NVME_ANA_OPTIMIZED ||
    285	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
    286		return old;
    287
    288	if (!found)
    289		return NULL;
    290out:
    291	rcu_assign_pointer(head->current_path[node], found);
    292	return found;
    293}
    294
    295static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
    296{
    297	return ns->ctrl->state == NVME_CTRL_LIVE &&
    298		ns->ana_state == NVME_ANA_OPTIMIZED;
    299}
    300
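/*
 * Per-bio path lookup: return the cached path for the local node under
 * SRCU protection, falling back to a full search when the cache is empty
 * or the cached path is no longer optimized, or to round-robin selection
 * when that policy is active.
 */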
    301inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
    302{
    303	int node = numa_node_id();
    304	struct nvme_ns *ns;
    305
    306	ns = srcu_dereference(head->current_path[node], &head->srcu);
    307	if (unlikely(!ns))
    308		return __nvme_find_path(head, node);
    309
    310	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
    311		return nvme_round_robin_path(head, node, ns);
    312	if (unlikely(!nvme_path_is_optimized(ns)))
    313		return __nvme_find_path(head, node);
    314	return ns;
    315}
    316
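/*
 * Check whether any path could still become usable (controller live,
 * resetting or connecting and not fail-fast expired).  Used to decide
 * between requeueing and failing a bio when no path is currently usable.
 */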
    317static bool nvme_available_path(struct nvme_ns_head *head)
    318{
    319	struct nvme_ns *ns;
    320
    321	list_for_each_entry_rcu(ns, &head->list, siblings) {
    322		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
    323			continue;
    324		switch (ns->ctrl->state) {
    325		case NVME_CTRL_LIVE:
    326		case NVME_CTRL_RESETTING:
    327		case NVME_CTRL_CONNECTING:
    328			/* fallthru */
    329			return true;
    330		default:
    331			break;
    332		}
    333	}
    334	return false;
    335}
    336
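/*
 * submit_bio entry point for the multipath node: pick a path and remap the
 * bio to it, requeue the bio if a path may come back, or fail it when no
 * path is left.
 */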
    337static void nvme_ns_head_submit_bio(struct bio *bio)
    338{
    339	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
    340	struct device *dev = disk_to_dev(head->disk);
    341	struct nvme_ns *ns;
    342	int srcu_idx;
    343
    344	/*
    345	 * The namespace might be going away and the bio might be moved to a
    346	 * different queue via blk_steal_bios(), so we need to use the bio_split
    347	 * pool from the original queue to allocate the bvecs from.
    348	 */
    349	blk_queue_split(&bio);
    350
    351	srcu_idx = srcu_read_lock(&head->srcu);
    352	ns = nvme_find_path(head);
    353	if (likely(ns)) {
    354		bio_set_dev(bio, ns->disk->part0);
    355		bio->bi_opf |= REQ_NVME_MPATH;
    356		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
    357				      bio->bi_iter.bi_sector);
    358		submit_bio_noacct(bio);
    359	} else if (nvme_available_path(head)) {
    360		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
    361
    362		spin_lock_irq(&head->requeue_lock);
    363		bio_list_add(&head->requeue_list, bio);
    364		spin_unlock_irq(&head->requeue_lock);
    365	} else {
    366		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
    367
    368		bio_io_error(bio);
    369	}
    370
    371	srcu_read_unlock(&head->srcu, srcu_idx);
    372}
    373
    374static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
    375{
    376	if (!nvme_tryget_ns_head(bdev->bd_disk->private_data))
    377		return -ENXIO;
    378	return 0;
    379}
    380
    381static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
    382{
    383	nvme_put_ns_head(disk->private_data);
    384}
    385
    386#ifdef CONFIG_BLK_DEV_ZONED
    387static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
    388		unsigned int nr_zones, report_zones_cb cb, void *data)
    389{
    390	struct nvme_ns_head *head = disk->private_data;
    391	struct nvme_ns *ns;
    392	int srcu_idx, ret = -EWOULDBLOCK;
    393
    394	srcu_idx = srcu_read_lock(&head->srcu);
    395	ns = nvme_find_path(head);
    396	if (ns)
    397		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
    398	srcu_read_unlock(&head->srcu, srcu_idx);
    399	return ret;
    400}
    401#else
    402#define nvme_ns_head_report_zones	NULL
    403#endif /* CONFIG_BLK_DEV_ZONED */
    404
    405const struct block_device_operations nvme_ns_head_ops = {
    406	.owner		= THIS_MODULE,
    407	.submit_bio	= nvme_ns_head_submit_bio,
    408	.open		= nvme_ns_head_open,
    409	.release	= nvme_ns_head_release,
    410	.ioctl		= nvme_ns_head_ioctl,
    411	.getgeo		= nvme_getgeo,
    412	.report_zones	= nvme_ns_head_report_zones,
    413	.pr_ops		= &nvme_pr_ops,
    414};
    415
    416static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
    417{
    418	return container_of(cdev, struct nvme_ns_head, cdev);
    419}
    420
    421static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
    422{
    423	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
    424		return -ENXIO;
    425	return 0;
    426}
    427
    428static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
    429{
    430	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
    431	return 0;
    432}
    433
    434static const struct file_operations nvme_ns_head_chr_fops = {
    435	.owner		= THIS_MODULE,
    436	.open		= nvme_ns_head_chr_open,
    437	.release	= nvme_ns_head_chr_release,
    438	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
    439	.compat_ioctl	= compat_ptr_ioctl,
    440	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
    441};
    442
    443static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
    444{
    445	int ret;
    446
    447	head->cdev_device.parent = &head->subsys->dev;
    448	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
    449			   head->subsys->instance, head->instance);
    450	if (ret)
    451		return ret;
    452	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
    453			    &nvme_ns_head_chr_fops, THIS_MODULE);
    454	return ret;
    455}
    456
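/*
 * Requeue work: drain the head's requeue list and resubmit the bios, which
 * routes them through path selection again.
 */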
    457static void nvme_requeue_work(struct work_struct *work)
    458{
    459	struct nvme_ns_head *head =
    460		container_of(work, struct nvme_ns_head, requeue_work);
    461	struct bio *bio, *next;
    462
    463	spin_lock_irq(&head->requeue_lock);
    464	next = bio_list_get(&head->requeue_list);
    465	spin_unlock_irq(&head->requeue_lock);
    466
    467	while ((bio = next) != NULL) {
    468		next = bio->bi_next;
    469		bio->bi_next = NULL;
    470
    471		submit_bio_noacct(bio);
    472	}
    473}
    474
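/*
 * Allocate and set up the multipath gendisk for a namespace head.  Returns
 * 0 without creating a disk when native multipath is disabled or not
 * applicable to this head.
 */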
    475int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
    476{
    477	bool vwc = false;
    478
    479	mutex_init(&head->lock);
    480	bio_list_init(&head->requeue_list);
    481	spin_lock_init(&head->requeue_lock);
    482	INIT_WORK(&head->requeue_work, nvme_requeue_work);
    483
    484	/*
    485	 * Add a multipath node if the subsystem supports multiple controllers.
    486	 * We also do this for private namespaces as the namespace sharing flag
    487	 * could change after a rescan.
    488	 */
    489	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
    490	    !nvme_is_unique_nsid(ctrl, head) || !multipath)
    491		return 0;
    492
    493	head->disk = blk_alloc_disk(ctrl->numa_node);
    494	if (!head->disk)
    495		return -ENOMEM;
    496	head->disk->fops = &nvme_ns_head_ops;
    497	head->disk->private_data = head;
    498	sprintf(head->disk->disk_name, "nvme%dn%d",
    499			ctrl->subsys->instance, head->instance);
    500
    501	blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
    502	blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
    503	/*
    504	 * This assumes that all controllers referring to a namespace either
    505	 * all support poll queues or none do.  That is not a strict guarantee,
    506	 * but if the assumption is wrong the effect is only suboptimal
    507	 * performance, not a correctness problem.
    508	 */
    509	if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL &&
    510	    ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
    511		blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);
    512
    513	/* set to a default value of 512 until the disk is validated */
    514	blk_queue_logical_block_size(head->disk->queue, 512);
    515	blk_set_stacking_limits(&head->disk->queue->limits);
    516
    517	/* we need to propagate up the VWC settings */
    518	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
    519		vwc = true;
    520	blk_queue_write_cache(head->disk->queue, vwc, vwc);
    521	return 0;
    522}
    523
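/*
 * Make a path live: register the multipath disk and char device on first
 * use, pre-populate the per-node path cache if this path is optimized, and
 * kick the requeue work so pending I/O can make progress.
 */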
    524static void nvme_mpath_set_live(struct nvme_ns *ns)
    525{
    526	struct nvme_ns_head *head = ns->head;
    527	int rc;
    528
    529	if (!head->disk)
    530		return;
    531
    532	/*
    533	 * test_and_set_bit() is used to protect against two nvme paths
    534	 * simultaneously calling device_add_disk() on the same namespace
    535	 * head.
    536	 */
    537	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
    538		rc = device_add_disk(&head->subsys->dev, head->disk,
    539				     nvme_ns_id_attr_groups);
    540		if (rc) {
    541			clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
    542			return;
    543		}
    544		nvme_add_ns_head_cdev(head);
    545	}
    546
    547	mutex_lock(&head->lock);
    548	if (nvme_path_is_optimized(ns)) {
    549		int node, srcu_idx;
    550
    551		srcu_idx = srcu_read_lock(&head->srcu);
    552		for_each_node(node)
    553			__nvme_find_path(head, node);
    554		srcu_read_unlock(&head->srcu, srcu_idx);
    555	}
    556	mutex_unlock(&head->lock);
    557
    558	synchronize_srcu(&head->srcu);
    559	kblockd_schedule_work(&head->requeue_work);
    560}
    561
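/*
 * Walk all group descriptors in the ANA log buffer, sanity-checking offsets
 * and field values, and invoke @cb for each descriptor.  A non-zero return
 * from @cb stops the walk and is passed back to the caller.
 */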
    562static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
    563		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
    564			void *))
    565{
    566	void *base = ctrl->ana_log_buf;
    567	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
    568	int error, i;
    569
    570	lockdep_assert_held(&ctrl->ana_lock);
    571
    572	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
    573		struct nvme_ana_group_desc *desc = base + offset;
    574		u32 nr_nsids;
    575		size_t nsid_buf_size;
    576
    577		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
    578			return -EINVAL;
    579
    580		nr_nsids = le32_to_cpu(desc->nnsids);
    581		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);
    582
    583		if (WARN_ON_ONCE(desc->grpid == 0))
    584			return -EINVAL;
    585		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
    586			return -EINVAL;
    587		if (WARN_ON_ONCE(desc->state == 0))
    588			return -EINVAL;
    589		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
    590			return -EINVAL;
    591
    592		offset += sizeof(*desc);
    593		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
    594			return -EINVAL;
    595
    596		error = cb(ctrl, desc, data);
    597		if (error)
    598			return error;
    599
    600		offset += nsid_buf_size;
    601	}
    602
    603	return 0;
    604}
    605
    606static inline bool nvme_state_is_live(enum nvme_ana_state state)
    607{
    608	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
    609}
    610
    611static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
    612		struct nvme_ns *ns)
    613{
    614	ns->ana_grpid = le32_to_cpu(desc->grpid);
    615	ns->ana_state = desc->state;
    616	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
    617	/*
    618	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
    619	 * and in turn to this path device.  However we cannot accept this I/O
    620	 * if the controller is not live.  This may deadlock if called from
    621	 * nvme_mpath_init_identify() and the ctrl will never complete
    622	 * initialization, preventing I/O from completing.  For this case we
    623	 * will reprocess the ANA log page in nvme_mpath_update() once the
    624	 * controller is ready.
    625	 */
    626	if (nvme_state_is_live(ns->ana_state) &&
    627	    ns->ctrl->state == NVME_CTRL_LIVE)
    628		nvme_mpath_set_live(ns);
    629}
    630
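/*
 * Apply one ANA group descriptor to all matching namespaces.  The walk
 * relies on both the controller's namespace list and the descriptor's NSID
 * array being sorted by NSID, so a single merge-style pass is enough.
 */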
    631static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
    632		struct nvme_ana_group_desc *desc, void *data)
    633{
    634	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
    635	unsigned *nr_change_groups = data;
    636	struct nvme_ns *ns;
    637
    638	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
    639			le32_to_cpu(desc->grpid),
    640			nvme_ana_state_names[desc->state]);
    641
    642	if (desc->state == NVME_ANA_CHANGE)
    643		(*nr_change_groups)++;
    644
    645	if (!nr_nsids)
    646		return 0;
    647
    648	down_read(&ctrl->namespaces_rwsem);
    649	list_for_each_entry(ns, &ctrl->namespaces, list) {
    650		unsigned nsid;
    651again:
    652		nsid = le32_to_cpu(desc->nsids[n]);
    653		if (ns->head->ns_id < nsid)
    654			continue;
    655		if (ns->head->ns_id == nsid)
    656			nvme_update_ns_ana_state(desc, ns);
    657		if (++n == nr_nsids)
    658			break;
    659		if (ns->head->ns_id > nsid)
    660			goto again;
    661	}
    662	up_read(&ctrl->namespaces_rwsem);
    663	return 0;
    664}
    665
    666static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
    667{
    668	u32 nr_change_groups = 0;
    669	int error;
    670
    671	mutex_lock(&ctrl->ana_lock);
    672	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
    673			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
    674	if (error) {
    675		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
    676		goto out_unlock;
    677	}
    678
    679	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
    680			nvme_update_ana_state);
    681	if (error)
    682		goto out_unlock;
    683
    684	/*
    685	 * In theory we should have an ANATT timer per group as they might enter
    686	 * the change state at different times.  But that is a lot of overhead
    687	 * just to protect against a target that keeps entering new change
    688	 * states while never finishing previous ones.  But we'll still
    689	 * eventually time out once all groups are in change state, so this
    690	 * isn't a big deal.
    691	 *
    692	 * We also double the ANATT value to provide some slack for transports
    693	 * or AEN processing overhead.
    694	 */
    695	if (nr_change_groups)
    696		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
    697	else
    698		del_timer_sync(&ctrl->anatt_timer);
    699out_unlock:
    700	mutex_unlock(&ctrl->ana_lock);
    701	return error;
    702}
    703
    704static void nvme_ana_work(struct work_struct *work)
    705{
    706	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
    707
    708	if (ctrl->state != NVME_CTRL_LIVE)
    709		return;
    710
    711	nvme_read_ana_log(ctrl);
    712}
    713
    714void nvme_mpath_update(struct nvme_ctrl *ctrl)
    715{
    716	u32 nr_change_groups = 0;
    717
    718	if (!ctrl->ana_log_buf)
    719		return;
    720
    721	mutex_lock(&ctrl->ana_lock);
    722	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
    723	mutex_unlock(&ctrl->ana_lock);
    724}
    725
    726static void nvme_anatt_timeout(struct timer_list *t)
    727{
    728	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
    729
    730	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
    731	nvme_reset_ctrl(ctrl);
    732}
    733
    734void nvme_mpath_stop(struct nvme_ctrl *ctrl)
    735{
    736	if (!nvme_ctrl_use_ana(ctrl))
    737		return;
    738	del_timer_sync(&ctrl->anatt_timer);
    739	cancel_work_sync(&ctrl->ana_work);
    740}
    741
    742#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
    743	struct device_attribute subsys_attr_##_name =	\
    744		__ATTR(_name, _mode, _show, _store)
    745
    746static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
    747		struct device_attribute *attr, char *buf)
    748{
    749	struct nvme_subsystem *subsys =
    750		container_of(dev, struct nvme_subsystem, dev);
    751
    752	return sysfs_emit(buf, "%s\n",
    753			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
    754}
    755
    756static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
    757		struct device_attribute *attr, const char *buf, size_t count)
    758{
    759	struct nvme_subsystem *subsys =
    760		container_of(dev, struct nvme_subsystem, dev);
    761	int i;
    762
    763	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
    764		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
    765			WRITE_ONCE(subsys->iopolicy, i);
    766			return count;
    767		}
    768	}
    769
    770	return -EINVAL;
    771}
    772SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
    773		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
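/*
 * The policy can also be changed per subsystem at runtime via sysfs, e.g.
 * (path shown as an illustrative example):
 *   echo round-robin > /sys/class/nvme-subsystem/nvme-subsys0/iopolicy
 */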
    774
    775static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
    776		char *buf)
    777{
    778	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
    779}
    780DEVICE_ATTR_RO(ana_grpid);
    781
    782static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
    783		char *buf)
    784{
    785	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
    786
    787	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
    788}
    789DEVICE_ATTR_RO(ana_state);
    790
    791static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
    792		struct nvme_ana_group_desc *desc, void *data)
    793{
    794	struct nvme_ana_group_desc *dst = data;
    795
    796	if (desc->grpid != dst->grpid)
    797		return 0;
    798
    799	*dst = *desc;
    800	return -ENXIO; /* just break out of the loop */
    801}
    802
    803void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
    804{
    805	if (nvme_ctrl_use_ana(ns->ctrl)) {
    806		struct nvme_ana_group_desc desc = {
    807			.grpid = id->anagrpid,
    808			.state = 0,
    809		};
    810
    811		mutex_lock(&ns->ctrl->ana_lock);
    812		ns->ana_grpid = le32_to_cpu(id->anagrpid);
    813		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
    814		mutex_unlock(&ns->ctrl->ana_lock);
    815		if (desc.state) {
    816			/* found the group desc: update */
    817			nvme_update_ns_ana_state(&desc, ns);
    818		} else {
    819			/* group desc not found: trigger a re-read */
    820			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
    821			queue_work(nvme_wq, &ns->ctrl->ana_work);
    822		}
    823	} else {
    824		ns->ana_state = NVME_ANA_OPTIMIZED;
    825		nvme_mpath_set_live(ns);
    826	}
    827
    828	if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
    829		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
    830				   ns->head->disk->queue);
    831#ifdef CONFIG_BLK_DEV_ZONED
    832	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
    833		ns->head->disk->queue->nr_zones = ns->queue->nr_zones;
    834#endif
    835}
    836
    837void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
    838{
    839	if (!head->disk)
    840		return;
    841	kblockd_schedule_work(&head->requeue_work);
    842	if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
    843		nvme_cdev_del(&head->cdev, &head->cdev_device);
    844		del_gendisk(head->disk);
    845	}
    846}
    847
    848void nvme_mpath_remove_disk(struct nvme_ns_head *head)
    849{
    850	if (!head->disk)
    851		return;
    852	blk_mark_disk_dead(head->disk);
    853	/* make sure all pending bios are cleaned up */
    854	kblockd_schedule_work(&head->requeue_work);
    855	flush_work(&head->requeue_work);
    856	blk_cleanup_disk(head->disk);
    857}
    858
    859void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
    860{
    861	mutex_init(&ctrl->ana_lock);
    862	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
    863	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
    864}
    865
    866int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
    867{
    868	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
    869	size_t ana_log_size;
    870	int error = 0;
    871
    872	/* check if multipath is enabled and we have the capability */
    873	if (!multipath || !ctrl->subsys ||
    874	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
    875		return 0;
    876
    877	if (!ctrl->max_namespaces ||
    878	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
    879		dev_err(ctrl->device,
    880			"Invalid MNAN value %u\n", ctrl->max_namespaces);
    881		return -EINVAL;
    882	}
    883
    884	ctrl->anacap = id->anacap;
    885	ctrl->anatt = id->anatt;
    886	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
    887	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
    888
    889	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
    890		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
    891		ctrl->max_namespaces * sizeof(__le32);
    892	if (ana_log_size > max_transfer_size) {
    893		dev_err(ctrl->device,
    894			"ANA log page size (%zd) larger than MDTS (%zd).\n",
    895			ana_log_size, max_transfer_size);
    896		dev_err(ctrl->device, "disabling ANA support.\n");
    897		goto out_uninit;
    898	}
    899	if (ana_log_size > ctrl->ana_log_size) {
    900		nvme_mpath_stop(ctrl);
    901		nvme_mpath_uninit(ctrl);
    902		ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
    903		if (!ctrl->ana_log_buf)
    904			return -ENOMEM;
    905	}
    906	ctrl->ana_log_size = ana_log_size;
    907	error = nvme_read_ana_log(ctrl);
    908	if (error)
    909		goto out_uninit;
    910	return 0;
    911
    912out_uninit:
    913	nvme_mpath_uninit(ctrl);
    914	return error;
    915}
    916
    917void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
    918{
    919	kvfree(ctrl->ana_log_buf);
    920	ctrl->ana_log_buf = NULL;
    921	ctrl->ana_log_size = 0;
    922}