cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

core.c (134008B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * NVM Express device driver
      4 * Copyright (c) 2011-2014, Intel Corporation.
      5 */
      6
      7#include <linux/blkdev.h>
      8#include <linux/blk-mq.h>
      9#include <linux/blk-integrity.h>
     10#include <linux/compat.h>
     11#include <linux/delay.h>
     12#include <linux/errno.h>
     13#include <linux/hdreg.h>
     14#include <linux/kernel.h>
     15#include <linux/module.h>
     16#include <linux/backing-dev.h>
     17#include <linux/slab.h>
     18#include <linux/types.h>
     19#include <linux/pr.h>
     20#include <linux/ptrace.h>
     21#include <linux/nvme_ioctl.h>
     22#include <linux/pm_qos.h>
     23#include <asm/unaligned.h>
     24
     25#include "nvme.h"
     26#include "fabrics.h"
     27
     28#define CREATE_TRACE_POINTS
     29#include "trace.h"
     30
     31#define NVME_MINORS		(1U << MINORBITS)
     32
     33unsigned int admin_timeout = 60;
     34module_param(admin_timeout, uint, 0644);
     35MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
     36EXPORT_SYMBOL_GPL(admin_timeout);
     37
     38unsigned int nvme_io_timeout = 30;
     39module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
     40MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
     41EXPORT_SYMBOL_GPL(nvme_io_timeout);
     42
     43static unsigned char shutdown_timeout = 5;
     44module_param(shutdown_timeout, byte, 0644);
     45MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
     46
     47static u8 nvme_max_retries = 5;
     48module_param_named(max_retries, nvme_max_retries, byte, 0644);
     49MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
     50
     51static unsigned long default_ps_max_latency_us = 100000;
     52module_param(default_ps_max_latency_us, ulong, 0644);
     53MODULE_PARM_DESC(default_ps_max_latency_us,
     54		 "max power saving latency for new devices; use PM QOS to change per device");
     55
     56static bool force_apst;
     57module_param(force_apst, bool, 0644);
     58MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
     59
     60static unsigned long apst_primary_timeout_ms = 100;
     61module_param(apst_primary_timeout_ms, ulong, 0644);
     62MODULE_PARM_DESC(apst_primary_timeout_ms,
     63	"primary APST timeout in ms");
     64
     65static unsigned long apst_secondary_timeout_ms = 2000;
     66module_param(apst_secondary_timeout_ms, ulong, 0644);
     67MODULE_PARM_DESC(apst_secondary_timeout_ms,
     68	"secondary APST timeout in ms");
     69
     70static unsigned long apst_primary_latency_tol_us = 15000;
     71module_param(apst_primary_latency_tol_us, ulong, 0644);
     72MODULE_PARM_DESC(apst_primary_latency_tol_us,
     73	"primary APST latency tolerance in us");
     74
     75static unsigned long apst_secondary_latency_tol_us = 100000;
     76module_param(apst_secondary_latency_tol_us, ulong, 0644);
     77MODULE_PARM_DESC(apst_secondary_latency_tol_us,
     78	"secondary APST latency tolerance in us");
     79
     80/*
     81 * nvme_wq - hosts nvme related works that are not reset or delete
     82 * nvme_reset_wq - hosts nvme reset works
     83 * nvme_delete_wq - hosts nvme delete works
     84 *
     85 * nvme_wq will host works such as scan, aen handling, fw activation,
     86 * keep-alive, periodic reconnects etc. nvme_reset_wq
     87 * runs reset works which also flush works hosted on nvme_wq for
     88 * serialization purposes. nvme_delete_wq hosts controller deletion
     89 * works, which flush reset works for serialization.
     90 */
     91struct workqueue_struct *nvme_wq;
     92EXPORT_SYMBOL_GPL(nvme_wq);
     93
     94struct workqueue_struct *nvme_reset_wq;
     95EXPORT_SYMBOL_GPL(nvme_reset_wq);
     96
     97struct workqueue_struct *nvme_delete_wq;
     98EXPORT_SYMBOL_GPL(nvme_delete_wq);
     99
    100static LIST_HEAD(nvme_subsystems);
    101static DEFINE_MUTEX(nvme_subsystems_lock);
    102
    103static DEFINE_IDA(nvme_instance_ida);
    104static dev_t nvme_ctrl_base_chr_devt;
    105static struct class *nvme_class;
    106static struct class *nvme_subsys_class;
    107
    108static DEFINE_IDA(nvme_ns_chr_minor_ida);
    109static dev_t nvme_ns_chr_devt;
    110static struct class *nvme_ns_chr_class;
    111
    112static void nvme_put_subsystem(struct nvme_subsystem *subsys);
    113static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
    114					   unsigned nsid);
    115static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
    116				   struct nvme_command *cmd);
    117
    118void nvme_queue_scan(struct nvme_ctrl *ctrl)
    119{
    120	/*
    121	 * Only queue new scan work when the admin and IO queues are both alive
    122	 */
    123	if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
    124		queue_work(nvme_wq, &ctrl->scan_work);
    125}
    126
    127/*
    128 * Use this function to proceed with scheduling reset_work for a controller
    129 * that had previously been set to the resetting state. This is intended for
    130 * code paths that can't be interrupted by other reset attempts. A hot removal
    131 * may prevent this from succeeding.
    132 */
    133int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
    134{
    135	if (ctrl->state != NVME_CTRL_RESETTING)
    136		return -EBUSY;
    137	if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
    138		return -EBUSY;
    139	return 0;
    140}
    141EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
    142
    143static void nvme_failfast_work(struct work_struct *work)
    144{
    145	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
    146			struct nvme_ctrl, failfast_work);
    147
    148	if (ctrl->state != NVME_CTRL_CONNECTING)
    149		return;
    150
    151	set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
    152	dev_info(ctrl->device, "failfast expired\n");
    153	nvme_kick_requeue_lists(ctrl);
    154}
    155
    156static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
    157{
    158	if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1)
    159		return;
    160
    161	schedule_delayed_work(&ctrl->failfast_work,
    162			      ctrl->opts->fast_io_fail_tmo * HZ);
    163}
    164
    165static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
    166{
    167	if (!ctrl->opts)
    168		return;
    169
    170	cancel_delayed_work_sync(&ctrl->failfast_work);
    171	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
    172}
    173
    174
    175int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
    176{
    177	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
    178		return -EBUSY;
    179	if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
    180		return -EBUSY;
    181	return 0;
    182}
    183EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
    184
    185int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
    186{
    187	int ret;
    188
    189	ret = nvme_reset_ctrl(ctrl);
    190	if (!ret) {
    191		flush_work(&ctrl->reset_work);
    192		if (ctrl->state != NVME_CTRL_LIVE)
    193			ret = -ENETRESET;
    194	}
    195
    196	return ret;
    197}
    198
    199static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
    200{
    201	dev_info(ctrl->device,
    202		 "Removing ctrl: NQN \"%s\"\n", nvmf_ctrl_subsysnqn(ctrl));
    203
    204	flush_work(&ctrl->reset_work);
    205	nvme_stop_ctrl(ctrl);
    206	nvme_remove_namespaces(ctrl);
    207	ctrl->ops->delete_ctrl(ctrl);
    208	nvme_uninit_ctrl(ctrl);
    209}
    210
    211static void nvme_delete_ctrl_work(struct work_struct *work)
    212{
    213	struct nvme_ctrl *ctrl =
    214		container_of(work, struct nvme_ctrl, delete_work);
    215
    216	nvme_do_delete_ctrl(ctrl);
    217}
    218
    219int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
    220{
    221	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
    222		return -EBUSY;
    223	if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
    224		return -EBUSY;
    225	return 0;
    226}
    227EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
    228
    229static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
    230{
    231	/*
    232	 * Keep a reference until nvme_do_delete_ctrl() completes,
    233	 * since ->delete_ctrl can free the controller.
    234	 */
    235	nvme_get_ctrl(ctrl);
    236	if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
    237		nvme_do_delete_ctrl(ctrl);
    238	nvme_put_ctrl(ctrl);
    239}
    240
    241static blk_status_t nvme_error_status(u16 status)
    242{
    243	switch (status & 0x7ff) {
    244	case NVME_SC_SUCCESS:
    245		return BLK_STS_OK;
    246	case NVME_SC_CAP_EXCEEDED:
    247		return BLK_STS_NOSPC;
    248	case NVME_SC_LBA_RANGE:
    249	case NVME_SC_CMD_INTERRUPTED:
    250	case NVME_SC_NS_NOT_READY:
    251		return BLK_STS_TARGET;
    252	case NVME_SC_BAD_ATTRIBUTES:
    253	case NVME_SC_ONCS_NOT_SUPPORTED:
    254	case NVME_SC_INVALID_OPCODE:
    255	case NVME_SC_INVALID_FIELD:
    256	case NVME_SC_INVALID_NS:
    257		return BLK_STS_NOTSUPP;
    258	case NVME_SC_WRITE_FAULT:
    259	case NVME_SC_READ_ERROR:
    260	case NVME_SC_UNWRITTEN_BLOCK:
    261	case NVME_SC_ACCESS_DENIED:
    262	case NVME_SC_READ_ONLY:
    263	case NVME_SC_COMPARE_FAILED:
    264		return BLK_STS_MEDIUM;
    265	case NVME_SC_GUARD_CHECK:
    266	case NVME_SC_APPTAG_CHECK:
    267	case NVME_SC_REFTAG_CHECK:
    268	case NVME_SC_INVALID_PI:
    269		return BLK_STS_PROTECTION;
    270	case NVME_SC_RESERVATION_CONFLICT:
    271		return BLK_STS_NEXUS;
    272	case NVME_SC_HOST_PATH_ERROR:
    273		return BLK_STS_TRANSPORT;
    274	case NVME_SC_ZONE_TOO_MANY_ACTIVE:
    275		return BLK_STS_ZONE_ACTIVE_RESOURCE;
    276	case NVME_SC_ZONE_TOO_MANY_OPEN:
    277		return BLK_STS_ZONE_OPEN_RESOURCE;
    278	default:
    279		return BLK_STS_IOERR;
    280	}
    281}
    282
    283static void nvme_retry_req(struct request *req)
    284{
    285	unsigned long delay = 0;
    286	u16 crd;
    287
    288	/* The mask and shift result must be <= 3 */
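       	/* crdt[] values are reported by the controller in units of 100 ms, so the
       	 * computed delay is in milliseconds. */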
    289	crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
    290	if (crd)
    291		delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;
    292
    293	nvme_req(req)->retries++;
    294	blk_mq_requeue_request(req, false);
    295	blk_mq_delay_kick_requeue_list(req->q, delay);
    296}
    297
    298static void nvme_log_error(struct request *req)
    299{
    300	struct nvme_ns *ns = req->q->queuedata;
    301	struct nvme_request *nr = nvme_req(req);
    302
    303	if (ns) {
    304		pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
    305		       ns->disk ? ns->disk->disk_name : "?",
    306		       nvme_get_opcode_str(nr->cmd->common.opcode),
    307		       nr->cmd->common.opcode,
    308		       (unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)),
    309		       (unsigned long long)blk_rq_bytes(req) >> ns->lba_shift,
    310		       nvme_get_error_status_str(nr->status),
    311		       nr->status >> 8 & 7,	/* Status Code Type */
    312		       nr->status & 0xff,	/* Status Code */
    313		       nr->status & NVME_SC_MORE ? "MORE " : "",
    314		       nr->status & NVME_SC_DNR  ? "DNR "  : "");
    315		return;
    316	}
    317
    318	pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s\n",
    319			   dev_name(nr->ctrl->device),
    320			   nvme_get_admin_opcode_str(nr->cmd->common.opcode),
    321			   nr->cmd->common.opcode,
    322			   nvme_get_error_status_str(nr->status),
    323			   nr->status >> 8 & 7,	/* Status Code Type */
    324			   nr->status & 0xff,	/* Status Code */
    325			   nr->status & NVME_SC_MORE ? "MORE " : "",
    326			   nr->status & NVME_SC_DNR  ? "DNR "  : "");
    327}
    328
    329enum nvme_disposition {
    330	COMPLETE,
    331	RETRY,
    332	FAILOVER,
    333};
    334
    335static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
    336{
    337	if (likely(nvme_req(req)->status == 0))
    338		return COMPLETE;
    339
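       	/* A noretry request, a controller-set DNR (Do Not Retry) bit, or an
       	 * exhausted retry budget means the error is returned as-is. */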
    340	if (blk_noretry_request(req) ||
    341	    (nvme_req(req)->status & NVME_SC_DNR) ||
    342	    nvme_req(req)->retries >= nvme_max_retries)
    343		return COMPLETE;
    344
    345	if (req->cmd_flags & REQ_NVME_MPATH) {
    346		if (nvme_is_path_error(nvme_req(req)->status) ||
    347		    blk_queue_dying(req->q))
    348			return FAILOVER;
    349	} else {
    350		if (blk_queue_dying(req->q))
    351			return COMPLETE;
    352	}
    353
    354	return RETRY;
    355}
    356
    357static inline void nvme_end_req_zoned(struct request *req)
    358{
    359	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
    360	    req_op(req) == REQ_OP_ZONE_APPEND)
    361		req->__sector = nvme_lba_to_sect(req->q->queuedata,
    362			le64_to_cpu(nvme_req(req)->result.u64));
    363}
    364
    365static inline void nvme_end_req(struct request *req)
    366{
    367	blk_status_t status = nvme_error_status(nvme_req(req)->status);
    368
    369	if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET)))
    370		nvme_log_error(req);
    371	nvme_end_req_zoned(req);
    372	nvme_trace_bio_complete(req);
    373	blk_mq_end_request(req, status);
    374}
    375
    376void nvme_complete_rq(struct request *req)
    377{
    378	trace_nvme_complete_rq(req);
    379	nvme_cleanup_cmd(req);
    380
    381	if (nvme_req(req)->ctrl->kas)
    382		nvme_req(req)->ctrl->comp_seen = true;
    383
    384	switch (nvme_decide_disposition(req)) {
    385	case COMPLETE:
    386		nvme_end_req(req);
    387		return;
    388	case RETRY:
    389		nvme_retry_req(req);
    390		return;
    391	case FAILOVER:
    392		nvme_failover_req(req);
    393		return;
    394	}
    395}
    396EXPORT_SYMBOL_GPL(nvme_complete_rq);
    397
    398void nvme_complete_batch_req(struct request *req)
    399{
    400	trace_nvme_complete_rq(req);
    401	nvme_cleanup_cmd(req);
    402	nvme_end_req_zoned(req);
    403}
    404EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
    405
    406/*
    407 * Called to unwind from ->queue_rq on a failed command submission so that the
    408 * multipathing code gets called to potentially failover to another path.
    409 * The caller needs to unwind all transport specific resource allocations and
    410 * must propagate the return value.
    411 */
    412blk_status_t nvme_host_path_error(struct request *req)
    413{
    414	nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
    415	blk_mq_set_request_complete(req);
    416	nvme_complete_rq(req);
    417	return BLK_STS_OK;
    418}
    419EXPORT_SYMBOL_GPL(nvme_host_path_error);
    420
    421bool nvme_cancel_request(struct request *req, void *data, bool reserved)
    422{
    423	dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
    424				"Cancelling I/O %d", req->tag);
    425
    426	/* don't abort a completed request */
    427	if (blk_mq_request_completed(req))
    428		return true;
    429
    430	nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
    431	nvme_req(req)->flags |= NVME_REQ_CANCELLED;
    432	blk_mq_complete_request(req);
    433	return true;
    434}
    435EXPORT_SYMBOL_GPL(nvme_cancel_request);
    436
    437void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
    438{
    439	if (ctrl->tagset) {
    440		blk_mq_tagset_busy_iter(ctrl->tagset,
    441				nvme_cancel_request, ctrl);
    442		blk_mq_tagset_wait_completed_request(ctrl->tagset);
    443	}
    444}
    445EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
    446
    447void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
    448{
    449	if (ctrl->admin_tagset) {
    450		blk_mq_tagset_busy_iter(ctrl->admin_tagset,
    451				nvme_cancel_request, ctrl);
    452		blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
    453	}
    454}
    455EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
    456
    457bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
    458		enum nvme_ctrl_state new_state)
    459{
    460	enum nvme_ctrl_state old_state;
    461	unsigned long flags;
    462	bool changed = false;
    463
    464	spin_lock_irqsave(&ctrl->lock, flags);
    465
    466	old_state = ctrl->state;
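       	/* Each target state below lists the old states it may be entered from;
       	 * any other transition leaves the state unchanged and returns false. */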
    467	switch (new_state) {
    468	case NVME_CTRL_LIVE:
    469		switch (old_state) {
    470		case NVME_CTRL_NEW:
    471		case NVME_CTRL_RESETTING:
    472		case NVME_CTRL_CONNECTING:
    473			changed = true;
    474			fallthrough;
    475		default:
    476			break;
    477		}
    478		break;
    479	case NVME_CTRL_RESETTING:
    480		switch (old_state) {
    481		case NVME_CTRL_NEW:
    482		case NVME_CTRL_LIVE:
    483			changed = true;
    484			fallthrough;
    485		default:
    486			break;
    487		}
    488		break;
    489	case NVME_CTRL_CONNECTING:
    490		switch (old_state) {
    491		case NVME_CTRL_NEW:
    492		case NVME_CTRL_RESETTING:
    493			changed = true;
    494			fallthrough;
    495		default:
    496			break;
    497		}
    498		break;
    499	case NVME_CTRL_DELETING:
    500		switch (old_state) {
    501		case NVME_CTRL_LIVE:
    502		case NVME_CTRL_RESETTING:
    503		case NVME_CTRL_CONNECTING:
    504			changed = true;
    505			fallthrough;
    506		default:
    507			break;
    508		}
    509		break;
    510	case NVME_CTRL_DELETING_NOIO:
    511		switch (old_state) {
    512		case NVME_CTRL_DELETING:
    513		case NVME_CTRL_DEAD:
    514			changed = true;
    515			fallthrough;
    516		default:
    517			break;
    518		}
    519		break;
    520	case NVME_CTRL_DEAD:
    521		switch (old_state) {
    522		case NVME_CTRL_DELETING:
    523			changed = true;
    524			fallthrough;
    525		default:
    526			break;
    527		}
    528		break;
    529	default:
    530		break;
    531	}
    532
    533	if (changed) {
    534		ctrl->state = new_state;
    535		wake_up_all(&ctrl->state_wq);
    536	}
    537
    538	spin_unlock_irqrestore(&ctrl->lock, flags);
    539	if (!changed)
    540		return false;
    541
    542	if (ctrl->state == NVME_CTRL_LIVE) {
    543		if (old_state == NVME_CTRL_CONNECTING)
    544			nvme_stop_failfast_work(ctrl);
    545		nvme_kick_requeue_lists(ctrl);
    546	} else if (ctrl->state == NVME_CTRL_CONNECTING &&
    547		old_state == NVME_CTRL_RESETTING) {
    548		nvme_start_failfast_work(ctrl);
    549	}
    550	return changed;
    551}
    552EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
    553
    554/*
    555 * Returns true for sink states that can't ever transition back to live.
    556 */
    557static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
    558{
    559	switch (ctrl->state) {
    560	case NVME_CTRL_NEW:
    561	case NVME_CTRL_LIVE:
    562	case NVME_CTRL_RESETTING:
    563	case NVME_CTRL_CONNECTING:
    564		return false;
    565	case NVME_CTRL_DELETING:
    566	case NVME_CTRL_DELETING_NOIO:
    567	case NVME_CTRL_DEAD:
    568		return true;
    569	default:
    570		WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
    571		return true;
    572	}
    573}
    574
    575/*
    576 * Waits for the controller state to be resetting, or returns false if it is
    577 * not possible to ever transition to that state.
    578 */
    579bool nvme_wait_reset(struct nvme_ctrl *ctrl)
    580{
    581	wait_event(ctrl->state_wq,
    582		   nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
    583		   nvme_state_terminal(ctrl));
    584	return ctrl->state == NVME_CTRL_RESETTING;
    585}
    586EXPORT_SYMBOL_GPL(nvme_wait_reset);
    587
    588static void nvme_free_ns_head(struct kref *ref)
    589{
    590	struct nvme_ns_head *head =
    591		container_of(ref, struct nvme_ns_head, ref);
    592
    593	nvme_mpath_remove_disk(head);
    594	ida_free(&head->subsys->ns_ida, head->instance);
    595	cleanup_srcu_struct(&head->srcu);
    596	nvme_put_subsystem(head->subsys);
    597	kfree(head);
    598}
    599
    600bool nvme_tryget_ns_head(struct nvme_ns_head *head)
    601{
    602	return kref_get_unless_zero(&head->ref);
    603}
    604
    605void nvme_put_ns_head(struct nvme_ns_head *head)
    606{
    607	kref_put(&head->ref, nvme_free_ns_head);
    608}
    609
    610static void nvme_free_ns(struct kref *kref)
    611{
    612	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
    613
    614	put_disk(ns->disk);
    615	nvme_put_ns_head(ns->head);
    616	nvme_put_ctrl(ns->ctrl);
    617	kfree(ns);
    618}
    619
    620static inline bool nvme_get_ns(struct nvme_ns *ns)
    621{
    622	return kref_get_unless_zero(&ns->kref);
    623}
    624
    625void nvme_put_ns(struct nvme_ns *ns)
    626{
    627	kref_put(&ns->kref, nvme_free_ns);
    628}
    629EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
    630
    631static inline void nvme_clear_nvme_request(struct request *req)
    632{
    633	nvme_req(req)->status = 0;
    634	nvme_req(req)->retries = 0;
    635	nvme_req(req)->flags = 0;
    636	req->rq_flags |= RQF_DONTPREP;
    637}
    638
    639/* initialize a passthrough request */
    640void nvme_init_request(struct request *req, struct nvme_command *cmd)
    641{
    642	if (req->q->queuedata)
    643		req->timeout = NVME_IO_TIMEOUT;
    644	else /* no queuedata implies admin queue */
    645		req->timeout = NVME_ADMIN_TIMEOUT;
    646
    647	/* passthru commands should let the driver set the SGL flags */
    648	cmd->common.flags &= ~NVME_CMD_SGL_ALL;
    649
    650	req->cmd_flags |= REQ_FAILFAST_DRIVER;
    651	if (req->mq_hctx->type == HCTX_TYPE_POLL)
    652		req->cmd_flags |= REQ_POLLED;
    653	nvme_clear_nvme_request(req);
    654	memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd));
    655}
    656EXPORT_SYMBOL_GPL(nvme_init_request);
    657
    658/*
    659 * For a command we're not in a state to send to the device, the default action
    660 * is to busy it and retry it after the controller state is recovered.  However,
    661 * if the controller is deleting, or if anything is marked for failfast or
    662 * nvme multipath, it is immediately failed.
    663 *
    664 * Note: commands used to initialize the controller will be marked for failfast.
    665 * Note: nvme cli/ioctl commands are marked for failfast.
    666 */
    667blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
    668		struct request *rq)
    669{
    670	if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
    671	    ctrl->state != NVME_CTRL_DELETING &&
    672	    ctrl->state != NVME_CTRL_DEAD &&
    673	    !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
    674	    !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
    675		return BLK_STS_RESOURCE;
    676	return nvme_host_path_error(rq);
    677}
    678EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);
    679
    680bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
    681		bool queue_live)
    682{
    683	struct nvme_request *req = nvme_req(rq);
    684
    685	/*
    686	 * Currently we have a problem sending passthru commands
    687	 * on the admin_q if the controller is not LIVE, because we can't
    688	 * make sure that they are going out after the admin connect,
    689	 * controller enable and/or other commands in the initialization
    690	 * sequence. Until the controller is LIVE, fail with
    691	 * BLK_STS_RESOURCE so that they will be rescheduled.
    692	 */
    693	if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
    694		return false;
    695
    696	if (ctrl->ops->flags & NVME_F_FABRICS) {
    697		/*
    698		 * Only allow commands on a live queue, except for the connect
    699		 * command, which is required to set the queue live in the
    700		 * appropriate states.
    701		 */
    702		switch (ctrl->state) {
    703		case NVME_CTRL_CONNECTING:
    704			if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
    705			    req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
    706				return true;
    707			break;
    708		default:
    709			break;
    710		case NVME_CTRL_DEAD:
    711			return false;
    712		}
    713	}
    714
    715	return queue_live;
    716}
    717EXPORT_SYMBOL_GPL(__nvme_check_ready);
    718
    719static inline void nvme_setup_flush(struct nvme_ns *ns,
    720		struct nvme_command *cmnd)
    721{
    722	memset(cmnd, 0, sizeof(*cmnd));
    723	cmnd->common.opcode = nvme_cmd_flush;
    724	cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
    725}
    726
    727static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
    728		struct nvme_command *cmnd)
    729{
    730	unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
    731	struct nvme_dsm_range *range;
    732	struct bio *bio;
    733
    734	/*
    735	 * Some devices do not consider the DSM 'Number of Ranges' field when
    736	 * determining how much data to DMA. Always allocate memory for the maximum
    737	 * number of segments to prevent the device reading beyond the end of the buffer.
    738	 */
    739	static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;
    740
    741	range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
    742	if (!range) {
    743		/*
    744		 * If we fail to allocate our range, fall back to the controller
    745		 * discard page. If that's also busy, it's safe to return
    746		 * busy, as we know we can make progress once that's freed.
    747		 */
    748		if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
    749			return BLK_STS_RESOURCE;
    750
    751		range = page_address(ns->ctrl->discard_page);
    752	}
    753
    754	__rq_for_each_bio(bio, req) {
    755		u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
    756		u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
    757
    758		if (n < segments) {
    759			range[n].cattr = cpu_to_le32(0);
    760			range[n].nlb = cpu_to_le32(nlb);
    761			range[n].slba = cpu_to_le64(slba);
    762		}
    763		n++;
    764	}
    765
    766	if (WARN_ON_ONCE(n != segments)) {
    767		if (virt_to_page(range) == ns->ctrl->discard_page)
    768			clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
    769		else
    770			kfree(range);
    771		return BLK_STS_IOERR;
    772	}
    773
    774	memset(cmnd, 0, sizeof(*cmnd));
    775	cmnd->dsm.opcode = nvme_cmd_dsm;
    776	cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
    777	cmnd->dsm.nr = cpu_to_le32(segments - 1);
    778	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
    779
    780	req->special_vec.bv_page = virt_to_page(range);
    781	req->special_vec.bv_offset = offset_in_page(range);
    782	req->special_vec.bv_len = alloc_size;
    783	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
    784
    785	return BLK_STS_OK;
    786}
    787
    788static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
    789			      struct request *req)
    790{
    791	u32 upper, lower;
    792	u64 ref48;
    793
    794	/* both rw and write zeroes share the same reftag format */
    795	switch (ns->guard_type) {
    796	case NVME_NVM_NS_16B_GUARD:
    797		cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
    798		break;
    799	case NVME_NVM_NS_64B_GUARD:
    800		ref48 = ext_pi_ref_tag(req);
    801		lower = lower_32_bits(ref48);
    802		upper = upper_32_bits(ref48);
    803
    804		cmnd->rw.reftag = cpu_to_le32(lower);
    805		cmnd->rw.cdw3 = cpu_to_le32(upper);
    806		break;
    807	default:
    808		break;
    809	}
    810}
    811
    812static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
    813		struct request *req, struct nvme_command *cmnd)
    814{
    815	memset(cmnd, 0, sizeof(*cmnd));
    816
    817	if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
    818		return nvme_setup_discard(ns, req, cmnd);
    819
    820	cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
    821	cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
    822	cmnd->write_zeroes.slba =
    823		cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
    824	cmnd->write_zeroes.length =
    825		cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
    826
    827	if (nvme_ns_has_pi(ns)) {
    828		cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT);
    829
    830		switch (ns->pi_type) {
    831		case NVME_NS_DPS_PI_TYPE1:
    832		case NVME_NS_DPS_PI_TYPE2:
    833			nvme_set_ref_tag(ns, cmnd, req);
    834			break;
    835		}
    836	}
    837
    838	return BLK_STS_OK;
    839}
    840
    841static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
    842		struct request *req, struct nvme_command *cmnd,
    843		enum nvme_opcode op)
    844{
    845	u16 control = 0;
    846	u32 dsmgmt = 0;
    847
    848	if (req->cmd_flags & REQ_FUA)
    849		control |= NVME_RW_FUA;
    850	if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
    851		control |= NVME_RW_LR;
    852
    853	if (req->cmd_flags & REQ_RAHEAD)
    854		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
    855
    856	cmnd->rw.opcode = op;
    857	cmnd->rw.flags = 0;
    858	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
    859	cmnd->rw.cdw2 = 0;
    860	cmnd->rw.cdw3 = 0;
    861	cmnd->rw.metadata = 0;
    862	cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
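       	/* NLB is a 0's based count of logical blocks, hence the '- 1' below. */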
    863	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
    864	cmnd->rw.reftag = 0;
    865	cmnd->rw.apptag = 0;
    866	cmnd->rw.appmask = 0;
    867
    868	if (ns->ms) {
    869		/*
    870		 * If formatted with metadata, the block layer always provides a
    871		 * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled.  Else
    872		 * we enable the PRACT bit for protection information or set the
    873		 * namespace capacity to zero to prevent any I/O.
    874		 */
    875		if (!blk_integrity_rq(req)) {
    876			if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
    877				return BLK_STS_NOTSUPP;
    878			control |= NVME_RW_PRINFO_PRACT;
    879		}
    880
    881		switch (ns->pi_type) {
    882		case NVME_NS_DPS_PI_TYPE3:
    883			control |= NVME_RW_PRINFO_PRCHK_GUARD;
    884			break;
    885		case NVME_NS_DPS_PI_TYPE1:
    886		case NVME_NS_DPS_PI_TYPE2:
    887			control |= NVME_RW_PRINFO_PRCHK_GUARD |
    888					NVME_RW_PRINFO_PRCHK_REF;
    889			if (op == nvme_cmd_zone_append)
    890				control |= NVME_RW_APPEND_PIREMAP;
    891			nvme_set_ref_tag(ns, cmnd, req);
    892			break;
    893		}
    894	}
    895
    896	cmnd->rw.control = cpu_to_le16(control);
    897	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
    898	return 0;
    899}
    900
    901void nvme_cleanup_cmd(struct request *req)
    902{
    903	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
    904		struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
    905
    906		if (req->special_vec.bv_page == ctrl->discard_page)
    907			clear_bit_unlock(0, &ctrl->discard_page_busy);
    908		else
    909			kfree(bvec_virt(&req->special_vec));
    910	}
    911}
    912EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
    913
    914blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
    915{
    916	struct nvme_command *cmd = nvme_req(req)->cmd;
    917	blk_status_t ret = BLK_STS_OK;
    918
    919	if (!(req->rq_flags & RQF_DONTPREP))
    920		nvme_clear_nvme_request(req);
    921
    922	switch (req_op(req)) {
    923	case REQ_OP_DRV_IN:
    924	case REQ_OP_DRV_OUT:
    925		/* these are set up prior to execution in nvme_init_request() */
    926		break;
    927	case REQ_OP_FLUSH:
    928		nvme_setup_flush(ns, cmd);
    929		break;
    930	case REQ_OP_ZONE_RESET_ALL:
    931	case REQ_OP_ZONE_RESET:
    932		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
    933		break;
    934	case REQ_OP_ZONE_OPEN:
    935		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
    936		break;
    937	case REQ_OP_ZONE_CLOSE:
    938		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
    939		break;
    940	case REQ_OP_ZONE_FINISH:
    941		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
    942		break;
    943	case REQ_OP_WRITE_ZEROES:
    944		ret = nvme_setup_write_zeroes(ns, req, cmd);
    945		break;
    946	case REQ_OP_DISCARD:
    947		ret = nvme_setup_discard(ns, req, cmd);
    948		break;
    949	case REQ_OP_READ:
    950		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
    951		break;
    952	case REQ_OP_WRITE:
    953		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
    954		break;
    955	case REQ_OP_ZONE_APPEND:
    956		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
    957		break;
    958	default:
    959		WARN_ON_ONCE(1);
    960		return BLK_STS_IOERR;
    961	}
    962
    963	cmd->common.command_id = nvme_cid(req);
    964	trace_nvme_setup_cmd(req, cmd);
    965	return ret;
    966}
    967EXPORT_SYMBOL_GPL(nvme_setup_cmd);
    968
    969/*
    970 * Return values:
    971 * 0:  success
    972 * >0: nvme controller's cqe status response
    973 * <0: kernel error in lieu of controller response
    974 */
    975static int nvme_execute_rq(struct request *rq, bool at_head)
    976{
    977	blk_status_t status;
    978
    979	status = blk_execute_rq(rq, at_head);
    980	if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
    981		return -EINTR;
    982	if (nvme_req(rq)->status)
    983		return nvme_req(rq)->status;
    984	return blk_status_to_errno(status);
    985}
    986
    987/*
    988 * Returns 0 on success.  If the result is negative, it's a Linux error code;
    989 * if the result is positive, it's an NVM Express status code
    990 */
    991int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
    992		union nvme_result *result, void *buffer, unsigned bufflen,
    993		unsigned timeout, int qid, int at_head,
    994		blk_mq_req_flags_t flags)
    995{
    996	struct request *req;
    997	int ret;
    998
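       	/* NVME_QID_ANY lets blk-mq pick any hw context; otherwise map the 1-based
       	 * NVMe queue id onto the corresponding 0-based hw context index. */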
    999	if (qid == NVME_QID_ANY)
   1000		req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
   1001	else
   1002		req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
   1003						qid ? qid - 1 : 0);
   1004
   1005	if (IS_ERR(req))
   1006		return PTR_ERR(req);
   1007	nvme_init_request(req, cmd);
   1008
   1009	if (timeout)
   1010		req->timeout = timeout;
   1011
   1012	if (buffer && bufflen) {
   1013		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
   1014		if (ret)
   1015			goto out;
   1016	}
   1017
   1018	req->rq_flags |= RQF_QUIET;
   1019	ret = nvme_execute_rq(req, at_head);
   1020	if (result && ret >= 0)
   1021		*result = nvme_req(req)->result;
   1022 out:
   1023	blk_mq_free_request(req);
   1024	return ret;
   1025}
   1026EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
   1027
   1028int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
   1029		void *buffer, unsigned bufflen)
   1030{
   1031	return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
   1032			NVME_QID_ANY, 0, 0);
   1033}
   1034EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
   1035
   1036static u32 nvme_known_admin_effects(u8 opcode)
   1037{
   1038	switch (opcode) {
   1039	case nvme_admin_format_nvm:
   1040		return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC |
   1041			NVME_CMD_EFFECTS_CSE_MASK;
   1042	case nvme_admin_sanitize_nvm:
   1043		return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK;
   1044	default:
   1045		break;
   1046	}
   1047	return 0;
   1048}
   1049
   1050u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
   1051{
   1052	u32 effects = 0;
   1053
   1054	if (ns) {
   1055		if (ns->head->effects)
   1056			effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
   1057		if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
   1058			dev_warn_once(ctrl->device,
   1059				"IO command:%02x has unhandled effects:%08x\n",
   1060				opcode, effects);
   1061		return 0;
   1062	}
   1063
   1064	if (ctrl->effects)
   1065		effects = le32_to_cpu(ctrl->effects->acs[opcode]);
   1066	effects |= nvme_known_admin_effects(opcode);
   1067
   1068	return effects;
   1069}
   1070EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
   1071
   1072static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
   1073			       u8 opcode)
   1074{
   1075	u32 effects = nvme_command_effects(ctrl, ns, opcode);
   1076
   1077	/*
   1078	 * For simplicity, IO to all namespaces is quiesced even if the command
   1079	 * effects say only one namespace is affected.
   1080	 */
   1081	if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
   1082		mutex_lock(&ctrl->scan_lock);
   1083		mutex_lock(&ctrl->subsys->lock);
   1084		nvme_mpath_start_freeze(ctrl->subsys);
   1085		nvme_mpath_wait_freeze(ctrl->subsys);
   1086		nvme_start_freeze(ctrl);
   1087		nvme_wait_freeze(ctrl);
   1088	}
   1089	return effects;
   1090}
   1091
   1092static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
   1093			      struct nvme_command *cmd, int status)
   1094{
   1095	if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
   1096		nvme_unfreeze(ctrl);
   1097		nvme_mpath_unfreeze(ctrl->subsys);
   1098		mutex_unlock(&ctrl->subsys->lock);
   1099		nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
   1100		mutex_unlock(&ctrl->scan_lock);
   1101	}
   1102	if (effects & NVME_CMD_EFFECTS_CCC)
   1103		nvme_init_ctrl_finish(ctrl);
   1104	if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
   1105		nvme_queue_scan(ctrl);
   1106		flush_work(&ctrl->scan_work);
   1107	}
   1108
   1109	switch (cmd->common.opcode) {
   1110	case nvme_admin_set_features:
   1111		switch (le32_to_cpu(cmd->common.cdw10) & 0xFF) {
   1112		case NVME_FEAT_KATO:
   1113			/*
   1114			 * The keep-alive command interval on the host should be
   1115			 * updated when KATO is modified by a Set Features
   1116			 * command.
   1117			 */
   1118			if (!status)
   1119				nvme_update_keep_alive(ctrl, cmd);
   1120			break;
   1121		default:
   1122			break;
   1123		}
   1124		break;
   1125	default:
   1126		break;
   1127	}
   1128}
   1129
   1130int nvme_execute_passthru_rq(struct request *rq)
   1131{
   1132	struct nvme_command *cmd = nvme_req(rq)->cmd;
   1133	struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
   1134	struct nvme_ns *ns = rq->q->queuedata;
   1135	u32 effects;
   1136	int  ret;
   1137
   1138	effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
   1139	ret = nvme_execute_rq(rq, false);
   1140	if (effects) /* nothing to be done for zero cmd effects */
   1141		nvme_passthru_end(ctrl, effects, cmd, ret);
   1142
   1143	return ret;
   1144}
   1145EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
   1146
   1147/*
   1148 * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1:
   1149 * 
   1150 *   The host should send Keep Alive commands at half of the Keep Alive Timeout
   1151 *   accounting for transport roundtrip times [..].
   1152 */
   1153static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
   1154{
   1155	queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ / 2);
   1156}
   1157
   1158static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
   1159{
   1160	struct nvme_ctrl *ctrl = rq->end_io_data;
   1161	unsigned long flags;
   1162	bool startka = false;
   1163
   1164	blk_mq_free_request(rq);
   1165
   1166	if (status) {
   1167		dev_err(ctrl->device,
   1168			"failed nvme_keep_alive_end_io error=%d\n",
   1169				status);
   1170		return;
   1171	}
   1172
   1173	ctrl->comp_seen = false;
   1174	spin_lock_irqsave(&ctrl->lock, flags);
   1175	if (ctrl->state == NVME_CTRL_LIVE ||
   1176	    ctrl->state == NVME_CTRL_CONNECTING)
   1177		startka = true;
   1178	spin_unlock_irqrestore(&ctrl->lock, flags);
   1179	if (startka)
   1180		nvme_queue_keep_alive_work(ctrl);
   1181}
   1182
   1183static void nvme_keep_alive_work(struct work_struct *work)
   1184{
   1185	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
   1186			struct nvme_ctrl, ka_work);
   1187	bool comp_seen = ctrl->comp_seen;
   1188	struct request *rq;
   1189
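       	/* With Traffic Based Keep Alive (TBKAS), any command completion seen in the
       	 * last interval counts as keep-alive traffic, so just rearm the timer. */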
   1190	if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
   1191		dev_dbg(ctrl->device,
   1192			"reschedule traffic based keep-alive timer\n");
   1193		ctrl->comp_seen = false;
   1194		nvme_queue_keep_alive_work(ctrl);
   1195		return;
   1196	}
   1197
   1198	rq = blk_mq_alloc_request(ctrl->admin_q, nvme_req_op(&ctrl->ka_cmd),
   1199				  BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
   1200	if (IS_ERR(rq)) {
   1201		/* allocation failure, reset the controller */
   1202		dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq));
   1203		nvme_reset_ctrl(ctrl);
   1204		return;
   1205	}
   1206	nvme_init_request(rq, &ctrl->ka_cmd);
   1207
   1208	rq->timeout = ctrl->kato * HZ;
   1209	rq->end_io = nvme_keep_alive_end_io;
   1210	rq->end_io_data = ctrl;
   1211	rq->rq_flags |= RQF_QUIET;
   1212	blk_execute_rq_nowait(rq, false);
   1213}
   1214
   1215static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
   1216{
   1217	if (unlikely(ctrl->kato == 0))
   1218		return;
   1219
   1220	nvme_queue_keep_alive_work(ctrl);
   1221}
   1222
   1223void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
   1224{
   1225	if (unlikely(ctrl->kato == 0))
   1226		return;
   1227
   1228	cancel_delayed_work_sync(&ctrl->ka_work);
   1229}
   1230EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
   1231
   1232static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
   1233				   struct nvme_command *cmd)
   1234{
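       	/* The KATO value in the Set Features command is given in milliseconds, while
       	 * ctrl->kato is kept in seconds, hence the round-up division by 1000. */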
   1235	unsigned int new_kato =
   1236		DIV_ROUND_UP(le32_to_cpu(cmd->common.cdw11), 1000);
   1237
   1238	dev_info(ctrl->device,
   1239		 "keep alive interval updated from %u ms to %u ms\n",
   1240		 ctrl->kato * 1000 / 2, new_kato * 1000 / 2);
   1241
   1242	nvme_stop_keep_alive(ctrl);
   1243	ctrl->kato = new_kato;
   1244	nvme_start_keep_alive(ctrl);
   1245}
   1246
   1247/*
   1248 * In NVMe 1.0 the CNS field was just a binary controller or namespace
   1249 * flag, so sending any newer CNS value has a good chance of not working.
   1250 * QEMU unfortunately had that bug after reporting 1.1 version compliance
   1251 * (but not for any later version).
   1252 */
   1253static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
   1254{
   1255	if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
   1256		return ctrl->vs < NVME_VS(1, 2, 0);
   1257	return ctrl->vs < NVME_VS(1, 1, 0);
   1258}
   1259
   1260static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
   1261{
   1262	struct nvme_command c = { };
   1263	int error;
   1264
   1265	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
   1266	c.identify.opcode = nvme_admin_identify;
   1267	c.identify.cns = NVME_ID_CNS_CTRL;
   1268
   1269	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
   1270	if (!*id)
   1271		return -ENOMEM;
   1272
   1273	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
   1274			sizeof(struct nvme_id_ctrl));
   1275	if (error)
   1276		kfree(*id);
   1277	return error;
   1278}
   1279
   1280static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
   1281		struct nvme_ns_id_desc *cur, bool *csi_seen)
   1282{
   1283	const char *warn_str = "ctrl returned bogus length:";
   1284	void *data = cur;
   1285
   1286	switch (cur->nidt) {
   1287	case NVME_NIDT_EUI64:
   1288		if (cur->nidl != NVME_NIDT_EUI64_LEN) {
   1289			dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
   1290				 warn_str, cur->nidl);
   1291			return -1;
   1292		}
   1293		if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
   1294			return NVME_NIDT_EUI64_LEN;
   1295		memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
   1296		return NVME_NIDT_EUI64_LEN;
   1297	case NVME_NIDT_NGUID:
   1298		if (cur->nidl != NVME_NIDT_NGUID_LEN) {
   1299			dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
   1300				 warn_str, cur->nidl);
   1301			return -1;
   1302		}
   1303		if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
   1304			return NVME_NIDT_NGUID_LEN;
   1305		memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
   1306		return NVME_NIDT_NGUID_LEN;
   1307	case NVME_NIDT_UUID:
   1308		if (cur->nidl != NVME_NIDT_UUID_LEN) {
   1309			dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
   1310				 warn_str, cur->nidl);
   1311			return -1;
   1312		}
   1313		if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
   1314			return NVME_NIDT_UUID_LEN;
   1315		uuid_copy(&ids->uuid, data + sizeof(*cur));
   1316		return NVME_NIDT_UUID_LEN;
   1317	case NVME_NIDT_CSI:
   1318		if (cur->nidl != NVME_NIDT_CSI_LEN) {
   1319			dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
   1320				 warn_str, cur->nidl);
   1321			return -1;
   1322		}
   1323		memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
   1324		*csi_seen = true;
   1325		return NVME_NIDT_CSI_LEN;
   1326	default:
   1327		/* Skip unknown types */
   1328		return cur->nidl;
   1329	}
   1330}
   1331
   1332static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
   1333		struct nvme_ns_ids *ids)
   1334{
   1335	struct nvme_command c = { };
   1336	bool csi_seen = false;
   1337	int status, pos, len;
   1338	void *data;
   1339
   1340	if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
   1341		return 0;
   1342	if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
   1343		return 0;
   1344
   1345	c.identify.opcode = nvme_admin_identify;
   1346	c.identify.nsid = cpu_to_le32(nsid);
   1347	c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
   1348
   1349	data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
   1350	if (!data)
   1351		return -ENOMEM;
   1352
   1353	status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
   1354				      NVME_IDENTIFY_DATA_SIZE);
   1355	if (status) {
   1356		dev_warn(ctrl->device,
   1357			"Identify Descriptors failed (nsid=%u, status=0x%x)\n",
   1358			nsid, status);
   1359		goto free_data;
   1360	}
   1361
   1362	for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
   1363		struct nvme_ns_id_desc *cur = data + pos;
   1364
   1365		if (cur->nidl == 0)
   1366			break;
   1367
   1368		len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
   1369		if (len < 0)
   1370			break;
   1371
   1372		len += sizeof(*cur);
   1373	}
   1374
   1375	if (nvme_multi_css(ctrl) && !csi_seen) {
   1376		dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
   1377			 nsid);
   1378		status = -EINVAL;
   1379	}
   1380
   1381free_data:
   1382	kfree(data);
   1383	return status;
   1384}
   1385
   1386static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
   1387			struct nvme_ns_ids *ids, struct nvme_id_ns **id)
   1388{
   1389	struct nvme_command c = { };
   1390	int error;
   1391
   1392	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
   1393	c.identify.opcode = nvme_admin_identify;
   1394	c.identify.nsid = cpu_to_le32(nsid);
   1395	c.identify.cns = NVME_ID_CNS_NS;
   1396
   1397	*id = kmalloc(sizeof(**id), GFP_KERNEL);
   1398	if (!*id)
   1399		return -ENOMEM;
   1400
   1401	error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
   1402	if (error) {
   1403		dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
   1404		goto out_free_id;
   1405	}
   1406
   1407	error = NVME_SC_INVALID_NS | NVME_SC_DNR;
   1408	if ((*id)->ncap == 0) /* namespace not allocated or attached */
   1409		goto out_free_id;
   1410
   1411
   1412	if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
   1413		dev_info(ctrl->device,
   1414			 "Ignoring bogus Namespace Identifiers\n");
   1415	} else {
   1416		if (ctrl->vs >= NVME_VS(1, 1, 0) &&
   1417		    !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
   1418			memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64));
   1419		if (ctrl->vs >= NVME_VS(1, 2, 0) &&
   1420		    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
   1421			memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid));
   1422	}
   1423
   1424	return 0;
   1425
   1426out_free_id:
   1427	kfree(*id);
   1428	return error;
   1429}
   1430
   1431static int nvme_identify_ns_cs_indep(struct nvme_ctrl *ctrl, unsigned nsid,
   1432			struct nvme_id_ns_cs_indep **id)
   1433{
   1434	struct nvme_command c = {
   1435		.identify.opcode	= nvme_admin_identify,
   1436		.identify.nsid		= cpu_to_le32(nsid),
   1437		.identify.cns		= NVME_ID_CNS_NS_CS_INDEP,
   1438	};
   1439	int ret;
   1440
   1441	*id = kmalloc(sizeof(**id), GFP_KERNEL);
   1442	if (!*id)
   1443		return -ENOMEM;
   1444
   1445	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
   1446	if (ret) {
   1447		dev_warn(ctrl->device,
   1448			 "Identify namespace (CS independent) failed (%d)\n",
   1449			 ret);
   1450		kfree(*id);
   1451		return ret;
   1452	}
   1453
   1454	return 0;
   1455}
   1456
   1457static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
   1458		unsigned int dword11, void *buffer, size_t buflen, u32 *result)
   1459{
   1460	union nvme_result res = { 0 };
   1461	struct nvme_command c = { };
   1462	int ret;
   1463
   1464	c.features.opcode = op;
   1465	c.features.fid = cpu_to_le32(fid);
   1466	c.features.dword11 = cpu_to_le32(dword11);
   1467
   1468	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
   1469			buffer, buflen, 0, NVME_QID_ANY, 0, 0);
   1470	if (ret >= 0 && result)
   1471		*result = le32_to_cpu(res.u32);
   1472	return ret;
   1473}
   1474
   1475int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
   1476		      unsigned int dword11, void *buffer, size_t buflen,
   1477		      u32 *result)
   1478{
   1479	return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
   1480			     buflen, result);
   1481}
   1482EXPORT_SYMBOL_GPL(nvme_set_features);
   1483
   1484int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
   1485		      unsigned int dword11, void *buffer, size_t buflen,
   1486		      u32 *result)
   1487{
   1488	return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
   1489			     buflen, result);
   1490}
   1491EXPORT_SYMBOL_GPL(nvme_get_features);
   1492
   1493int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
   1494{
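       	/* The Number of Queues feature uses 0's based counts: requested submission
       	 * queues in the low 16 bits and completion queues in the high 16 bits. */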
   1495	u32 q_count = (*count - 1) | ((*count - 1) << 16);
   1496	u32 result;
   1497	int status, nr_io_queues;
   1498
   1499	status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
   1500			&result);
   1501	if (status < 0)
   1502		return status;
   1503
   1504	/*
   1505	 * Degraded controllers might return an error when setting the queue
   1506	 * count.  We still want to be able to bring them online and offer
   1507	 * access to the admin queue, as that might be the only way to fix them up.
   1508	 */
   1509	if (status > 0) {
   1510		dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
   1511		*count = 0;
   1512	} else {
   1513		nr_io_queues = min(result & 0xffff, result >> 16) + 1;
   1514		*count = min(*count, nr_io_queues);
   1515	}
   1516
   1517	return 0;
   1518}
   1519EXPORT_SYMBOL_GPL(nvme_set_queue_count);
   1520
   1521#define NVME_AEN_SUPPORTED \
   1522	(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
   1523	 NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
   1524
   1525static void nvme_enable_aen(struct nvme_ctrl *ctrl)
   1526{
   1527	u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
   1528	int status;
   1529
   1530	if (!supported_aens)
   1531		return;
   1532
   1533	status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
   1534			NULL, 0, &result);
   1535	if (status)
   1536		dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
   1537			 supported_aens);
   1538
   1539	queue_work(nvme_wq, &ctrl->async_event_work);
   1540}
   1541
   1542static int nvme_ns_open(struct nvme_ns *ns)
   1543{
   1544
   1545	/* should never be called due to GENHD_FL_HIDDEN */
   1546	if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head)))
   1547		goto fail;
   1548	if (!nvme_get_ns(ns))
   1549		goto fail;
   1550	if (!try_module_get(ns->ctrl->ops->module))
   1551		goto fail_put_ns;
   1552
   1553	return 0;
   1554
   1555fail_put_ns:
   1556	nvme_put_ns(ns);
   1557fail:
   1558	return -ENXIO;
   1559}
   1560
   1561static void nvme_ns_release(struct nvme_ns *ns)
   1562{
   1563
   1564	module_put(ns->ctrl->ops->module);
   1565	nvme_put_ns(ns);
   1566}
   1567
   1568static int nvme_open(struct block_device *bdev, fmode_t mode)
   1569{
   1570	return nvme_ns_open(bdev->bd_disk->private_data);
   1571}
   1572
   1573static void nvme_release(struct gendisk *disk, fmode_t mode)
   1574{
   1575	nvme_ns_release(disk->private_data);
   1576}
   1577
   1578int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
   1579{
   1580	/* some standard values */
   1581	geo->heads = 1 << 6;
   1582	geo->sectors = 1 << 5;
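       	/* 64 heads * 32 sectors = 2048 (1 << 11) sectors per cylinder. */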
   1583	geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
   1584	return 0;
   1585}
   1586
   1587#ifdef CONFIG_BLK_DEV_INTEGRITY
   1588static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
   1589				u32 max_integrity_segments)
   1590{
   1591	struct blk_integrity integrity = { };
   1592
   1593	switch (ns->pi_type) {
   1594	case NVME_NS_DPS_PI_TYPE3:
   1595		switch (ns->guard_type) {
   1596		case NVME_NVM_NS_16B_GUARD:
   1597			integrity.profile = &t10_pi_type3_crc;
   1598			integrity.tag_size = sizeof(u16) + sizeof(u32);
   1599			integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
   1600			break;
   1601		case NVME_NVM_NS_64B_GUARD:
   1602			integrity.profile = &ext_pi_type3_crc64;
   1603			integrity.tag_size = sizeof(u16) + 6;
   1604			integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
   1605			break;
   1606		default:
   1607			integrity.profile = NULL;
   1608			break;
   1609		}
   1610		break;
   1611	case NVME_NS_DPS_PI_TYPE1:
   1612	case NVME_NS_DPS_PI_TYPE2:
   1613		switch (ns->guard_type) {
   1614		case NVME_NVM_NS_16B_GUARD:
   1615			integrity.profile = &t10_pi_type1_crc;
   1616			integrity.tag_size = sizeof(u16);
   1617			integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
   1618			break;
   1619		case NVME_NVM_NS_64B_GUARD:
   1620			integrity.profile = &ext_pi_type1_crc64;
   1621			integrity.tag_size = sizeof(u16);
   1622			integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
   1623			break;
   1624		default:
   1625			integrity.profile = NULL;
   1626			break;
   1627		}
   1628		break;
   1629	default:
   1630		integrity.profile = NULL;
   1631		break;
   1632	}
   1633
   1634	integrity.tuple_size = ns->ms;
   1635	blk_integrity_register(disk, &integrity);
   1636	blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
   1637}
   1638#else
   1639static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
   1640				u32 max_integrity_segments)
   1641{
   1642}
   1643#endif /* CONFIG_BLK_DEV_INTEGRITY */
   1644
   1645static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
   1646{
   1647	struct nvme_ctrl *ctrl = ns->ctrl;
   1648	struct request_queue *queue = disk->queue;
   1649	u32 size = queue_logical_block_size(queue);
   1650
   1651	if (ctrl->max_discard_sectors == 0) {
   1652		blk_queue_max_discard_sectors(queue, 0);
   1653		return;
   1654	}
   1655
   1656	BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
   1657			NVME_DSM_MAX_RANGES);
   1658
   1659	queue->limits.discard_granularity = size;
   1660
   1661	/* If discard is already enabled, don't reset queue limits */
   1662	if (queue->limits.max_discard_sectors)
   1663		return;
   1664
   1665	if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns, UINT_MAX))
   1666		ctrl->max_discard_sectors = nvme_lba_to_sect(ns, ctrl->dmrsl);
   1667
   1668	blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
   1669	blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
   1670
   1671	if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
   1672		blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
   1673}
   1674
   1675static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
   1676{
   1677	return uuid_equal(&a->uuid, &b->uuid) &&
   1678		memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
   1679		memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
   1680		a->csi == b->csi;
   1681}
   1682
   1683static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
   1684{
   1685	bool first = id->dps & NVME_NS_DPS_PI_FIRST;
   1686	unsigned lbaf = nvme_lbaf_index(id->flbas);
   1687	struct nvme_ctrl *ctrl = ns->ctrl;
   1688	struct nvme_command c = { };
   1689	struct nvme_id_ns_nvm *nvm;
   1690	int ret = 0;
   1691	u32 elbaf;
   1692
   1693	ns->pi_size = 0;
   1694	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
   1695	if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
   1696		ns->pi_size = sizeof(struct t10_pi_tuple);
   1697		ns->guard_type = NVME_NVM_NS_16B_GUARD;
   1698		goto set_pi;
   1699	}
   1700
   1701	nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
   1702	if (!nvm)
   1703		return -ENOMEM;
   1704
   1705	c.identify.opcode = nvme_admin_identify;
   1706	c.identify.nsid = cpu_to_le32(ns->head->ns_id);
   1707	c.identify.cns = NVME_ID_CNS_CS_NS;
   1708	c.identify.csi = NVME_CSI_NVM;
   1709
   1710	ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, nvm, sizeof(*nvm));
   1711	if (ret)
   1712		goto free_data;
   1713
   1714	elbaf = le32_to_cpu(nvm->elbaf[lbaf]);
   1715
   1716	/* no support for storage tag formats right now */
   1717	if (nvme_elbaf_sts(elbaf))
   1718		goto free_data;
   1719
   1720	ns->guard_type = nvme_elbaf_guard_type(elbaf);
   1721	switch (ns->guard_type) {
   1722	case NVME_NVM_NS_64B_GUARD:
   1723		ns->pi_size = sizeof(struct crc64_pi_tuple);
   1724		break;
   1725	case NVME_NVM_NS_16B_GUARD:
   1726		ns->pi_size = sizeof(struct t10_pi_tuple);
   1727		break;
   1728	default:
   1729		break;
   1730	}
   1731
   1732free_data:
   1733	kfree(nvm);
   1734set_pi:
   1735	if (ns->pi_size && (first || ns->ms == ns->pi_size))
   1736		ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
   1737	else
   1738		ns->pi_type = 0;
   1739
   1740	return ret;
   1741}
   1742
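/*
 * Decide how namespace metadata is exposed: as extended LBAs, as a separate
 * metadata buffer handled by the block layer, or not at all, depending on
 * the transport type and the namespace format.
 */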
   1743static void nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
   1744{
   1745	struct nvme_ctrl *ctrl = ns->ctrl;
   1746
   1747	if (nvme_init_ms(ns, id))
   1748		return;
   1749
   1750	ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
   1751	if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
   1752		return;
   1753
   1754	if (ctrl->ops->flags & NVME_F_FABRICS) {
   1755		/*
   1756		 * The NVMe over Fabrics specification only supports metadata as
   1757		 * part of the extended data LBA.  We rely on HCA/HBA support to
   1758		 * remap the separate metadata buffer from the block layer.
   1759		 */
   1760		if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
   1761			return;
   1762
   1763		ns->features |= NVME_NS_EXT_LBAS;
   1764
   1765		/*
   1766		 * The current fabrics transport drivers support namespace
   1767		 * metadata formats only if nvme_ns_has_pi() returns true.
   1768		 * Suppress support for all other formats so the namespace will
   1769		 * have a 0 capacity and not be usable through the block stack.
   1770		 *
   1771		 * Note, this check will need to be modified if any drivers
   1772		 * gain the ability to use other metadata formats.
   1773		 */
   1774		if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns))
   1775			ns->features |= NVME_NS_METADATA_SUPPORTED;
   1776	} else {
   1777		/*
   1778		 * For PCIe controllers, we can't easily remap the separate
   1779		 * metadata buffer from the block layer and thus require a
   1780		 * separate metadata buffer for block layer metadata/PI support.
   1781		 * We allow extended LBAs for the passthrough interface, though.
   1782		 */
   1783		if (id->flbas & NVME_NS_FLBAS_META_EXT)
   1784			ns->features |= NVME_NS_EXT_LBAS;
   1785		else
   1786			ns->features |= NVME_NS_METADATA_SUPPORTED;
   1787	}
   1788}
   1789
   1790static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
   1791		struct request_queue *q)
   1792{
   1793	bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
   1794
   1795	if (ctrl->max_hw_sectors) {
   1796		u32 max_segments =
   1797			(ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
   1798
   1799		max_segments = min_not_zero(max_segments, ctrl->max_segments);
   1800		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
   1801		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
   1802	}
   1803	blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
   1804	blk_queue_dma_alignment(q, 3);
   1805	blk_queue_write_cache(q, vwc, vwc);
   1806}
   1807
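/*
 * Propagate the namespace's logical block size, atomic write, optimal I/O
 * and integrity parameters to the gendisk and update its capacity.
 */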
   1808static void nvme_update_disk_info(struct gendisk *disk,
   1809		struct nvme_ns *ns, struct nvme_id_ns *id)
   1810{
   1811	sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
   1812	unsigned short bs = 1 << ns->lba_shift;
   1813	u32 atomic_bs, phys_bs, io_opt = 0;
   1814
   1815	/*
   1816	 * The block layer can't support LBA sizes larger than the page size
   1817	 * yet, so catch this early and don't allow block I/O.
   1818	 */
   1819	if (ns->lba_shift > PAGE_SHIFT) {
   1820		capacity = 0;
   1821		bs = (1 << 9);
   1822	}
   1823
   1824	blk_integrity_unregister(disk);
   1825
   1826	atomic_bs = phys_bs = bs;
   1827	if (id->nabo == 0) {
   1828		/*
    1829		 * NSFEAT bit 1 indicates whether NAWUPF is defined for this namespace
   1830		 * and whether it should be used instead of AWUPF. If NAWUPF ==
   1831		 * 0 then AWUPF must be used instead.
   1832		 */
   1833		if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
   1834			atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
   1835		else
   1836			atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
   1837	}
   1838
   1839	if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
   1840		/* NPWG = Namespace Preferred Write Granularity */
   1841		phys_bs = bs * (1 + le16_to_cpu(id->npwg));
   1842		/* NOWS = Namespace Optimal Write Size */
   1843		io_opt = bs * (1 + le16_to_cpu(id->nows));
   1844	}
   1845
   1846	blk_queue_logical_block_size(disk->queue, bs);
   1847	/*
   1848	 * Linux filesystems assume writing a single physical block is
   1849	 * an atomic operation. Hence limit the physical block size to the
   1850	 * value of the Atomic Write Unit Power Fail parameter.
   1851	 */
   1852	blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
   1853	blk_queue_io_min(disk->queue, phys_bs);
   1854	blk_queue_io_opt(disk->queue, io_opt);
   1855
   1856	/*
    1857	 * Register a metadata profile for PI, or for plain non-integrity NVMe
    1858	 * metadata masquerading as Type 0 if supported.  Otherwise reject block
    1859	 * I/O to namespaces with metadata, except when the namespace supports
    1860	 * PI, since the controller can strip/insert it in that case.
   1861	 */
   1862	if (ns->ms) {
   1863		if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
   1864		    (ns->features & NVME_NS_METADATA_SUPPORTED))
   1865			nvme_init_integrity(disk, ns,
   1866					    ns->ctrl->max_integrity_segments);
   1867		else if (!nvme_ns_has_pi(ns))
   1868			capacity = 0;
   1869	}
   1870
   1871	set_capacity_and_notify(disk, capacity);
   1872
   1873	nvme_config_discard(disk, ns);
   1874	blk_queue_max_write_zeroes_sectors(disk->queue,
   1875					   ns->ctrl->max_zeroes_sectors);
   1876}
   1877
   1878static inline bool nvme_first_scan(struct gendisk *disk)
   1879{
   1880	/* nvme_alloc_ns() scans the disk prior to adding it */
   1881	return !disk_live(disk);
   1882}
   1883
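/*
 * Expose the namespace's optimal I/O boundary (NOIOB), or the quirked
 * stripe size, as the block layer chunk size so that I/O is split at that
 * boundary.
 */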
   1884static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
   1885{
   1886	struct nvme_ctrl *ctrl = ns->ctrl;
   1887	u32 iob;
   1888
   1889	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
   1890	    is_power_of_2(ctrl->max_hw_sectors))
   1891		iob = ctrl->max_hw_sectors;
   1892	else
   1893		iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
   1894
   1895	if (!iob)
   1896		return;
   1897
   1898	if (!is_power_of_2(iob)) {
   1899		if (nvme_first_scan(ns->disk))
   1900			pr_warn("%s: ignoring unaligned IO boundary:%u\n",
   1901				ns->disk->disk_name, iob);
   1902		return;
   1903	}
   1904
   1905	if (blk_queue_is_zoned(ns->disk->queue)) {
   1906		if (nvme_first_scan(ns->disk))
   1907			pr_warn("%s: ignoring zoned namespace IO boundary\n",
   1908				ns->disk->disk_name);
   1909		return;
   1910	}
   1911
   1912	blk_queue_chunk_sectors(ns->queue, iob);
   1913}
   1914
   1915static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
   1916{
   1917	unsigned lbaf = nvme_lbaf_index(id->flbas);
   1918	int ret;
   1919
   1920	blk_mq_freeze_queue(ns->disk->queue);
   1921	ns->lba_shift = id->lbaf[lbaf].ds;
   1922	nvme_set_queue_limits(ns->ctrl, ns->queue);
   1923
   1924	nvme_configure_metadata(ns, id);
   1925	nvme_set_chunk_sectors(ns, id);
   1926	nvme_update_disk_info(ns->disk, ns, id);
   1927
   1928	if (ns->head->ids.csi == NVME_CSI_ZNS) {
   1929		ret = nvme_update_zone_info(ns, lbaf);
   1930		if (ret)
   1931			goto out_unfreeze;
   1932	}
   1933
   1934	set_disk_ro(ns->disk, (id->nsattr & NVME_NS_ATTR_RO) ||
   1935		test_bit(NVME_NS_FORCE_RO, &ns->flags));
   1936	set_bit(NVME_NS_READY, &ns->flags);
   1937	blk_mq_unfreeze_queue(ns->disk->queue);
   1938
   1939	if (blk_queue_is_zoned(ns->queue)) {
   1940		ret = nvme_revalidate_zones(ns);
   1941		if (ret && !nvme_first_scan(ns->disk))
   1942			return ret;
   1943	}
   1944
   1945	if (nvme_ns_head_multipath(ns->head)) {
   1946		blk_mq_freeze_queue(ns->head->disk->queue);
   1947		nvme_update_disk_info(ns->head->disk, ns, id);
   1948		set_disk_ro(ns->head->disk,
   1949			    (id->nsattr & NVME_NS_ATTR_RO) ||
   1950				    test_bit(NVME_NS_FORCE_RO, &ns->flags));
   1951		nvme_mpath_revalidate_paths(ns);
   1952		blk_stack_limits(&ns->head->disk->queue->limits,
   1953				 &ns->queue->limits, 0);
   1954		disk_update_readahead(ns->head->disk);
   1955		blk_mq_unfreeze_queue(ns->head->disk->queue);
   1956	}
   1957	return 0;
   1958
   1959out_unfreeze:
   1960	/*
    1961	 * If probing fails due to an unsupported feature, hide the block device,
   1962	 * but still allow other access.
   1963	 */
   1964	if (ret == -ENODEV) {
   1965		ns->disk->flags |= GENHD_FL_HIDDEN;
   1966		set_bit(NVME_NS_READY, &ns->flags);
   1967		ret = 0;
   1968	}
   1969	blk_mq_unfreeze_queue(ns->disk->queue);
   1970	return ret;
   1971}
   1972
   1973static char nvme_pr_type(enum pr_type type)
   1974{
   1975	switch (type) {
   1976	case PR_WRITE_EXCLUSIVE:
   1977		return 1;
   1978	case PR_EXCLUSIVE_ACCESS:
   1979		return 2;
   1980	case PR_WRITE_EXCLUSIVE_REG_ONLY:
   1981		return 3;
   1982	case PR_EXCLUSIVE_ACCESS_REG_ONLY:
   1983		return 4;
   1984	case PR_WRITE_EXCLUSIVE_ALL_REGS:
   1985		return 5;
   1986	case PR_EXCLUSIVE_ACCESS_ALL_REGS:
   1987		return 6;
   1988	default:
   1989		return 0;
   1990	}
   1991}
   1992
   1993static int nvme_send_ns_head_pr_command(struct block_device *bdev,
   1994		struct nvme_command *c, u8 data[16])
   1995{
   1996	struct nvme_ns_head *head = bdev->bd_disk->private_data;
   1997	int srcu_idx = srcu_read_lock(&head->srcu);
   1998	struct nvme_ns *ns = nvme_find_path(head);
   1999	int ret = -EWOULDBLOCK;
   2000
   2001	if (ns) {
   2002		c->common.nsid = cpu_to_le32(ns->head->ns_id);
   2003		ret = nvme_submit_sync_cmd(ns->queue, c, data, 16);
   2004	}
   2005	srcu_read_unlock(&head->srcu, srcu_idx);
   2006	return ret;
   2007}
    2008
   2009static int nvme_send_ns_pr_command(struct nvme_ns *ns, struct nvme_command *c,
   2010		u8 data[16])
   2011{
   2012	c->common.nsid = cpu_to_le32(ns->head->ns_id);
   2013	return nvme_submit_sync_cmd(ns->queue, c, data, 16);
   2014}
   2015
   2016static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
   2017				u64 key, u64 sa_key, u8 op)
   2018{
   2019	struct nvme_command c = { };
   2020	u8 data[16] = { 0, };
   2021
   2022	put_unaligned_le64(key, &data[0]);
   2023	put_unaligned_le64(sa_key, &data[8]);
   2024
   2025	c.common.opcode = op;
   2026	c.common.cdw10 = cpu_to_le32(cdw10);
   2027
   2028	if (IS_ENABLED(CONFIG_NVME_MULTIPATH) &&
   2029	    bdev->bd_disk->fops == &nvme_ns_head_ops)
   2030		return nvme_send_ns_head_pr_command(bdev, &c, data);
   2031	return nvme_send_ns_pr_command(bdev->bd_disk->private_data, &c, data);
   2032}
   2033
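/*
 * For the Reservation Register command, cdw10 bits 02:00 select the action
 * (000b register, 010b replace), bit 03 is IEKEY and bits 31:30 set CPTPL
 * to 11b so that registrations persist through power loss.
 */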
   2034static int nvme_pr_register(struct block_device *bdev, u64 old,
   2035		u64 new, unsigned flags)
   2036{
   2037	u32 cdw10;
   2038
   2039	if (flags & ~PR_FL_IGNORE_KEY)
   2040		return -EOPNOTSUPP;
   2041
   2042	cdw10 = old ? 2 : 0;
   2043	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
   2044	cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
   2045	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
   2046}
   2047
   2048static int nvme_pr_reserve(struct block_device *bdev, u64 key,
   2049		enum pr_type type, unsigned flags)
   2050{
   2051	u32 cdw10;
   2052
   2053	if (flags & ~PR_FL_IGNORE_KEY)
   2054		return -EOPNOTSUPP;
   2055
   2056	cdw10 = nvme_pr_type(type) << 8;
   2057	cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
   2058	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
   2059}
   2060
   2061static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
   2062		enum pr_type type, bool abort)
   2063{
   2064	u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
   2065
   2066	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
   2067}
   2068
   2069static int nvme_pr_clear(struct block_device *bdev, u64 key)
   2070{
   2071	u32 cdw10 = 1 | (key ? 1 << 3 : 0);
   2072
   2073	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
   2074}
   2075
   2076static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
   2077{
   2078	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
   2079
   2080	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
   2081}
   2082
   2083const struct pr_ops nvme_pr_ops = {
   2084	.pr_register	= nvme_pr_register,
   2085	.pr_reserve	= nvme_pr_reserve,
   2086	.pr_release	= nvme_pr_release,
   2087	.pr_preempt	= nvme_pr_preempt,
   2088	.pr_clear	= nvme_pr_clear,
   2089};
   2090
   2091#ifdef CONFIG_BLK_SED_OPAL
   2092int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
   2093		bool send)
   2094{
   2095	struct nvme_ctrl *ctrl = data;
   2096	struct nvme_command cmd = { };
   2097
   2098	if (send)
   2099		cmd.common.opcode = nvme_admin_security_send;
   2100	else
   2101		cmd.common.opcode = nvme_admin_security_recv;
   2102	cmd.common.nsid = 0;
   2103	cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
   2104	cmd.common.cdw11 = cpu_to_le32(len);
   2105
   2106	return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0,
   2107			NVME_QID_ANY, 1, 0);
   2108}
   2109EXPORT_SYMBOL_GPL(nvme_sec_submit);
   2110#endif /* CONFIG_BLK_SED_OPAL */
   2111
   2112#ifdef CONFIG_BLK_DEV_ZONED
   2113static int nvme_report_zones(struct gendisk *disk, sector_t sector,
   2114		unsigned int nr_zones, report_zones_cb cb, void *data)
   2115{
   2116	return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
   2117			data);
   2118}
   2119#else
   2120#define nvme_report_zones	NULL
   2121#endif /* CONFIG_BLK_DEV_ZONED */
   2122
   2123static const struct block_device_operations nvme_bdev_ops = {
   2124	.owner		= THIS_MODULE,
   2125	.ioctl		= nvme_ioctl,
   2126	.open		= nvme_open,
   2127	.release	= nvme_release,
   2128	.getgeo		= nvme_getgeo,
   2129	.report_zones	= nvme_report_zones,
   2130	.pr_ops		= &nvme_pr_ops,
   2131};
   2132
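/*
 * Poll CSTS.RDY until it reaches the requested state.  The timeout is given
 * in CAP.TO units of 500 milliseconds, hence the (timeout + 1) * HZ / 2
 * conversion to jiffies.
 */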
   2133static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled)
   2134{
   2135	unsigned long timeout_jiffies = ((timeout + 1) * HZ / 2) + jiffies;
   2136	u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
   2137	int ret;
   2138
   2139	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
   2140		if (csts == ~0)
   2141			return -ENODEV;
   2142		if ((csts & NVME_CSTS_RDY) == bit)
   2143			break;
   2144
   2145		usleep_range(1000, 2000);
   2146		if (fatal_signal_pending(current))
   2147			return -EINTR;
   2148		if (time_after(jiffies, timeout_jiffies)) {
   2149			dev_err(ctrl->device,
   2150				"Device not ready; aborting %s, CSTS=0x%x\n",
   2151				enabled ? "initialisation" : "reset", csts);
   2152			return -ENODEV;
   2153		}
   2154	}
   2155
   2156	return ret;
   2157}
   2158
   2159/*
   2160 * If the device has been passed off to us in an enabled state, just clear
   2161 * the enabled bit.  The spec says we should set the 'shutdown notification
   2162 * bits', but doing so may cause the device to complete commands to the
   2163 * admin queue ... and we don't know what memory that might be pointing at!
   2164 */
   2165int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
   2166{
   2167	int ret;
   2168
   2169	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
   2170	ctrl->ctrl_config &= ~NVME_CC_ENABLE;
   2171
   2172	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
   2173	if (ret)
   2174		return ret;
   2175
   2176	if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
   2177		msleep(NVME_QUIRK_DELAY_AMOUNT);
   2178
   2179	return nvme_wait_ready(ctrl, NVME_CAP_TIMEOUT(ctrl->cap), false);
   2180}
   2181EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
   2182
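/*
 * Program CC based on the controller's capabilities, set CC.EN and wait for
 * CSTS.RDY.  Controllers advertising CRMS use the CRTO register for the
 * readiness timeout instead of CAP.TO.
 */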
   2183int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
   2184{
   2185	unsigned dev_page_min;
   2186	u32 timeout;
   2187	int ret;
   2188
   2189	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
   2190	if (ret) {
   2191		dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
   2192		return ret;
   2193	}
   2194	dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
   2195
   2196	if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
   2197		dev_err(ctrl->device,
   2198			"Minimum device page size %u too large for host (%u)\n",
   2199			1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
   2200		return -ENODEV;
   2201	}
   2202
   2203	if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
   2204		ctrl->ctrl_config = NVME_CC_CSS_CSI;
   2205	else
   2206		ctrl->ctrl_config = NVME_CC_CSS_NVM;
   2207
   2208	if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
   2209		u32 crto;
   2210
   2211		ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
   2212		if (ret) {
   2213			dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
   2214				ret);
   2215			return ret;
   2216		}
   2217
   2218		if (ctrl->cap & NVME_CAP_CRMS_CRIMS) {
   2219			ctrl->ctrl_config |= NVME_CC_CRIME;
   2220			timeout = NVME_CRTO_CRIMT(crto);
   2221		} else {
   2222			timeout = NVME_CRTO_CRWMT(crto);
   2223		}
   2224	} else {
   2225		timeout = NVME_CAP_TIMEOUT(ctrl->cap);
   2226	}
   2227
   2228	ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
   2229	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
   2230	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
   2231	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
   2232	if (ret)
   2233		return ret;
   2234
   2235	/* Flush write to device (required if transport is PCI) */
   2236	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CC, &ctrl->ctrl_config);
   2237	if (ret)
   2238		return ret;
   2239
   2240	ctrl->ctrl_config |= NVME_CC_ENABLE;
   2241	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
   2242	if (ret)
   2243		return ret;
   2244	return nvme_wait_ready(ctrl, timeout, true);
   2245}
   2246EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
   2247
   2248int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
   2249{
   2250	unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ);
   2251	u32 csts;
   2252	int ret;
   2253
   2254	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
   2255	ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
   2256
   2257	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
   2258	if (ret)
   2259		return ret;
   2260
   2261	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
   2262		if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
   2263			break;
   2264
   2265		msleep(100);
   2266		if (fatal_signal_pending(current))
   2267			return -EINTR;
   2268		if (time_after(jiffies, timeout)) {
   2269			dev_err(ctrl->device,
   2270				"Device shutdown incomplete; abort shutdown\n");
   2271			return -ENODEV;
   2272		}
   2273	}
   2274
   2275	return ret;
   2276}
   2277EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
   2278
   2279static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
   2280{
   2281	__le64 ts;
   2282	int ret;
   2283
   2284	if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
   2285		return 0;
   2286
   2287	ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
   2288	ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
   2289			NULL);
   2290	if (ret)
   2291		dev_warn_once(ctrl->device,
   2292			"could not set timestamp (%d)\n", ret);
   2293	return ret;
   2294}
   2295
   2296static int nvme_configure_host_options(struct nvme_ctrl *ctrl)
   2297{
   2298	struct nvme_feat_host_behavior *host;
   2299	u8 acre = 0, lbafee = 0;
   2300	int ret;
   2301
   2302	/* Don't bother enabling the feature if retry delay is not reported */
   2303	if (ctrl->crdt[0])
   2304		acre = NVME_ENABLE_ACRE;
   2305	if (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)
   2306		lbafee = NVME_ENABLE_LBAFEE;
   2307
   2308	if (!acre && !lbafee)
   2309		return 0;
   2310
   2311	host = kzalloc(sizeof(*host), GFP_KERNEL);
   2312	if (!host)
   2313		return 0;
   2314
   2315	host->acre = acre;
   2316	host->lbafee = lbafee;
   2317	ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
   2318				host, sizeof(*host), NULL);
   2319	kfree(host);
   2320	return ret;
   2321}
   2322
   2323/*
   2324 * The function checks whether the given total (exlat + enlat) latency of
   2325 * a power state allows the latter to be used as an APST transition target.
   2326 * It does so by comparing the latency to the primary and secondary latency
   2327 * tolerances defined by module params. If there's a match, the corresponding
   2328 * timeout value is returned and the matching tolerance index (1 or 2) is
   2329 * reported.
   2330 */
   2331static bool nvme_apst_get_transition_time(u64 total_latency,
   2332		u64 *transition_time, unsigned *last_index)
   2333{
   2334	if (total_latency <= apst_primary_latency_tol_us) {
   2335		if (*last_index == 1)
   2336			return false;
   2337		*last_index = 1;
   2338		*transition_time = apst_primary_timeout_ms;
   2339		return true;
   2340	}
   2341	if (apst_secondary_timeout_ms &&
   2342		total_latency <= apst_secondary_latency_tol_us) {
   2343		if (*last_index <= 2)
   2344			return false;
   2345		*last_index = 2;
   2346		*transition_time = apst_secondary_timeout_ms;
   2347		return true;
   2348	}
   2349	return false;
   2350}
   2351
   2352/*
   2353 * APST (Autonomous Power State Transition) lets us program a table of power
   2354 * state transitions that the controller will perform automatically.
   2355 *
   2356 * Depending on module params, one of the two supported techniques will be used:
   2357 *
   2358 * - If the parameters provide explicit timeouts and tolerances, they will be
   2359 *   used to build a table with up to 2 non-operational states to transition to.
   2360 *   The default parameter values were selected based on the values used by
   2361 *   Microsoft's and Intel's NVMe drivers. Yet, since we don't implement dynamic
   2362 *   regeneration of the APST table in the event of switching between external
   2363 *   and battery power, the timeouts and tolerances reflect a compromise
   2364 *   between values used by Microsoft for AC and battery scenarios.
   2365 * - If not, we'll configure the table with a simple heuristic: we are willing
   2366 *   to spend at most 2% of the time transitioning between power states.
   2367 *   Therefore, when running in any given state, we will enter the next
   2368 *   lower-power non-operational state after waiting 50 * (enlat + exlat)
   2369 *   microseconds, as long as that state's exit latency is under the requested
   2370 *   maximum latency.
   2371 *
   2372 * We will not autonomously enter any non-operational state for which the total
   2373 * latency exceeds ps_max_latency_us.
   2374 *
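 * Each entry in the APST table encodes the idle transition power state in
 * bits 07:03 and the idle time prior to transition, in milliseconds, in
 * bits 31:08, which is why entries are built below as
 * (state << 3) | (transition_ms << 8).
 *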
   2375 * Users can set ps_max_latency_us to zero to turn off APST.
   2376 */
   2377static int nvme_configure_apst(struct nvme_ctrl *ctrl)
   2378{
   2379	struct nvme_feat_auto_pst *table;
   2380	unsigned apste = 0;
   2381	u64 max_lat_us = 0;
   2382	__le64 target = 0;
   2383	int max_ps = -1;
   2384	int state;
   2385	int ret;
   2386	unsigned last_lt_index = UINT_MAX;
   2387
   2388	/*
   2389	 * If APST isn't supported or if we haven't been initialized yet,
   2390	 * then don't do anything.
   2391	 */
   2392	if (!ctrl->apsta)
   2393		return 0;
   2394
   2395	if (ctrl->npss > 31) {
   2396		dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
   2397		return 0;
   2398	}
   2399
   2400	table = kzalloc(sizeof(*table), GFP_KERNEL);
   2401	if (!table)
   2402		return 0;
   2403
   2404	if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
   2405		/* Turn off APST. */
   2406		dev_dbg(ctrl->device, "APST disabled\n");
   2407		goto done;
   2408	}
   2409
   2410	/*
   2411	 * Walk through all states from lowest- to highest-power.
   2412	 * According to the spec, lower-numbered states use more power.  NPSS,
   2413	 * despite the name, is the index of the lowest-power state, not the
   2414	 * number of states.
   2415	 */
   2416	for (state = (int)ctrl->npss; state >= 0; state--) {
   2417		u64 total_latency_us, exit_latency_us, transition_ms;
   2418
   2419		if (target)
   2420			table->entries[state] = target;
   2421
   2422		/*
   2423		 * Don't allow transitions to the deepest state if it's quirked
   2424		 * off.
   2425		 */
   2426		if (state == ctrl->npss &&
   2427		    (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
   2428			continue;
   2429
   2430		/*
   2431		 * Is this state a useful non-operational state for higher-power
   2432		 * states to autonomously transition to?
   2433		 */
   2434		if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE))
   2435			continue;
   2436
   2437		exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
   2438		if (exit_latency_us > ctrl->ps_max_latency_us)
   2439			continue;
   2440
   2441		total_latency_us = exit_latency_us +
   2442			le32_to_cpu(ctrl->psd[state].entry_lat);
   2443
   2444		/*
   2445		 * This state is good. It can be used as the APST idle target
   2446		 * for higher power states.
   2447		 */
   2448		if (apst_primary_timeout_ms && apst_primary_latency_tol_us) {
   2449			if (!nvme_apst_get_transition_time(total_latency_us,
   2450					&transition_ms, &last_lt_index))
   2451				continue;
   2452		} else {
   2453			transition_ms = total_latency_us + 19;
   2454			do_div(transition_ms, 20);
   2455			if (transition_ms > (1 << 24) - 1)
   2456				transition_ms = (1 << 24) - 1;
   2457		}
   2458
   2459		target = cpu_to_le64((state << 3) | (transition_ms << 8));
   2460		if (max_ps == -1)
   2461			max_ps = state;
   2462		if (total_latency_us > max_lat_us)
   2463			max_lat_us = total_latency_us;
   2464	}
   2465
   2466	if (max_ps == -1)
   2467		dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
   2468	else
   2469		dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
   2470			max_ps, max_lat_us, (int)sizeof(*table), table);
   2471	apste = 1;
   2472
   2473done:
   2474	ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
   2475				table, sizeof(*table), NULL);
   2476	if (ret)
   2477		dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
   2478	kfree(table);
   2479	return ret;
   2480}
   2481
   2482static void nvme_set_latency_tolerance(struct device *dev, s32 val)
   2483{
   2484	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   2485	u64 latency;
   2486
   2487	switch (val) {
   2488	case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
   2489	case PM_QOS_LATENCY_ANY:
   2490		latency = U64_MAX;
   2491		break;
   2492
   2493	default:
   2494		latency = val;
   2495	}
   2496
   2497	if (ctrl->ps_max_latency_us != latency) {
   2498		ctrl->ps_max_latency_us = latency;
   2499		if (ctrl->state == NVME_CTRL_LIVE)
   2500			nvme_configure_apst(ctrl);
   2501	}
   2502}
   2503
   2504struct nvme_core_quirk_entry {
   2505	/*
   2506	 * NVMe model and firmware strings are padded with spaces.  For
   2507	 * simplicity, strings in the quirk table are padded with NULLs
   2508	 * instead.
   2509	 */
   2510	u16 vid;
   2511	const char *mn;
   2512	const char *fr;
   2513	unsigned long quirks;
   2514};
   2515
   2516static const struct nvme_core_quirk_entry core_quirks[] = {
   2517	{
   2518		/*
   2519		 * This Toshiba device seems to die using any APST states.  See:
   2520		 * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
   2521		 */
   2522		.vid = 0x1179,
   2523		.mn = "THNSF5256GPUK TOSHIBA",
   2524		.quirks = NVME_QUIRK_NO_APST,
   2525	},
   2526	{
   2527		/*
   2528		 * This LiteON CL1-3D*-Q11 firmware version has a race
    2529		 * condition associated with actions related to suspend to idle.
    2530		 * LiteON has resolved the problem in future firmware.
   2531		 */
   2532		.vid = 0x14a4,
   2533		.fr = "22301111",
   2534		.quirks = NVME_QUIRK_SIMPLE_SUSPEND,
   2535	},
   2536	{
   2537		/*
   2538		 * This Kioxia CD6-V Series / HPE PE8030 device times out and
   2539		 * aborts I/O during any load, but more easily reproducible
   2540		 * with discards (fstrim).
   2541		 *
   2542		 * The device is left in a state where it is also not possible
   2543		 * to use "nvme set-feature" to disable APST, but booting with
    2544		 * nvme_core.default_ps_max_latency_us=0 works.
   2545		 */
   2546		.vid = 0x1e0f,
   2547		.mn = "KCD6XVUL6T40",
   2548		.quirks = NVME_QUIRK_NO_APST,
   2549	},
   2550	{
   2551		/*
   2552		 * The external Samsung X5 SSD fails initialization without a
   2553		 * delay before checking if it is ready and has a whole set of
   2554		 * other problems.  To make this even more interesting, it
   2555		 * shares the PCI ID with internal Samsung 970 Evo Plus that
   2556		 * does not need or want these quirks.
   2557		 */
   2558		.vid = 0x144d,
   2559		.mn = "Samsung Portable SSD X5",
   2560		.quirks = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
   2561			  NVME_QUIRK_NO_DEEPEST_PS |
   2562			  NVME_QUIRK_IGNORE_DEV_SUBNQN,
   2563	}
   2564};
   2565
   2566/* match is null-terminated but idstr is space-padded. */
   2567static bool string_matches(const char *idstr, const char *match, size_t len)
   2568{
   2569	size_t matchlen;
   2570
   2571	if (!match)
   2572		return true;
   2573
   2574	matchlen = strlen(match);
   2575	WARN_ON_ONCE(matchlen > len);
   2576
   2577	if (memcmp(idstr, match, matchlen))
   2578		return false;
   2579
   2580	for (; matchlen < len; matchlen++)
   2581		if (idstr[matchlen] != ' ')
   2582			return false;
   2583
   2584	return true;
   2585}
   2586
   2587static bool quirk_matches(const struct nvme_id_ctrl *id,
   2588			  const struct nvme_core_quirk_entry *q)
   2589{
   2590	return q->vid == le16_to_cpu(id->vid) &&
   2591		string_matches(id->mn, q->mn, sizeof(id->mn)) &&
   2592		string_matches(id->fr, q->fr, sizeof(id->fr));
   2593}
   2594
   2595static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
   2596		struct nvme_id_ctrl *id)
   2597{
   2598	size_t nqnlen;
   2599	int off;
   2600
    2601	if (!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
   2602		nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
   2603		if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
   2604			strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
   2605			return;
   2606		}
   2607
   2608		if (ctrl->vs >= NVME_VS(1, 2, 1))
   2609			dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
   2610	}
   2611
   2612	/* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
   2613	off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
   2614			"nqn.2014.08.org.nvmexpress:%04x%04x",
   2615			le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
   2616	memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
   2617	off += sizeof(id->sn);
   2618	memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
   2619	off += sizeof(id->mn);
   2620	memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
   2621}
   2622
   2623static void nvme_release_subsystem(struct device *dev)
   2624{
   2625	struct nvme_subsystem *subsys =
   2626		container_of(dev, struct nvme_subsystem, dev);
   2627
   2628	if (subsys->instance >= 0)
   2629		ida_free(&nvme_instance_ida, subsys->instance);
   2630	kfree(subsys);
   2631}
   2632
   2633static void nvme_destroy_subsystem(struct kref *ref)
   2634{
   2635	struct nvme_subsystem *subsys =
   2636			container_of(ref, struct nvme_subsystem, ref);
   2637
   2638	mutex_lock(&nvme_subsystems_lock);
   2639	list_del(&subsys->entry);
   2640	mutex_unlock(&nvme_subsystems_lock);
   2641
   2642	ida_destroy(&subsys->ns_ida);
   2643	device_del(&subsys->dev);
   2644	put_device(&subsys->dev);
   2645}
   2646
   2647static void nvme_put_subsystem(struct nvme_subsystem *subsys)
   2648{
   2649	kref_put(&subsys->ref, nvme_destroy_subsystem);
   2650}
   2651
   2652static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
   2653{
   2654	struct nvme_subsystem *subsys;
   2655
   2656	lockdep_assert_held(&nvme_subsystems_lock);
   2657
   2658	/*
    2659	 * Fail matches for discovery subsystems. This results in each
    2660	 * discovery controller being bound to a unique subsystem.
   2661	 * This avoids issues with validating controller values
   2662	 * that can only be true when there is a single unique subsystem.
   2663	 * There may be multiple and completely independent entities
   2664	 * that provide discovery controllers.
   2665	 */
   2666	if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
   2667		return NULL;
   2668
   2669	list_for_each_entry(subsys, &nvme_subsystems, entry) {
   2670		if (strcmp(subsys->subnqn, subsysnqn))
   2671			continue;
   2672		if (!kref_get_unless_zero(&subsys->ref))
   2673			continue;
   2674		return subsys;
   2675	}
   2676
   2677	return NULL;
   2678}
   2679
   2680#define SUBSYS_ATTR_RO(_name, _mode, _show)			\
   2681	struct device_attribute subsys_attr_##_name = \
   2682		__ATTR(_name, _mode, _show, NULL)
   2683
   2684static ssize_t nvme_subsys_show_nqn(struct device *dev,
   2685				    struct device_attribute *attr,
   2686				    char *buf)
   2687{
   2688	struct nvme_subsystem *subsys =
   2689		container_of(dev, struct nvme_subsystem, dev);
   2690
   2691	return sysfs_emit(buf, "%s\n", subsys->subnqn);
   2692}
   2693static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
   2694
   2695static ssize_t nvme_subsys_show_type(struct device *dev,
   2696				    struct device_attribute *attr,
   2697				    char *buf)
   2698{
   2699	struct nvme_subsystem *subsys =
   2700		container_of(dev, struct nvme_subsystem, dev);
   2701
   2702	switch (subsys->subtype) {
   2703	case NVME_NQN_DISC:
   2704		return sysfs_emit(buf, "discovery\n");
   2705	case NVME_NQN_NVME:
   2706		return sysfs_emit(buf, "nvm\n");
   2707	default:
   2708		return sysfs_emit(buf, "reserved\n");
   2709	}
   2710}
   2711static SUBSYS_ATTR_RO(subsystype, S_IRUGO, nvme_subsys_show_type);
   2712
   2713#define nvme_subsys_show_str_function(field)				\
   2714static ssize_t subsys_##field##_show(struct device *dev,		\
   2715			    struct device_attribute *attr, char *buf)	\
   2716{									\
   2717	struct nvme_subsystem *subsys =					\
   2718		container_of(dev, struct nvme_subsystem, dev);		\
   2719	return sysfs_emit(buf, "%.*s\n",				\
   2720			   (int)sizeof(subsys->field), subsys->field);	\
   2721}									\
   2722static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
   2723
   2724nvme_subsys_show_str_function(model);
   2725nvme_subsys_show_str_function(serial);
   2726nvme_subsys_show_str_function(firmware_rev);
   2727
   2728static struct attribute *nvme_subsys_attrs[] = {
   2729	&subsys_attr_model.attr,
   2730	&subsys_attr_serial.attr,
   2731	&subsys_attr_firmware_rev.attr,
   2732	&subsys_attr_subsysnqn.attr,
   2733	&subsys_attr_subsystype.attr,
   2734#ifdef CONFIG_NVME_MULTIPATH
   2735	&subsys_attr_iopolicy.attr,
   2736#endif
   2737	NULL,
   2738};
   2739
   2740static const struct attribute_group nvme_subsys_attrs_group = {
   2741	.attrs = nvme_subsys_attrs,
   2742};
   2743
   2744static const struct attribute_group *nvme_subsys_attrs_groups[] = {
   2745	&nvme_subsys_attrs_group,
   2746	NULL,
   2747};
   2748
   2749static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
   2750{
   2751	return ctrl->opts && ctrl->opts->discovery_nqn;
   2752}
   2753
   2754static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
   2755		struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
   2756{
   2757	struct nvme_ctrl *tmp;
   2758
   2759	lockdep_assert_held(&nvme_subsystems_lock);
   2760
   2761	list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
   2762		if (nvme_state_terminal(tmp))
   2763			continue;
   2764
   2765		if (tmp->cntlid == ctrl->cntlid) {
   2766			dev_err(ctrl->device,
   2767				"Duplicate cntlid %u with %s, subsys %s, rejecting\n",
   2768				ctrl->cntlid, dev_name(tmp->device),
   2769				subsys->subnqn);
   2770			return false;
   2771		}
   2772
   2773		if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
   2774		    nvme_discovery_ctrl(ctrl))
   2775			continue;
   2776
   2777		dev_err(ctrl->device,
   2778			"Subsystem does not support multiple controllers\n");
   2779		return false;
   2780	}
   2781
   2782	return true;
   2783}
   2784
   2785static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
   2786{
   2787	struct nvme_subsystem *subsys, *found;
   2788	int ret;
   2789
   2790	subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
   2791	if (!subsys)
   2792		return -ENOMEM;
   2793
   2794	subsys->instance = -1;
   2795	mutex_init(&subsys->lock);
   2796	kref_init(&subsys->ref);
   2797	INIT_LIST_HEAD(&subsys->ctrls);
   2798	INIT_LIST_HEAD(&subsys->nsheads);
   2799	nvme_init_subnqn(subsys, ctrl, id);
   2800	memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
   2801	memcpy(subsys->model, id->mn, sizeof(subsys->model));
   2802	memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
   2803	subsys->vendor_id = le16_to_cpu(id->vid);
   2804	subsys->cmic = id->cmic;
   2805
   2806	/* Versions prior to 1.4 don't necessarily report a valid type */
   2807	if (id->cntrltype == NVME_CTRL_DISC ||
   2808	    !strcmp(subsys->subnqn, NVME_DISC_SUBSYS_NAME))
   2809		subsys->subtype = NVME_NQN_DISC;
   2810	else
   2811		subsys->subtype = NVME_NQN_NVME;
   2812
   2813	if (nvme_discovery_ctrl(ctrl) && subsys->subtype != NVME_NQN_DISC) {
   2814		dev_err(ctrl->device,
   2815			"Subsystem %s is not a discovery controller",
   2816			subsys->subnqn);
   2817		kfree(subsys);
   2818		return -EINVAL;
   2819	}
   2820	subsys->awupf = le16_to_cpu(id->awupf);
   2821	nvme_mpath_default_iopolicy(subsys);
   2822
   2823	subsys->dev.class = nvme_subsys_class;
   2824	subsys->dev.release = nvme_release_subsystem;
   2825	subsys->dev.groups = nvme_subsys_attrs_groups;
   2826	dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
   2827	device_initialize(&subsys->dev);
   2828
   2829	mutex_lock(&nvme_subsystems_lock);
   2830	found = __nvme_find_get_subsystem(subsys->subnqn);
   2831	if (found) {
   2832		put_device(&subsys->dev);
   2833		subsys = found;
   2834
   2835		if (!nvme_validate_cntlid(subsys, ctrl, id)) {
   2836			ret = -EINVAL;
   2837			goto out_put_subsystem;
   2838		}
   2839	} else {
   2840		ret = device_add(&subsys->dev);
   2841		if (ret) {
   2842			dev_err(ctrl->device,
   2843				"failed to register subsystem device.\n");
   2844			put_device(&subsys->dev);
   2845			goto out_unlock;
   2846		}
   2847		ida_init(&subsys->ns_ida);
   2848		list_add_tail(&subsys->entry, &nvme_subsystems);
   2849	}
   2850
   2851	ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
   2852				dev_name(ctrl->device));
   2853	if (ret) {
   2854		dev_err(ctrl->device,
   2855			"failed to create sysfs link from subsystem.\n");
   2856		goto out_put_subsystem;
   2857	}
   2858
   2859	if (!found)
   2860		subsys->instance = ctrl->instance;
   2861	ctrl->subsys = subsys;
   2862	list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
   2863	mutex_unlock(&nvme_subsystems_lock);
   2864	return 0;
   2865
   2866out_put_subsystem:
   2867	nvme_put_subsystem(subsys);
   2868out_unlock:
   2869	mutex_unlock(&nvme_subsystems_lock);
   2870	return ret;
   2871}
   2872
   2873int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
   2874		void *log, size_t size, u64 offset)
   2875{
   2876	struct nvme_command c = { };
   2877	u32 dwlen = nvme_bytes_to_numd(size);
   2878
   2879	c.get_log_page.opcode = nvme_admin_get_log_page;
   2880	c.get_log_page.nsid = cpu_to_le32(nsid);
   2881	c.get_log_page.lid = log_page;
   2882	c.get_log_page.lsp = lsp;
   2883	c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
   2884	c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
   2885	c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
   2886	c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
   2887	c.get_log_page.csi = csi;
   2888
   2889	return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
   2890}
   2891
   2892static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
   2893				struct nvme_effects_log **log)
   2894{
   2895	struct nvme_effects_log	*cel = xa_load(&ctrl->cels, csi);
   2896	int ret;
   2897
   2898	if (cel)
   2899		goto out;
   2900
   2901	cel = kzalloc(sizeof(*cel), GFP_KERNEL);
   2902	if (!cel)
   2903		return -ENOMEM;
   2904
   2905	ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
   2906			cel, sizeof(*cel), 0);
   2907	if (ret) {
   2908		kfree(cel);
   2909		return ret;
   2910	}
   2911
   2912	xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
   2913out:
   2914	*log = cel;
   2915	return 0;
   2916}
   2917
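/*
 * Convert a value reported as a power of two in units of the controller's
 * minimum memory page size (CAP.MPSMIN) into 512-byte sectors, saturating
 * at UINT_MAX on overflow.
 */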
   2918static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units)
   2919{
   2920	u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12, val;
   2921
   2922	if (check_shl_overflow(1U, units + page_shift - 9, &val))
   2923		return UINT_MAX;
   2924	return val;
   2925}
   2926
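/*
 * Initialize transfer limits that are not derived from MDTS: discard and
 * write-zeroes sizes, refined by the NVM command set specific Identify
 * Controller data (DMRL, DMRSL, WZSL) where available.
 */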
   2927static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
   2928{
   2929	struct nvme_command c = { };
   2930	struct nvme_id_ctrl_nvm *id;
   2931	int ret;
   2932
   2933	if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
   2934		ctrl->max_discard_sectors = UINT_MAX;
   2935		ctrl->max_discard_segments = NVME_DSM_MAX_RANGES;
   2936	} else {
   2937		ctrl->max_discard_sectors = 0;
   2938		ctrl->max_discard_segments = 0;
   2939	}
   2940
   2941	/*
    2942	 * Even though the NVMe spec explicitly states that MDTS is not
    2943	 * applicable to write-zeroes, we are cautious and limit the size to
    2944	 * the controller's max_hw_sectors value, which is based on the MDTS
    2945	 * field and possibly other limiting factors.
   2946	 */
   2947	if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
   2948	    !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
   2949		ctrl->max_zeroes_sectors = ctrl->max_hw_sectors;
   2950	else
   2951		ctrl->max_zeroes_sectors = 0;
   2952
   2953	if (nvme_ctrl_limited_cns(ctrl))
   2954		return 0;
   2955
   2956	id = kzalloc(sizeof(*id), GFP_KERNEL);
   2957	if (!id)
   2958		return 0;
   2959
   2960	c.identify.opcode = nvme_admin_identify;
   2961	c.identify.cns = NVME_ID_CNS_CS_CTRL;
   2962	c.identify.csi = NVME_CSI_NVM;
   2963
   2964	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
   2965	if (ret)
   2966		goto free_data;
   2967
   2968	if (id->dmrl)
   2969		ctrl->max_discard_segments = id->dmrl;
   2970	ctrl->dmrsl = le32_to_cpu(id->dmrsl);
   2971	if (id->wzsl)
   2972		ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
   2973
   2974free_data:
   2975	kfree(id);
   2976	return ret;
   2977}
   2978
   2979static int nvme_init_identify(struct nvme_ctrl *ctrl)
   2980{
   2981	struct nvme_id_ctrl *id;
   2982	u32 max_hw_sectors;
   2983	bool prev_apst_enabled;
   2984	int ret;
   2985
   2986	ret = nvme_identify_ctrl(ctrl, &id);
   2987	if (ret) {
   2988		dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
   2989		return -EIO;
   2990	}
   2991
   2992	if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
   2993		ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
   2994		if (ret < 0)
   2995			goto out_free;
   2996	}
   2997
   2998	if (!(ctrl->ops->flags & NVME_F_FABRICS))
   2999		ctrl->cntlid = le16_to_cpu(id->cntlid);
   3000
   3001	if (!ctrl->identified) {
   3002		unsigned int i;
   3003
   3004		ret = nvme_init_subsystem(ctrl, id);
   3005		if (ret)
   3006			goto out_free;
   3007
   3008		/*
    3009		 * Check for quirks.  Quirks can depend on firmware version,
   3010		 * so, in principle, the set of quirks present can change
   3011		 * across a reset.  As a possible future enhancement, we
   3012		 * could re-scan for quirks every time we reinitialize
   3013		 * the device, but we'd have to make sure that the driver
   3014		 * behaves intelligently if the quirks change.
   3015		 */
   3016		for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
   3017			if (quirk_matches(id, &core_quirks[i]))
   3018				ctrl->quirks |= core_quirks[i].quirks;
   3019		}
   3020	}
   3021
   3022	if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
   3023		dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
   3024		ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
   3025	}
   3026
   3027	ctrl->crdt[0] = le16_to_cpu(id->crdt1);
   3028	ctrl->crdt[1] = le16_to_cpu(id->crdt2);
   3029	ctrl->crdt[2] = le16_to_cpu(id->crdt3);
   3030
   3031	ctrl->oacs = le16_to_cpu(id->oacs);
   3032	ctrl->oncs = le16_to_cpu(id->oncs);
   3033	ctrl->mtfa = le16_to_cpu(id->mtfa);
   3034	ctrl->oaes = le32_to_cpu(id->oaes);
   3035	ctrl->wctemp = le16_to_cpu(id->wctemp);
   3036	ctrl->cctemp = le16_to_cpu(id->cctemp);
   3037
   3038	atomic_set(&ctrl->abort_limit, id->acl + 1);
   3039	ctrl->vwc = id->vwc;
   3040	if (id->mdts)
   3041		max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts);
   3042	else
   3043		max_hw_sectors = UINT_MAX;
   3044	ctrl->max_hw_sectors =
   3045		min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
   3046
   3047	nvme_set_queue_limits(ctrl, ctrl->admin_q);
   3048	ctrl->sgls = le32_to_cpu(id->sgls);
   3049	ctrl->kas = le16_to_cpu(id->kas);
   3050	ctrl->max_namespaces = le32_to_cpu(id->mnan);
   3051	ctrl->ctratt = le32_to_cpu(id->ctratt);
   3052
   3053	ctrl->cntrltype = id->cntrltype;
   3054	ctrl->dctype = id->dctype;
   3055
   3056	if (id->rtd3e) {
   3057		/* us -> s */
   3058		u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
   3059
   3060		ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
   3061						 shutdown_timeout, 60);
   3062
   3063		if (ctrl->shutdown_timeout != shutdown_timeout)
   3064			dev_info(ctrl->device,
   3065				 "Shutdown timeout set to %u seconds\n",
   3066				 ctrl->shutdown_timeout);
   3067	} else
   3068		ctrl->shutdown_timeout = shutdown_timeout;
   3069
   3070	ctrl->npss = id->npss;
   3071	ctrl->apsta = id->apsta;
   3072	prev_apst_enabled = ctrl->apst_enabled;
   3073	if (ctrl->quirks & NVME_QUIRK_NO_APST) {
   3074		if (force_apst && id->apsta) {
   3075			dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
   3076			ctrl->apst_enabled = true;
   3077		} else {
   3078			ctrl->apst_enabled = false;
   3079		}
   3080	} else {
   3081		ctrl->apst_enabled = id->apsta;
   3082	}
   3083	memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
   3084
   3085	if (ctrl->ops->flags & NVME_F_FABRICS) {
   3086		ctrl->icdoff = le16_to_cpu(id->icdoff);
   3087		ctrl->ioccsz = le32_to_cpu(id->ioccsz);
   3088		ctrl->iorcsz = le32_to_cpu(id->iorcsz);
   3089		ctrl->maxcmd = le16_to_cpu(id->maxcmd);
   3090
   3091		/*
   3092		 * In fabrics we need to verify the cntlid matches the
   3093		 * admin connect
   3094		 */
   3095		if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
   3096			dev_err(ctrl->device,
   3097				"Mismatching cntlid: Connect %u vs Identify "
   3098				"%u, rejecting\n",
   3099				ctrl->cntlid, le16_to_cpu(id->cntlid));
   3100			ret = -EINVAL;
   3101			goto out_free;
   3102		}
   3103
   3104		if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
   3105			dev_err(ctrl->device,
   3106				"keep-alive support is mandatory for fabrics\n");
   3107			ret = -EINVAL;
   3108			goto out_free;
   3109		}
   3110	} else {
   3111		ctrl->hmpre = le32_to_cpu(id->hmpre);
   3112		ctrl->hmmin = le32_to_cpu(id->hmmin);
   3113		ctrl->hmminds = le32_to_cpu(id->hmminds);
   3114		ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
   3115	}
   3116
   3117	ret = nvme_mpath_init_identify(ctrl, id);
   3118	if (ret < 0)
   3119		goto out_free;
   3120
   3121	if (ctrl->apst_enabled && !prev_apst_enabled)
   3122		dev_pm_qos_expose_latency_tolerance(ctrl->device);
   3123	else if (!ctrl->apst_enabled && prev_apst_enabled)
   3124		dev_pm_qos_hide_latency_tolerance(ctrl->device);
   3125
   3126out_free:
   3127	kfree(id);
   3128	return ret;
   3129}
   3130
   3131/*
   3132 * Initialize the cached copies of the Identify data and various controller
    3133 * registers in our nvme_ctrl structure.  This should be called as soon as
   3134 * the admin queue is fully up and running.
   3135 */
   3136int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl)
   3137{
   3138	int ret;
   3139
   3140	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
   3141	if (ret) {
   3142		dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
   3143		return ret;
   3144	}
   3145
   3146	ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
   3147
   3148	if (ctrl->vs >= NVME_VS(1, 1, 0))
   3149		ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
   3150
   3151	ret = nvme_init_identify(ctrl);
   3152	if (ret)
   3153		return ret;
   3154
   3155	ret = nvme_configure_apst(ctrl);
   3156	if (ret < 0)
   3157		return ret;
   3158
   3159	ret = nvme_configure_timestamp(ctrl);
   3160	if (ret < 0)
   3161		return ret;
   3162
   3163	ret = nvme_configure_host_options(ctrl);
   3164	if (ret < 0)
   3165		return ret;
   3166
   3167	if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
   3168		ret = nvme_hwmon_init(ctrl);
   3169		if (ret < 0)
   3170			return ret;
   3171	}
   3172
   3173	ctrl->identified = true;
   3174
   3175	return 0;
   3176}
   3177EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish);
   3178
   3179static int nvme_dev_open(struct inode *inode, struct file *file)
   3180{
   3181	struct nvme_ctrl *ctrl =
   3182		container_of(inode->i_cdev, struct nvme_ctrl, cdev);
   3183
   3184	switch (ctrl->state) {
   3185	case NVME_CTRL_LIVE:
   3186		break;
   3187	default:
   3188		return -EWOULDBLOCK;
   3189	}
   3190
   3191	nvme_get_ctrl(ctrl);
   3192	if (!try_module_get(ctrl->ops->module)) {
   3193		nvme_put_ctrl(ctrl);
   3194		return -EINVAL;
   3195	}
   3196
   3197	file->private_data = ctrl;
   3198	return 0;
   3199}
   3200
   3201static int nvme_dev_release(struct inode *inode, struct file *file)
   3202{
   3203	struct nvme_ctrl *ctrl =
   3204		container_of(inode->i_cdev, struct nvme_ctrl, cdev);
   3205
   3206	module_put(ctrl->ops->module);
   3207	nvme_put_ctrl(ctrl);
   3208	return 0;
   3209}
   3210
   3211static const struct file_operations nvme_dev_fops = {
   3212	.owner		= THIS_MODULE,
   3213	.open		= nvme_dev_open,
   3214	.release	= nvme_dev_release,
   3215	.unlocked_ioctl	= nvme_dev_ioctl,
   3216	.compat_ioctl	= compat_ptr_ioctl,
   3217	.uring_cmd	= nvme_dev_uring_cmd,
   3218};
   3219
   3220static ssize_t nvme_sysfs_reset(struct device *dev,
   3221				struct device_attribute *attr, const char *buf,
   3222				size_t count)
   3223{
   3224	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3225	int ret;
   3226
   3227	ret = nvme_reset_ctrl_sync(ctrl);
   3228	if (ret < 0)
   3229		return ret;
   3230	return count;
   3231}
   3232static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
   3233
   3234static ssize_t nvme_sysfs_rescan(struct device *dev,
   3235				struct device_attribute *attr, const char *buf,
   3236				size_t count)
   3237{
   3238	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3239
   3240	nvme_queue_scan(ctrl);
   3241	return count;
   3242}
   3243static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
   3244
   3245static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev)
   3246{
   3247	struct gendisk *disk = dev_to_disk(dev);
   3248
   3249	if (disk->fops == &nvme_bdev_ops)
   3250		return nvme_get_ns_from_dev(dev)->head;
   3251	else
   3252		return disk->private_data;
   3253}
   3254
   3255static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
   3256		char *buf)
   3257{
   3258	struct nvme_ns_head *head = dev_to_ns_head(dev);
   3259	struct nvme_ns_ids *ids = &head->ids;
   3260	struct nvme_subsystem *subsys = head->subsys;
   3261	int serial_len = sizeof(subsys->serial);
   3262	int model_len = sizeof(subsys->model);
   3263
   3264	if (!uuid_is_null(&ids->uuid))
   3265		return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid);
   3266
   3267	if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
   3268		return sysfs_emit(buf, "eui.%16phN\n", ids->nguid);
   3269
   3270	if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
   3271		return sysfs_emit(buf, "eui.%8phN\n", ids->eui64);
   3272
   3273	while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
   3274				  subsys->serial[serial_len - 1] == '\0'))
   3275		serial_len--;
   3276	while (model_len > 0 && (subsys->model[model_len - 1] == ' ' ||
   3277				 subsys->model[model_len - 1] == '\0'))
   3278		model_len--;
   3279
   3280	return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
   3281		serial_len, subsys->serial, model_len, subsys->model,
   3282		head->ns_id);
   3283}
   3284static DEVICE_ATTR_RO(wwid);
   3285
   3286static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
   3287		char *buf)
   3288{
   3289	return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
   3290}
   3291static DEVICE_ATTR_RO(nguid);
   3292
   3293static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
   3294		char *buf)
   3295{
   3296	struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
   3297
   3298	/* For backward compatibility expose the NGUID to userspace if
   3299	 * we have no UUID set
   3300	 */
   3301	if (uuid_is_null(&ids->uuid)) {
   3302		dev_warn_ratelimited(dev,
   3303			"No UUID available providing old NGUID\n");
   3304		return sysfs_emit(buf, "%pU\n", ids->nguid);
   3305	}
   3306	return sysfs_emit(buf, "%pU\n", &ids->uuid);
   3307}
   3308static DEVICE_ATTR_RO(uuid);
   3309
   3310static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
   3311		char *buf)
   3312{
   3313	return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
   3314}
   3315static DEVICE_ATTR_RO(eui);
   3316
   3317static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
   3318		char *buf)
   3319{
   3320	return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
   3321}
   3322static DEVICE_ATTR_RO(nsid);
   3323
   3324static struct attribute *nvme_ns_id_attrs[] = {
   3325	&dev_attr_wwid.attr,
   3326	&dev_attr_uuid.attr,
   3327	&dev_attr_nguid.attr,
   3328	&dev_attr_eui.attr,
   3329	&dev_attr_nsid.attr,
   3330#ifdef CONFIG_NVME_MULTIPATH
   3331	&dev_attr_ana_grpid.attr,
   3332	&dev_attr_ana_state.attr,
   3333#endif
   3334	NULL,
   3335};
   3336
   3337static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
   3338		struct attribute *a, int n)
   3339{
   3340	struct device *dev = container_of(kobj, struct device, kobj);
   3341	struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
   3342
   3343	if (a == &dev_attr_uuid.attr) {
   3344		if (uuid_is_null(&ids->uuid) &&
   3345		    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
   3346			return 0;
   3347	}
   3348	if (a == &dev_attr_nguid.attr) {
   3349		if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
   3350			return 0;
   3351	}
   3352	if (a == &dev_attr_eui.attr) {
   3353		if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
   3354			return 0;
   3355	}
   3356#ifdef CONFIG_NVME_MULTIPATH
   3357	if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
   3358		if (dev_to_disk(dev)->fops != &nvme_bdev_ops) /* per-path attr */
   3359			return 0;
   3360		if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
   3361			return 0;
   3362	}
   3363#endif
   3364	return a->mode;
   3365}
   3366
   3367static const struct attribute_group nvme_ns_id_attr_group = {
   3368	.attrs		= nvme_ns_id_attrs,
   3369	.is_visible	= nvme_ns_id_attrs_are_visible,
   3370};
   3371
   3372const struct attribute_group *nvme_ns_id_attr_groups[] = {
   3373	&nvme_ns_id_attr_group,
   3374	NULL,
   3375};
   3376
   3377#define nvme_show_str_function(field)						\
   3378static ssize_t  field##_show(struct device *dev,				\
   3379			    struct device_attribute *attr, char *buf)		\
   3380{										\
   3381        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);				\
   3382        return sysfs_emit(buf, "%.*s\n",					\
   3383		(int)sizeof(ctrl->subsys->field), ctrl->subsys->field);		\
   3384}										\
   3385static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
   3386
   3387nvme_show_str_function(model);
   3388nvme_show_str_function(serial);
   3389nvme_show_str_function(firmware_rev);
   3390
   3391#define nvme_show_int_function(field)						\
   3392static ssize_t  field##_show(struct device *dev,				\
   3393			    struct device_attribute *attr, char *buf)		\
   3394{										\
   3395        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);				\
   3396        return sysfs_emit(buf, "%d\n", ctrl->field);				\
   3397}										\
   3398static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
   3399
   3400nvme_show_int_function(cntlid);
   3401nvme_show_int_function(numa_node);
   3402nvme_show_int_function(queue_count);
   3403nvme_show_int_function(sqsize);
   3404nvme_show_int_function(kato);
   3405
   3406static ssize_t nvme_sysfs_delete(struct device *dev,
   3407				struct device_attribute *attr, const char *buf,
   3408				size_t count)
   3409{
   3410	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3411
   3412	if (device_remove_file_self(dev, attr))
   3413		nvme_delete_ctrl_sync(ctrl);
   3414	return count;
   3415}
   3416static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
   3417
   3418static ssize_t nvme_sysfs_show_transport(struct device *dev,
   3419					 struct device_attribute *attr,
   3420					 char *buf)
   3421{
   3422	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3423
   3424	return sysfs_emit(buf, "%s\n", ctrl->ops->name);
   3425}
   3426static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
   3427
   3428static ssize_t nvme_sysfs_show_state(struct device *dev,
   3429				     struct device_attribute *attr,
   3430				     char *buf)
   3431{
   3432	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3433	static const char *const state_name[] = {
   3434		[NVME_CTRL_NEW]		= "new",
   3435		[NVME_CTRL_LIVE]	= "live",
   3436		[NVME_CTRL_RESETTING]	= "resetting",
   3437		[NVME_CTRL_CONNECTING]	= "connecting",
   3438		[NVME_CTRL_DELETING]	= "deleting",
   3439		[NVME_CTRL_DELETING_NOIO]= "deleting (no IO)",
   3440		[NVME_CTRL_DEAD]	= "dead",
   3441	};
   3442
   3443	if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
   3444	    state_name[ctrl->state])
   3445		return sysfs_emit(buf, "%s\n", state_name[ctrl->state]);
   3446
   3447	return sysfs_emit(buf, "unknown state\n");
   3448}
   3449
   3450static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
   3451
   3452static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
   3453					 struct device_attribute *attr,
   3454					 char *buf)
   3455{
   3456	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3457
   3458	return sysfs_emit(buf, "%s\n", ctrl->subsys->subnqn);
   3459}
   3460static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
   3461
   3462static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
   3463					struct device_attribute *attr,
   3464					char *buf)
   3465{
   3466	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3467
   3468	return sysfs_emit(buf, "%s\n", ctrl->opts->host->nqn);
   3469}
   3470static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
   3471
   3472static ssize_t nvme_sysfs_show_hostid(struct device *dev,
   3473					struct device_attribute *attr,
   3474					char *buf)
   3475{
   3476	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3477
   3478	return sysfs_emit(buf, "%pU\n", &ctrl->opts->host->id);
   3479}
   3480static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL);
   3481
   3482static ssize_t nvme_sysfs_show_address(struct device *dev,
   3483					 struct device_attribute *attr,
   3484					 char *buf)
   3485{
   3486	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3487
   3488	return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
   3489}
   3490static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
   3491
   3492static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
   3493		struct device_attribute *attr, char *buf)
   3494{
   3495	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3496	struct nvmf_ctrl_options *opts = ctrl->opts;
   3497
   3498	if (ctrl->opts->max_reconnects == -1)
   3499		return sysfs_emit(buf, "off\n");
   3500	return sysfs_emit(buf, "%d\n",
   3501			  opts->max_reconnects * opts->reconnect_delay);
   3502}
   3503
   3504static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
   3505		struct device_attribute *attr, const char *buf, size_t count)
   3506{
   3507	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3508	struct nvmf_ctrl_options *opts = ctrl->opts;
   3509	int ctrl_loss_tmo, err;
   3510
   3511	err = kstrtoint(buf, 10, &ctrl_loss_tmo);
   3512	if (err)
   3513		return -EINVAL;
   3514
   3515	if (ctrl_loss_tmo < 0)
   3516		opts->max_reconnects = -1;
   3517	else
   3518		opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
   3519						opts->reconnect_delay);
   3520	return count;
   3521}
   3522static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
   3523	nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
   3524
   3525static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
   3526		struct device_attribute *attr, char *buf)
   3527{
   3528	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3529
   3530	if (ctrl->opts->reconnect_delay == -1)
   3531		return sysfs_emit(buf, "off\n");
   3532	return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay);
   3533}
   3534
   3535static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
   3536		struct device_attribute *attr, const char *buf, size_t count)
   3537{
   3538	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3539	unsigned int v;
   3540	int err;
   3541
   3542	err = kstrtou32(buf, 10, &v);
   3543	if (err)
   3544		return err;
   3545
   3546	ctrl->opts->reconnect_delay = v;
   3547	return count;
   3548}
   3549static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
   3550	nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
   3551
   3552static ssize_t nvme_ctrl_fast_io_fail_tmo_show(struct device *dev,
   3553		struct device_attribute *attr, char *buf)
   3554{
   3555	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3556
   3557	if (ctrl->opts->fast_io_fail_tmo == -1)
   3558		return sysfs_emit(buf, "off\n");
   3559	return sysfs_emit(buf, "%d\n", ctrl->opts->fast_io_fail_tmo);
   3560}
   3561
   3562static ssize_t nvme_ctrl_fast_io_fail_tmo_store(struct device *dev,
   3563		struct device_attribute *attr, const char *buf, size_t count)
   3564{
   3565	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3566	struct nvmf_ctrl_options *opts = ctrl->opts;
   3567	int fast_io_fail_tmo, err;
   3568
   3569	err = kstrtoint(buf, 10, &fast_io_fail_tmo);
   3570	if (err)
   3571		return -EINVAL;
   3572
   3573	if (fast_io_fail_tmo < 0)
   3574		opts->fast_io_fail_tmo = -1;
   3575	else
   3576		opts->fast_io_fail_tmo = fast_io_fail_tmo;
   3577	return count;
   3578}
   3579static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR,
   3580	nvme_ctrl_fast_io_fail_tmo_show, nvme_ctrl_fast_io_fail_tmo_store);
   3581
   3582static ssize_t cntrltype_show(struct device *dev,
   3583			      struct device_attribute *attr, char *buf)
   3584{
   3585	static const char * const type[] = {
   3586		[NVME_CTRL_IO] = "io\n",
   3587		[NVME_CTRL_DISC] = "discovery\n",
   3588		[NVME_CTRL_ADMIN] = "admin\n",
   3589	};
   3590	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3591
   3592	if (ctrl->cntrltype > NVME_CTRL_ADMIN || !type[ctrl->cntrltype])
   3593		return sysfs_emit(buf, "reserved\n");
   3594
   3595	return sysfs_emit(buf, type[ctrl->cntrltype]);
   3596}
   3597static DEVICE_ATTR_RO(cntrltype);
   3598
   3599static ssize_t dctype_show(struct device *dev,
   3600			   struct device_attribute *attr, char *buf)
   3601{
   3602	static const char * const type[] = {
   3603		[NVME_DCTYPE_NOT_REPORTED] = "none\n",
   3604		[NVME_DCTYPE_DDC] = "ddc\n",
   3605		[NVME_DCTYPE_CDC] = "cdc\n",
   3606	};
   3607	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3608
   3609	if (ctrl->dctype > NVME_DCTYPE_CDC || !type[ctrl->dctype])
   3610		return sysfs_emit(buf, "reserved\n");
   3611
   3612	return sysfs_emit(buf, type[ctrl->dctype]);
   3613}
   3614static DEVICE_ATTR_RO(dctype);
   3615
   3616static struct attribute *nvme_dev_attrs[] = {
   3617	&dev_attr_reset_controller.attr,
   3618	&dev_attr_rescan_controller.attr,
   3619	&dev_attr_model.attr,
   3620	&dev_attr_serial.attr,
   3621	&dev_attr_firmware_rev.attr,
   3622	&dev_attr_cntlid.attr,
   3623	&dev_attr_delete_controller.attr,
   3624	&dev_attr_transport.attr,
   3625	&dev_attr_subsysnqn.attr,
   3626	&dev_attr_address.attr,
   3627	&dev_attr_state.attr,
   3628	&dev_attr_numa_node.attr,
   3629	&dev_attr_queue_count.attr,
   3630	&dev_attr_sqsize.attr,
   3631	&dev_attr_hostnqn.attr,
   3632	&dev_attr_hostid.attr,
   3633	&dev_attr_ctrl_loss_tmo.attr,
   3634	&dev_attr_reconnect_delay.attr,
   3635	&dev_attr_fast_io_fail_tmo.attr,
   3636	&dev_attr_kato.attr,
   3637	&dev_attr_cntrltype.attr,
   3638	&dev_attr_dctype.attr,
   3639	NULL
   3640};
   3641
   3642static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
   3643		struct attribute *a, int n)
   3644{
   3645	struct device *dev = container_of(kobj, struct device, kobj);
   3646	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
   3647
   3648	if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
   3649		return 0;
   3650	if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
   3651		return 0;
   3652	if (a == &dev_attr_hostnqn.attr && !ctrl->opts)
   3653		return 0;
   3654	if (a == &dev_attr_hostid.attr && !ctrl->opts)
   3655		return 0;
   3656	if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts)
   3657		return 0;
   3658	if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts)
   3659		return 0;
   3660	if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts)
   3661		return 0;
   3662
   3663	return a->mode;
   3664}
   3665
   3666static const struct attribute_group nvme_dev_attrs_group = {
   3667	.attrs		= nvme_dev_attrs,
   3668	.is_visible	= nvme_dev_attrs_are_visible,
   3669};
   3670
   3671static const struct attribute_group *nvme_dev_attr_groups[] = {
   3672	&nvme_dev_attrs_group,
   3673	NULL,
   3674};
   3675
   3676static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
   3677		unsigned nsid)
   3678{
   3679	struct nvme_ns_head *h;
   3680
   3681	lockdep_assert_held(&ctrl->subsys->lock);
   3682
   3683	list_for_each_entry(h, &ctrl->subsys->nsheads, entry) {
   3684		/*
   3685		 * Private namespaces can share NSIDs under some conditions.
   3686		 * In that case we can't use the same ns_head for namespaces
   3687		 * with the same NSID.
   3688		 */
   3689		if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
   3690			continue;
   3691		if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
   3692			return h;
   3693	}
   3694
   3695	return NULL;
   3696}
   3697
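        /*
         * Reject namespace identifiers that collide with an existing ns_head
         * in this subsystem: any matching UUID, NGUID or EUI64 is treated as
         * a duplicate.
         */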
   3698static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
   3699		struct nvme_ns_ids *ids)
   3700{
   3701	bool has_uuid = !uuid_is_null(&ids->uuid);
   3702	bool has_nguid = memchr_inv(ids->nguid, 0, sizeof(ids->nguid));
   3703	bool has_eui64 = memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
   3704	struct nvme_ns_head *h;
   3705
   3706	lockdep_assert_held(&subsys->lock);
   3707
   3708	list_for_each_entry(h, &subsys->nsheads, entry) {
   3709		if (has_uuid && uuid_equal(&ids->uuid, &h->ids.uuid))
   3710			return -EINVAL;
   3711		if (has_nguid &&
   3712		    memcmp(&ids->nguid, &h->ids.nguid, sizeof(ids->nguid)) == 0)
   3713			return -EINVAL;
   3714		if (has_eui64 &&
   3715		    memcmp(&ids->eui64, &h->ids.eui64, sizeof(ids->eui64)) == 0)
   3716			return -EINVAL;
   3717	}
   3718
   3719	return 0;
   3720}
   3721
   3722static void nvme_cdev_rel(struct device *dev)
   3723{
   3724	ida_free(&nvme_ns_chr_minor_ida, MINOR(dev->devt));
   3725}
   3726
   3727void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device)
   3728{
   3729	cdev_device_del(cdev, cdev_device);
   3730	put_device(cdev_device);
   3731}
   3732
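        /*
         * Register a per-namespace character device: allocate a minor from
         * nvme_ns_chr_minor_ida and add the cdev and struct device as one
         * unit.  The minor is returned to the ida by nvme_cdev_rel() when the
         * device is released.
         */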
   3733int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
   3734		const struct file_operations *fops, struct module *owner)
   3735{
   3736	int minor, ret;
   3737
   3738	minor = ida_alloc(&nvme_ns_chr_minor_ida, GFP_KERNEL);
   3739	if (minor < 0)
   3740		return minor;
   3741	cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
   3742	cdev_device->class = nvme_ns_chr_class;
   3743	cdev_device->release = nvme_cdev_rel;
   3744	device_initialize(cdev_device);
   3745	cdev_init(cdev, fops);
   3746	cdev->owner = owner;
   3747	ret = cdev_device_add(cdev, cdev_device);
   3748	if (ret)
   3749		put_device(cdev_device);
   3750
   3751	return ret;
   3752}
   3753
   3754static int nvme_ns_chr_open(struct inode *inode, struct file *file)
   3755{
   3756	return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev));
   3757}
   3758
   3759static int nvme_ns_chr_release(struct inode *inode, struct file *file)
   3760{
   3761	nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev));
   3762	return 0;
   3763}
   3764
   3765static const struct file_operations nvme_ns_chr_fops = {
   3766	.owner		= THIS_MODULE,
   3767	.open		= nvme_ns_chr_open,
   3768	.release	= nvme_ns_chr_release,
   3769	.unlocked_ioctl	= nvme_ns_chr_ioctl,
   3770	.compat_ioctl	= compat_ptr_ioctl,
   3771	.uring_cmd	= nvme_ns_chr_uring_cmd,
   3772};
   3773
   3774static int nvme_add_ns_cdev(struct nvme_ns *ns)
   3775{
   3776	int ret;
   3777
   3778	ns->cdev_device.parent = ns->ctrl->device;
   3779	ret = dev_set_name(&ns->cdev_device, "ng%dn%d",
   3780			   ns->ctrl->instance, ns->head->instance);
   3781	if (ret)
   3782		return ret;
   3783
   3784	return nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops,
   3785			     ns->ctrl->ops->module);
   3786}
   3787
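        /*
         * Allocate and initialise the ns_head shared by all paths to a given
         * NSID: assign an instance from the subsystem ida, copy the
         * identifiers, read the command effects log for the namespace's
         * command set (or inherit the controller's), and set up the multipath
         * disk before linking the head into the subsystem's nsheads list.
         */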
   3788static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
   3789		unsigned nsid, struct nvme_ns_ids *ids)
   3790{
   3791	struct nvme_ns_head *head;
   3792	size_t size = sizeof(*head);
   3793	int ret = -ENOMEM;
   3794
   3795#ifdef CONFIG_NVME_MULTIPATH
   3796	size += num_possible_nodes() * sizeof(struct nvme_ns *);
   3797#endif
   3798
   3799	head = kzalloc(size, GFP_KERNEL);
   3800	if (!head)
   3801		goto out;
   3802	ret = ida_alloc_min(&ctrl->subsys->ns_ida, 1, GFP_KERNEL);
   3803	if (ret < 0)
   3804		goto out_free_head;
   3805	head->instance = ret;
   3806	INIT_LIST_HEAD(&head->list);
   3807	ret = init_srcu_struct(&head->srcu);
   3808	if (ret)
   3809		goto out_ida_remove;
   3810	head->subsys = ctrl->subsys;
   3811	head->ns_id = nsid;
   3812	head->ids = *ids;
   3813	kref_init(&head->ref);
   3814
   3815	if (head->ids.csi) {
   3816		ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
   3817		if (ret)
   3818			goto out_cleanup_srcu;
   3819	} else
   3820		head->effects = ctrl->effects;
   3821
   3822	ret = nvme_mpath_alloc_disk(ctrl, head);
   3823	if (ret)
   3824		goto out_cleanup_srcu;
   3825
   3826	list_add_tail(&head->entry, &ctrl->subsys->nsheads);
   3827
   3828	kref_get(&ctrl->subsys->ref);
   3829
   3830	return head;
   3831out_cleanup_srcu:
   3832	cleanup_srcu_struct(&head->srcu);
   3833out_ida_remove:
   3834	ida_free(&ctrl->subsys->ns_ida, head->instance);
   3835out_free_head:
   3836	kfree(head);
   3837out:
   3838	if (ret > 0)
   3839		ret = blk_status_to_errno(nvme_error_status(ret));
   3840	return ERR_PTR(ret);
   3841}
   3842
   3843static int nvme_global_check_duplicate_ids(struct nvme_subsystem *this,
   3844		struct nvme_ns_ids *ids)
   3845{
   3846	struct nvme_subsystem *s;
   3847	int ret = 0;
   3848
   3849	/*
   3850	 * Note that this check is racy as we try to avoid holding the global
   3851	 * lock over the whole ns_head creation.  But it is only intended as
   3852	 * a sanity check anyway.
   3853	 */
   3854	mutex_lock(&nvme_subsystems_lock);
   3855	list_for_each_entry(s, &nvme_subsystems, entry) {
   3856		if (s == this)
   3857			continue;
   3858		mutex_lock(&s->lock);
   3859		ret = nvme_subsys_check_duplicate_ids(s, ids);
   3860		mutex_unlock(&s->lock);
   3861		if (ret)
   3862			break;
   3863	}
   3864	mutex_unlock(&nvme_subsystems_lock);
   3865
   3866	return ret;
   3867}
   3868
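        /*
         * Bind a namespace to its ns_head: after checking that the
         * identifiers are not duplicated across subsystems, either reuse an
         * existing head with the same NSID (shared namespaces only, and only
         * if the identifiers match) or allocate a fresh one, then add the
         * namespace to the head's sibling list.
         */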
   3869static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
   3870		struct nvme_ns_ids *ids, bool is_shared)
   3871{
   3872	struct nvme_ctrl *ctrl = ns->ctrl;
   3873	struct nvme_ns_head *head = NULL;
   3874	int ret;
   3875
   3876	ret = nvme_global_check_duplicate_ids(ctrl->subsys, ids);
   3877	if (ret) {
   3878		dev_err(ctrl->device,
   3879			"globally duplicate IDs for nsid %d\n", nsid);
   3880		nvme_print_device_info(ctrl);
   3881		return ret;
   3882	}
   3883
   3884	mutex_lock(&ctrl->subsys->lock);
   3885	head = nvme_find_ns_head(ctrl, nsid);
   3886	if (!head) {
   3887		ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, ids);
   3888		if (ret) {
   3889			dev_err(ctrl->device,
   3890				"duplicate IDs in subsystem for nsid %d\n",
   3891				nsid);
   3892			goto out_unlock;
   3893		}
   3894		head = nvme_alloc_ns_head(ctrl, nsid, ids);
   3895		if (IS_ERR(head)) {
   3896			ret = PTR_ERR(head);
   3897			goto out_unlock;
   3898		}
   3899		head->shared = is_shared;
   3900	} else {
   3901		ret = -EINVAL;
   3902		if (!is_shared || !head->shared) {
   3903			dev_err(ctrl->device,
   3904				"Duplicate unshared namespace %d\n", nsid);
   3905			goto out_put_ns_head;
   3906		}
   3907		if (!nvme_ns_ids_equal(&head->ids, ids)) {
   3908			dev_err(ctrl->device,
   3909				"IDs don't match for shared namespace %d\n",
   3910					nsid);
   3911			goto out_put_ns_head;
   3912		}
   3913
   3914		if (!multipath && !list_empty(&head->list)) {
   3915			dev_warn(ctrl->device,
   3916				"Found shared namespace %d, but multipathing not supported.\n",
   3917				nsid);
   3918			dev_warn_once(ctrl->device,
    3919				"Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n");
   3920		}
   3921	}
   3922
   3923	list_add_tail_rcu(&ns->siblings, &head->list);
   3924	ns->head = head;
   3925	mutex_unlock(&ctrl->subsys->lock);
   3926	return 0;
   3927
   3928out_put_ns_head:
   3929	nvme_put_ns_head(head);
   3930out_unlock:
   3931	mutex_unlock(&ctrl->subsys->lock);
   3932	return ret;
   3933}
   3934
   3935struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
   3936{
   3937	struct nvme_ns *ns, *ret = NULL;
   3938
   3939	down_read(&ctrl->namespaces_rwsem);
   3940	list_for_each_entry(ns, &ctrl->namespaces, list) {
   3941		if (ns->head->ns_id == nsid) {
   3942			if (!nvme_get_ns(ns))
   3943				continue;
   3944			ret = ns;
   3945			break;
   3946		}
   3947		if (ns->head->ns_id > nsid)
   3948			break;
   3949	}
   3950	up_read(&ctrl->namespaces_rwsem);
   3951	return ret;
   3952}
   3953EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
   3954
   3955/*
   3956 * Add the namespace to the controller list while keeping the list ordered.
   3957 */
   3958static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
   3959{
   3960	struct nvme_ns *tmp;
   3961
   3962	list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
   3963		if (tmp->head->ns_id < ns->head->ns_id) {
   3964			list_add(&ns->list, &tmp->list);
   3965			return;
   3966		}
   3967	}
   3968	list_add(&ns->list, &ns->ctrl->namespaces);
   3969}
   3970
   3971static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
   3972		struct nvme_ns_ids *ids)
   3973{
   3974	struct nvme_ns *ns;
   3975	struct gendisk *disk;
   3976	struct nvme_id_ns *id;
   3977	int node = ctrl->numa_node;
   3978
   3979	if (nvme_identify_ns(ctrl, nsid, ids, &id))
   3980		return;
   3981
   3982	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
   3983	if (!ns)
   3984		goto out_free_id;
   3985
   3986	disk = blk_mq_alloc_disk(ctrl->tagset, ns);
   3987	if (IS_ERR(disk))
   3988		goto out_free_ns;
   3989	disk->fops = &nvme_bdev_ops;
   3990	disk->private_data = ns;
   3991
   3992	ns->disk = disk;
   3993	ns->queue = disk->queue;
   3994
   3995	if (ctrl->opts && ctrl->opts->data_digest)
   3996		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
   3997
   3998	blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
   3999	if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
   4000		blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
   4001
   4002	ns->ctrl = ctrl;
   4003	kref_init(&ns->kref);
   4004
   4005	if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED))
   4006		goto out_cleanup_disk;
   4007
   4008	/*
    4009	 * If multipathing is enabled, the device name for all disks, and not
    4010	 * just those that represent shared namespaces, needs to be based on the
   4011	 * subsystem instance.  Using the controller instance for private
   4012	 * namespaces could lead to naming collisions between shared and private
   4013	 * namespaces if they don't use a common numbering scheme.
   4014	 *
   4015	 * If multipathing is not enabled, disk names must use the controller
   4016	 * instance as shared namespaces will show up as multiple block
   4017	 * devices.
   4018	 */
   4019	if (ns->head->disk) {
   4020		sprintf(disk->disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
   4021			ctrl->instance, ns->head->instance);
   4022		disk->flags |= GENHD_FL_HIDDEN;
   4023	} else if (multipath) {
   4024		sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance,
   4025			ns->head->instance);
   4026	} else {
   4027		sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance,
   4028			ns->head->instance);
   4029	}
   4030
   4031	if (nvme_update_ns_info(ns, id))
   4032		goto out_unlink_ns;
   4033
   4034	down_write(&ctrl->namespaces_rwsem);
   4035	nvme_ns_add_to_ctrl_list(ns);
   4036	up_write(&ctrl->namespaces_rwsem);
   4037	nvme_get_ctrl(ctrl);
   4038
   4039	if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
   4040		goto out_cleanup_ns_from_list;
   4041
   4042	if (!nvme_ns_head_multipath(ns->head))
   4043		nvme_add_ns_cdev(ns);
   4044
   4045	nvme_mpath_add_disk(ns, id);
   4046	nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
   4047	kfree(id);
   4048
   4049	return;
   4050
   4051 out_cleanup_ns_from_list:
   4052	nvme_put_ctrl(ctrl);
   4053	down_write(&ctrl->namespaces_rwsem);
   4054	list_del_init(&ns->list);
   4055	up_write(&ctrl->namespaces_rwsem);
   4056 out_unlink_ns:
   4057	mutex_lock(&ctrl->subsys->lock);
   4058	list_del_rcu(&ns->siblings);
   4059	if (list_empty(&ns->head->list))
   4060		list_del_init(&ns->head->entry);
   4061	mutex_unlock(&ctrl->subsys->lock);
   4062	nvme_put_ns_head(ns->head);
   4063 out_cleanup_disk:
   4064	blk_cleanup_disk(disk);
   4065 out_free_ns:
   4066	kfree(ns);
   4067 out_free_id:
   4068	kfree(id);
   4069}
   4070
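        /*
         * Tear down a namespace: mark it as going away, make sure no path
         * selection or in-flight submission still uses it, unlink it from the
         * ns_head and the controller's namespace list, and remove the
         * character device and gendisk before dropping the final reference.
         */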
   4071static void nvme_ns_remove(struct nvme_ns *ns)
   4072{
   4073	bool last_path = false;
   4074
   4075	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
   4076		return;
   4077
   4078	clear_bit(NVME_NS_READY, &ns->flags);
   4079	set_capacity(ns->disk, 0);
   4080	nvme_fault_inject_fini(&ns->fault_inject);
   4081
   4082	/*
   4083	 * Ensure that !NVME_NS_READY is seen by other threads to prevent
    4084	 * this ns from going back into current_path.
   4085	 */
   4086	synchronize_srcu(&ns->head->srcu);
   4087
   4088	/* wait for concurrent submissions */
   4089	if (nvme_mpath_clear_current_path(ns))
   4090		synchronize_srcu(&ns->head->srcu);
   4091
   4092	mutex_lock(&ns->ctrl->subsys->lock);
   4093	list_del_rcu(&ns->siblings);
   4094	if (list_empty(&ns->head->list)) {
   4095		list_del_init(&ns->head->entry);
   4096		last_path = true;
   4097	}
   4098	mutex_unlock(&ns->ctrl->subsys->lock);
   4099
   4100	/* guarantee not available in head->list */
   4101	synchronize_rcu();
   4102
   4103	if (!nvme_ns_head_multipath(ns->head))
   4104		nvme_cdev_del(&ns->cdev, &ns->cdev_device);
   4105	del_gendisk(ns->disk);
   4106	blk_cleanup_queue(ns->queue);
   4107
   4108	down_write(&ns->ctrl->namespaces_rwsem);
   4109	list_del_init(&ns->list);
   4110	up_write(&ns->ctrl->namespaces_rwsem);
   4111
   4112	if (last_path)
   4113		nvme_mpath_shutdown_disk(ns->head);
   4114	nvme_put_ns(ns);
   4115}
   4116
   4117static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
   4118{
   4119	struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);
   4120
   4121	if (ns) {
   4122		nvme_ns_remove(ns);
   4123		nvme_put_ns(ns);
   4124	}
   4125}
   4126
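        /*
         * Re-identify a namespace that is already known after a rescan.  If
         * the identifiers changed or the controller returned a status with
         * DNR set, the namespace is removed; otherwise nvme_update_ns_info()
         * refreshes it and non-fatal errors are ignored.
         */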
   4127static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
   4128{
   4129	struct nvme_id_ns *id;
   4130	int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
   4131
   4132	if (test_bit(NVME_NS_DEAD, &ns->flags))
   4133		goto out;
   4134
   4135	ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id);
   4136	if (ret)
   4137		goto out;
   4138
   4139	ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
   4140	if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
   4141		dev_err(ns->ctrl->device,
   4142			"identifiers changed for nsid %d\n", ns->head->ns_id);
   4143		goto out_free_id;
   4144	}
   4145
   4146	ret = nvme_update_ns_info(ns, id);
   4147
   4148out_free_id:
   4149	kfree(id);
   4150out:
   4151	/*
    4152	 * Only remove the namespace if we got a fatal error back from the
    4153	 * device; otherwise ignore the error and just move on.
   4154	 *
   4155	 * TODO: we should probably schedule a delayed retry here.
   4156	 */
   4157	if (ret > 0 && (ret & NVME_SC_DNR))
   4158		nvme_ns_remove(ns);
   4159}
   4160
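        /*
         * Handle one NSID reported by the scan: fetch its identifiers, skip
         * it if the namespace is not ready yet (an AEN will restart the scan
         * once it is), and then either revalidate the existing namespace or
         * allocate a new one for a supported command set (NVM, or ZNS when
         * zoned block device support is enabled).
         */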
   4161static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
   4162{
   4163	struct nvme_ns_ids ids = { };
   4164	struct nvme_id_ns_cs_indep *id;
   4165	struct nvme_ns *ns;
   4166	bool ready = true;
   4167
   4168	if (nvme_identify_ns_descs(ctrl, nsid, &ids))
   4169		return;
   4170
   4171	/*
   4172	 * Check if the namespace is ready.  If not ignore it, we will get an
   4173	 * AEN once it becomes ready and restart the scan.
   4174	 */
   4175	if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) &&
   4176	    !nvme_identify_ns_cs_indep(ctrl, nsid, &id)) {
   4177		ready = id->nstat & NVME_NSTAT_NRDY;
   4178		kfree(id);
   4179	}
   4180
   4181	if (!ready)
   4182		return;
   4183
   4184	ns = nvme_find_get_ns(ctrl, nsid);
   4185	if (ns) {
   4186		nvme_validate_ns(ns, &ids);
   4187		nvme_put_ns(ns);
   4188		return;
   4189	}
   4190
   4191	switch (ids.csi) {
   4192	case NVME_CSI_NVM:
   4193		nvme_alloc_ns(ctrl, nsid, &ids);
   4194		break;
   4195	case NVME_CSI_ZNS:
   4196		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
   4197			dev_warn(ctrl->device,
   4198				"nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
   4199				nsid);
   4200			break;
   4201		}
   4202		if (!nvme_multi_css(ctrl)) {
   4203			dev_warn(ctrl->device,
   4204				"command set not reported for nsid: %d\n",
   4205				nsid);
   4206			break;
   4207		}
   4208		nvme_alloc_ns(ctrl, nsid, &ids);
   4209		break;
   4210	default:
   4211		dev_warn(ctrl->device, "unknown csi %u for nsid %u\n",
   4212			ids.csi, nsid);
   4213		break;
   4214	}
   4215}
   4216
   4217static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
   4218					unsigned nsid)
   4219{
   4220	struct nvme_ns *ns, *next;
   4221	LIST_HEAD(rm_list);
   4222
   4223	down_write(&ctrl->namespaces_rwsem);
   4224	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
   4225		if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
   4226			list_move_tail(&ns->list, &rm_list);
   4227	}
   4228	up_write(&ctrl->namespaces_rwsem);
   4229
   4230	list_for_each_entry_safe(ns, next, &rm_list, list)
   4231		nvme_ns_remove(ns);
   4233}
   4234
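        /*
         * Scan namespaces using the Identify Active Namespace ID list.  NSIDs
         * that fall into the gaps between reported entries have gone away and
         * are removed.  Returns -EOPNOTSUPP for controllers that only support
         * a limited set of CNS values so the caller can fall back to a
         * sequential scan.
         */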
   4235static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
   4236{
   4237	const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
   4238	__le32 *ns_list;
   4239	u32 prev = 0;
   4240	int ret = 0, i;
   4241
   4242	if (nvme_ctrl_limited_cns(ctrl))
   4243		return -EOPNOTSUPP;
   4244
   4245	ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
   4246	if (!ns_list)
   4247		return -ENOMEM;
   4248
   4249	for (;;) {
   4250		struct nvme_command cmd = {
   4251			.identify.opcode	= nvme_admin_identify,
   4252			.identify.cns		= NVME_ID_CNS_NS_ACTIVE_LIST,
   4253			.identify.nsid		= cpu_to_le32(prev),
   4254		};
   4255
   4256		ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
   4257					    NVME_IDENTIFY_DATA_SIZE);
   4258		if (ret) {
   4259			dev_warn(ctrl->device,
   4260				"Identify NS List failed (status=0x%x)\n", ret);
   4261			goto free;
   4262		}
   4263
   4264		for (i = 0; i < nr_entries; i++) {
   4265			u32 nsid = le32_to_cpu(ns_list[i]);
   4266
   4267			if (!nsid)	/* end of the list? */
   4268				goto out;
   4269			nvme_validate_or_alloc_ns(ctrl, nsid);
   4270			while (++prev < nsid)
   4271				nvme_ns_remove_by_nsid(ctrl, prev);
   4272		}
   4273	}
   4274 out:
   4275	nvme_remove_invalid_namespaces(ctrl, prev);
   4276 free:
   4277	kfree(ns_list);
   4278	return ret;
   4279}
   4280
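        /*
         * Fallback scan for controllers that cannot report an active
         * namespace list: walk every NSID from 1 up to the NN value reported
         * by Identify Controller.
         */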
   4281static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
   4282{
   4283	struct nvme_id_ctrl *id;
   4284	u32 nn, i;
   4285
   4286	if (nvme_identify_ctrl(ctrl, &id))
   4287		return;
   4288	nn = le32_to_cpu(id->nn);
   4289	kfree(id);
   4290
   4291	for (i = 1; i <= nn; i++)
   4292		nvme_validate_or_alloc_ns(ctrl, i);
   4293
   4294	nvme_remove_invalid_namespaces(ctrl, nn);
   4295}
   4296
   4297static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
   4298{
   4299	size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
   4300	__le32 *log;
   4301	int error;
   4302
   4303	log = kzalloc(log_size, GFP_KERNEL);
   4304	if (!log)
   4305		return;
   4306
   4307	/*
   4308	 * We need to read the log to clear the AEN, but we don't want to rely
   4309	 * on it for the changed namespace information as userspace could have
   4310	 * raced with us in reading the log page, which could cause us to miss
   4311	 * updates.
   4312	 */
   4313	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
   4314			NVME_CSI_NVM, log, log_size, 0);
   4315	if (error)
   4316		dev_warn(ctrl->device,
   4317			"reading changed ns log failed: %d\n", error);
   4318
   4319	kfree(log);
   4320}
   4321
   4322static void nvme_scan_work(struct work_struct *work)
   4323{
   4324	struct nvme_ctrl *ctrl =
   4325		container_of(work, struct nvme_ctrl, scan_work);
   4326	int ret;
   4327
    4328	/* No tagset on a live ctrl means IO queues could not be created */
   4329	if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
   4330		return;
   4331
   4332	/*
    4333	 * Identify controller limits can change at controller reset due to a
    4334	 * new firmware download; even though it is not common, we cannot ignore
    4335	 * such a scenario. The controller's non-MDTS limits are reported in
    4336	 * units of logical blocks, which depend on the format of the attached
    4337	 * namespace. Hence re-read the limits at the time of ns allocation.
   4338	 */
   4339	ret = nvme_init_non_mdts_limits(ctrl);
   4340	if (ret < 0) {
   4341		dev_warn(ctrl->device,
   4342			"reading non-mdts-limits failed: %d\n", ret);
   4343		return;
   4344	}
   4345
   4346	if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
   4347		dev_info(ctrl->device, "rescanning namespaces.\n");
   4348		nvme_clear_changed_ns_log(ctrl);
   4349	}
   4350
   4351	mutex_lock(&ctrl->scan_lock);
   4352	if (nvme_scan_ns_list(ctrl) != 0)
   4353		nvme_scan_ns_sequential(ctrl);
   4354	mutex_unlock(&ctrl->scan_lock);
   4355}
   4356
   4357/*
   4358 * This function iterates the namespace list unlocked to allow recovery from
   4359 * controller failure. It is up to the caller to ensure the namespace list is
   4360 * not modified by scan work while this function is executing.
   4361 */
   4362void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
   4363{
   4364	struct nvme_ns *ns, *next;
   4365	LIST_HEAD(ns_list);
   4366
   4367	/*
    4368	 * Make sure to requeue I/O to all namespaces, as these requests
    4369	 * might result from the scan itself and must complete
    4370	 * for the scan_work to make progress.
   4371	 */
   4372	nvme_mpath_clear_ctrl_paths(ctrl);
   4373
   4374	/* prevent racing with ns scanning */
   4375	flush_work(&ctrl->scan_work);
   4376
   4377	/*
    4378	 * The dead state indicates the controller was not gracefully
   4379	 * disconnected. In that case, we won't be able to flush any data while
   4380	 * removing the namespaces' disks; fail all the queues now to avoid
   4381	 * potentially having to clean up the failed sync later.
   4382	 */
   4383	if (ctrl->state == NVME_CTRL_DEAD)
   4384		nvme_kill_queues(ctrl);
   4385
   4386	/* this is a no-op when called from the controller reset handler */
   4387	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
   4388
   4389	down_write(&ctrl->namespaces_rwsem);
   4390	list_splice_init(&ctrl->namespaces, &ns_list);
   4391	up_write(&ctrl->namespaces_rwsem);
   4392
   4393	list_for_each_entry_safe(ns, next, &ns_list, list)
   4394		nvme_ns_remove(ns);
   4395}
   4396EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
   4397
   4398static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
   4399{
   4400	struct nvme_ctrl *ctrl =
   4401		container_of(dev, struct nvme_ctrl, ctrl_device);
   4402	struct nvmf_ctrl_options *opts = ctrl->opts;
   4403	int ret;
   4404
   4405	ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
   4406	if (ret)
   4407		return ret;
   4408
   4409	if (opts) {
   4410		ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
   4411		if (ret)
   4412			return ret;
   4413
   4414		ret = add_uevent_var(env, "NVME_TRSVCID=%s",
   4415				opts->trsvcid ?: "none");
   4416		if (ret)
   4417			return ret;
   4418
   4419		ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
   4420				opts->host_traddr ?: "none");
   4421		if (ret)
   4422			return ret;
   4423
   4424		ret = add_uevent_var(env, "NVME_HOST_IFACE=%s",
   4425				opts->host_iface ?: "none");
   4426	}
   4427	return ret;
   4428}
   4429
   4430static void nvme_change_uevent(struct nvme_ctrl *ctrl, char *envdata)
   4431{
   4432	char *envp[2] = { envdata, NULL };
   4433
   4434	kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
   4435}
   4436
   4437static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
   4438{
   4439	char *envp[2] = { NULL, NULL };
   4440	u32 aen_result = ctrl->aen_result;
   4441
   4442	ctrl->aen_result = 0;
   4443	if (!aen_result)
   4444		return;
   4445
   4446	envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
   4447	if (!envp[0])
   4448		return;
   4449	kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
   4450	kfree(envp[0]);
   4451}
   4452
   4453static void nvme_async_event_work(struct work_struct *work)
   4454{
   4455	struct nvme_ctrl *ctrl =
   4456		container_of(work, struct nvme_ctrl, async_event_work);
   4457
   4458	nvme_aen_uevent(ctrl);
   4459
   4460	/*
   4461	 * The transport drivers must guarantee AER submission here is safe by
   4462	 * flushing ctrl async_event_work after changing the controller state
   4463	 * from LIVE and before freeing the admin queue.
    4464	 */
   4465	if (ctrl->state == NVME_CTRL_LIVE)
   4466		ctrl->ops->submit_async_event(ctrl);
   4467}
   4468
   4469static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
   4470{
   4472	u32 csts;
   4473
   4474	if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
   4475		return false;
   4476
   4477	if (csts == ~0)
   4478		return false;
   4479
   4480	return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
   4481}
   4482
   4483static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
   4484{
   4485	struct nvme_fw_slot_info_log *log;
   4486
   4487	log = kmalloc(sizeof(*log), GFP_KERNEL);
   4488	if (!log)
   4489		return;
   4490
   4491	if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
   4492			log, sizeof(*log), 0))
   4493		dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
   4494	kfree(log);
   4495}
   4496
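        /*
         * Firmware activation handler: quiesce the I/O queues and poll
         * CSTS.PP until the controller finishes activating the new image
         * (bounded by MTFA when reported, in 100 ms units, otherwise by the
         * admin timeout), then restart the queues, or schedule a controller
         * reset if the wait timed out.
         */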
   4497static void nvme_fw_act_work(struct work_struct *work)
   4498{
   4499	struct nvme_ctrl *ctrl = container_of(work,
   4500				struct nvme_ctrl, fw_act_work);
   4501	unsigned long fw_act_timeout;
   4502
   4503	if (ctrl->mtfa)
   4504		fw_act_timeout = jiffies +
   4505				msecs_to_jiffies(ctrl->mtfa * 100);
   4506	else
   4507		fw_act_timeout = jiffies +
   4508				msecs_to_jiffies(admin_timeout * 1000);
   4509
   4510	nvme_stop_queues(ctrl);
   4511	while (nvme_ctrl_pp_status(ctrl)) {
   4512		if (time_after(jiffies, fw_act_timeout)) {
   4513			dev_warn(ctrl->device,
   4514				"Fw activation timeout, reset controller\n");
   4515			nvme_try_sched_reset(ctrl);
   4516			return;
   4517		}
   4518		msleep(100);
   4519	}
   4520
   4521	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
   4522		return;
   4523
   4524	nvme_start_queues(ctrl);
   4525	/* read FW slot information to clear the AER */
   4526	nvme_get_fw_slot_info(ctrl);
   4527}
   4528
   4529static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
   4530{
   4531	u32 aer_notice_type = (result & 0xff00) >> 8;
   4532
   4533	trace_nvme_async_event(ctrl, aer_notice_type);
   4534
   4535	switch (aer_notice_type) {
   4536	case NVME_AER_NOTICE_NS_CHANGED:
   4537		set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
   4538		nvme_queue_scan(ctrl);
   4539		break;
   4540	case NVME_AER_NOTICE_FW_ACT_STARTING:
   4541		/*
   4542		 * We are (ab)using the RESETTING state to prevent subsequent
   4543		 * recovery actions from interfering with the controller's
   4544		 * firmware activation.
   4545		 */
   4546		if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
   4547			queue_work(nvme_wq, &ctrl->fw_act_work);
   4548		break;
   4549#ifdef CONFIG_NVME_MULTIPATH
   4550	case NVME_AER_NOTICE_ANA:
   4551		if (!ctrl->ana_log_buf)
   4552			break;
   4553		queue_work(nvme_wq, &ctrl->ana_work);
   4554		break;
   4555#endif
   4556	case NVME_AER_NOTICE_DISC_CHANGED:
   4557		ctrl->aen_result = result;
   4558		break;
   4559	default:
   4560		dev_warn(ctrl->device, "async event result %08x\n", result);
   4561	}
   4562}
   4563
   4564void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
   4565		volatile union nvme_result *res)
   4566{
   4567	u32 result = le32_to_cpu(res->u32);
   4568	u32 aer_type = result & 0x07;
   4569
   4570	if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
   4571		return;
   4572
   4573	switch (aer_type) {
   4574	case NVME_AER_NOTICE:
   4575		nvme_handle_aen_notice(ctrl, result);
   4576		break;
   4577	case NVME_AER_ERROR:
   4578	case NVME_AER_SMART:
   4579	case NVME_AER_CSS:
   4580	case NVME_AER_VS:
   4581		trace_nvme_async_event(ctrl, aer_type);
   4582		ctrl->aen_result = result;
   4583		break;
   4584	default:
   4585		break;
   4586	}
   4587	queue_work(nvme_wq, &ctrl->async_event_work);
   4588}
   4589EXPORT_SYMBOL_GPL(nvme_complete_async_event);
   4590
   4591void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
   4592{
   4593	nvme_mpath_stop(ctrl);
   4594	nvme_stop_keep_alive(ctrl);
   4595	nvme_stop_failfast_work(ctrl);
   4596	flush_work(&ctrl->async_event_work);
   4597	cancel_work_sync(&ctrl->fw_act_work);
   4598	if (ctrl->ops->stop_ctrl)
   4599		ctrl->ops->stop_ctrl(ctrl);
   4600}
   4601EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
   4602
   4603void nvme_start_ctrl(struct nvme_ctrl *ctrl)
   4604{
   4605	nvme_start_keep_alive(ctrl);
   4606
   4607	nvme_enable_aen(ctrl);
   4608
   4609	if (ctrl->queue_count > 1) {
   4610		nvme_queue_scan(ctrl);
   4611		nvme_start_queues(ctrl);
   4612		nvme_mpath_update(ctrl);
   4613	}
   4614
   4615	nvme_change_uevent(ctrl, "NVME_EVENT=connected");
   4616}
   4617EXPORT_SYMBOL_GPL(nvme_start_ctrl);
   4618
   4619void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
   4620{
   4621	nvme_hwmon_exit(ctrl);
   4622	nvme_fault_inject_fini(&ctrl->fault_inject);
   4623	dev_pm_qos_hide_latency_tolerance(ctrl->device);
   4624	cdev_device_del(&ctrl->cdev, ctrl->device);
   4625	nvme_put_ctrl(ctrl);
   4626}
   4627EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
   4628
   4629static void nvme_free_cels(struct nvme_ctrl *ctrl)
   4630{
   4631	struct nvme_effects_log	*cel;
   4632	unsigned long i;
   4633
   4634	xa_for_each(&ctrl->cels, i, cel) {
   4635		xa_erase(&ctrl->cels, i);
   4636		kfree(cel);
   4637	}
   4638
   4639	xa_destroy(&ctrl->cels);
   4640}
   4641
   4642static void nvme_free_ctrl(struct device *dev)
   4643{
   4644	struct nvme_ctrl *ctrl =
   4645		container_of(dev, struct nvme_ctrl, ctrl_device);
   4646	struct nvme_subsystem *subsys = ctrl->subsys;
   4647
   4648	if (!subsys || ctrl->instance != subsys->instance)
   4649		ida_free(&nvme_instance_ida, ctrl->instance);
   4650
   4651	nvme_free_cels(ctrl);
   4652	nvme_mpath_uninit(ctrl);
   4653	__free_page(ctrl->discard_page);
   4654
   4655	if (subsys) {
   4656		mutex_lock(&nvme_subsystems_lock);
   4657		list_del(&ctrl->subsys_entry);
   4658		sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
   4659		mutex_unlock(&nvme_subsystems_lock);
   4660	}
   4661
   4662	ctrl->ops->free_ctrl(ctrl);
   4663
   4664	if (subsys)
   4665		nvme_put_subsystem(subsys);
   4666}
   4667
   4668/*
    4669 * Initialize an NVMe controller structure.  This needs to be called during
    4670 * earliest initialization so that we have the initialized structure around
    4671 * during probing.
   4672 */
   4673int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
   4674		const struct nvme_ctrl_ops *ops, unsigned long quirks)
   4675{
   4676	int ret;
   4677
   4678	ctrl->state = NVME_CTRL_NEW;
   4679	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
   4680	spin_lock_init(&ctrl->lock);
   4681	mutex_init(&ctrl->scan_lock);
   4682	INIT_LIST_HEAD(&ctrl->namespaces);
   4683	xa_init(&ctrl->cels);
   4684	init_rwsem(&ctrl->namespaces_rwsem);
   4685	ctrl->dev = dev;
   4686	ctrl->ops = ops;
   4687	ctrl->quirks = quirks;
   4688	ctrl->numa_node = NUMA_NO_NODE;
   4689	INIT_WORK(&ctrl->scan_work, nvme_scan_work);
   4690	INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
   4691	INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
   4692	INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
   4693	init_waitqueue_head(&ctrl->state_wq);
   4694
   4695	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
   4696	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
   4697	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
   4698	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
   4699
   4700	BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
   4701			PAGE_SIZE);
   4702	ctrl->discard_page = alloc_page(GFP_KERNEL);
   4703	if (!ctrl->discard_page) {
   4704		ret = -ENOMEM;
   4705		goto out;
   4706	}
   4707
   4708	ret = ida_alloc(&nvme_instance_ida, GFP_KERNEL);
   4709	if (ret < 0)
   4710		goto out;
   4711	ctrl->instance = ret;
   4712
   4713	device_initialize(&ctrl->ctrl_device);
   4714	ctrl->device = &ctrl->ctrl_device;
   4715	ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
   4716			ctrl->instance);
   4717	ctrl->device->class = nvme_class;
   4718	ctrl->device->parent = ctrl->dev;
   4719	ctrl->device->groups = nvme_dev_attr_groups;
   4720	ctrl->device->release = nvme_free_ctrl;
   4721	dev_set_drvdata(ctrl->device, ctrl);
   4722	ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
   4723	if (ret)
   4724		goto out_release_instance;
   4725
   4726	nvme_get_ctrl(ctrl);
   4727	cdev_init(&ctrl->cdev, &nvme_dev_fops);
   4728	ctrl->cdev.owner = ops->module;
   4729	ret = cdev_device_add(&ctrl->cdev, ctrl->device);
   4730	if (ret)
   4731		goto out_free_name;
   4732
   4733	/*
   4734	 * Initialize latency tolerance controls.  The sysfs files won't
   4735	 * be visible to userspace unless the device actually supports APST.
   4736	 */
   4737	ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
   4738	dev_pm_qos_update_user_latency_tolerance(ctrl->device,
   4739		min(default_ps_max_latency_us, (unsigned long)S32_MAX));
   4740
   4741	nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
   4742	nvme_mpath_init_ctrl(ctrl);
   4743
   4744	return 0;
   4745out_free_name:
   4746	nvme_put_ctrl(ctrl);
   4747	kfree_const(ctrl->device->kobj.name);
   4748out_release_instance:
   4749	ida_free(&nvme_instance_ida, ctrl->instance);
   4750out:
   4751	if (ctrl->discard_page)
   4752		__free_page(ctrl->discard_page);
   4753	return ret;
   4754}
   4755EXPORT_SYMBOL_GPL(nvme_init_ctrl);
   4756
   4757static void nvme_start_ns_queue(struct nvme_ns *ns)
   4758{
   4759	if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
   4760		blk_mq_unquiesce_queue(ns->queue);
   4761}
   4762
   4763static void nvme_stop_ns_queue(struct nvme_ns *ns)
   4764{
   4765	if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
   4766		blk_mq_quiesce_queue(ns->queue);
   4767	else
   4768		blk_mq_wait_quiesce_done(ns->queue);
   4769}
   4770
   4771/*
   4772 * Prepare a queue for teardown.
   4773 *
   4774 * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
   4775 * the capacity to 0 after that to avoid blocking dispatchers that may be
    4776 * holding bd_mutex.  This will end buffered writers dirtying pages that can't
   4777 * be synced.
   4778 */
   4779static void nvme_set_queue_dying(struct nvme_ns *ns)
   4780{
   4781	if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
   4782		return;
   4783
   4784	blk_mark_disk_dead(ns->disk);
   4785	nvme_start_ns_queue(ns);
   4786
   4787	set_capacity_and_notify(ns->disk, 0);
   4788}
   4789
   4790/**
   4791 * nvme_kill_queues(): Ends all namespace queues
   4792 * @ctrl: the dead controller that needs to end
   4793 *
   4794 * Call this function when the driver determines it is unable to get the
   4795 * controller in a state capable of servicing IO.
   4796 */
   4797void nvme_kill_queues(struct nvme_ctrl *ctrl)
   4798{
   4799	struct nvme_ns *ns;
   4800
   4801	down_read(&ctrl->namespaces_rwsem);
   4802
   4803	/* Forcibly unquiesce queues to avoid blocking dispatch */
   4804	if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
   4805		nvme_start_admin_queue(ctrl);
   4806
   4807	list_for_each_entry(ns, &ctrl->namespaces, list)
   4808		nvme_set_queue_dying(ns);
   4809
   4810	up_read(&ctrl->namespaces_rwsem);
   4811}
   4812EXPORT_SYMBOL_GPL(nvme_kill_queues);
   4813
   4814void nvme_unfreeze(struct nvme_ctrl *ctrl)
   4815{
   4816	struct nvme_ns *ns;
   4817
   4818	down_read(&ctrl->namespaces_rwsem);
   4819	list_for_each_entry(ns, &ctrl->namespaces, list)
   4820		blk_mq_unfreeze_queue(ns->queue);
   4821	up_read(&ctrl->namespaces_rwsem);
   4822}
   4823EXPORT_SYMBOL_GPL(nvme_unfreeze);
   4824
   4825int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
   4826{
   4827	struct nvme_ns *ns;
   4828
   4829	down_read(&ctrl->namespaces_rwsem);
   4830	list_for_each_entry(ns, &ctrl->namespaces, list) {
   4831		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
   4832		if (timeout <= 0)
   4833			break;
   4834	}
   4835	up_read(&ctrl->namespaces_rwsem);
   4836	return timeout;
   4837}
   4838EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
   4839
   4840void nvme_wait_freeze(struct nvme_ctrl *ctrl)
   4841{
   4842	struct nvme_ns *ns;
   4843
   4844	down_read(&ctrl->namespaces_rwsem);
   4845	list_for_each_entry(ns, &ctrl->namespaces, list)
   4846		blk_mq_freeze_queue_wait(ns->queue);
   4847	up_read(&ctrl->namespaces_rwsem);
   4848}
   4849EXPORT_SYMBOL_GPL(nvme_wait_freeze);
   4850
   4851void nvme_start_freeze(struct nvme_ctrl *ctrl)
   4852{
   4853	struct nvme_ns *ns;
   4854
   4855	down_read(&ctrl->namespaces_rwsem);
   4856	list_for_each_entry(ns, &ctrl->namespaces, list)
   4857		blk_freeze_queue_start(ns->queue);
   4858	up_read(&ctrl->namespaces_rwsem);
   4859}
   4860EXPORT_SYMBOL_GPL(nvme_start_freeze);
   4861
   4862void nvme_stop_queues(struct nvme_ctrl *ctrl)
   4863{
   4864	struct nvme_ns *ns;
   4865
   4866	down_read(&ctrl->namespaces_rwsem);
   4867	list_for_each_entry(ns, &ctrl->namespaces, list)
   4868		nvme_stop_ns_queue(ns);
   4869	up_read(&ctrl->namespaces_rwsem);
   4870}
   4871EXPORT_SYMBOL_GPL(nvme_stop_queues);
   4872
   4873void nvme_start_queues(struct nvme_ctrl *ctrl)
   4874{
   4875	struct nvme_ns *ns;
   4876
   4877	down_read(&ctrl->namespaces_rwsem);
   4878	list_for_each_entry(ns, &ctrl->namespaces, list)
   4879		nvme_start_ns_queue(ns);
   4880	up_read(&ctrl->namespaces_rwsem);
   4881}
   4882EXPORT_SYMBOL_GPL(nvme_start_queues);
   4883
   4884void nvme_stop_admin_queue(struct nvme_ctrl *ctrl)
   4885{
   4886	if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
   4887		blk_mq_quiesce_queue(ctrl->admin_q);
   4888	else
   4889		blk_mq_wait_quiesce_done(ctrl->admin_q);
   4890}
   4891EXPORT_SYMBOL_GPL(nvme_stop_admin_queue);
   4892
   4893void nvme_start_admin_queue(struct nvme_ctrl *ctrl)
   4894{
   4895	if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
   4896		blk_mq_unquiesce_queue(ctrl->admin_q);
   4897}
   4898EXPORT_SYMBOL_GPL(nvme_start_admin_queue);
   4899
   4900void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
   4901{
   4902	struct nvme_ns *ns;
   4903
   4904	down_read(&ctrl->namespaces_rwsem);
   4905	list_for_each_entry(ns, &ctrl->namespaces, list)
   4906		blk_sync_queue(ns->queue);
   4907	up_read(&ctrl->namespaces_rwsem);
   4908}
   4909EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
   4910
   4911void nvme_sync_queues(struct nvme_ctrl *ctrl)
   4912{
   4913	nvme_sync_io_queues(ctrl);
   4914	if (ctrl->admin_q)
   4915		blk_sync_queue(ctrl->admin_q);
   4916}
   4917EXPORT_SYMBOL_GPL(nvme_sync_queues);
   4918
   4919struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
   4920{
   4921	if (file->f_op != &nvme_dev_fops)
   4922		return NULL;
   4923	return file->private_data;
   4924}
   4925EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);
   4926
   4927/*
   4928 * Check we didn't inadvertently grow the command structure sizes:
   4929 */
   4930static inline void _nvme_check_size(void)
   4931{
   4932	BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
   4933	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
   4934	BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
   4935	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
   4936	BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
   4937	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
   4938	BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
   4939	BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
   4940	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
   4941	BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
   4942	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
   4943	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
   4944	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
   4945	BUILD_BUG_ON(sizeof(struct nvme_id_ns_cs_indep) !=
   4946			NVME_IDENTIFY_DATA_SIZE);
   4947	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
   4948	BUILD_BUG_ON(sizeof(struct nvme_id_ns_nvm) != NVME_IDENTIFY_DATA_SIZE);
   4949	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
   4950	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);
   4951	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
   4952	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
   4953	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
   4954	BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
   4955	BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != 512);
   4956}
   4957
   4959static int __init nvme_core_init(void)
   4960{
   4961	int result = -ENOMEM;
   4962
   4963	_nvme_check_size();
   4964
   4965	nvme_wq = alloc_workqueue("nvme-wq",
   4966			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
   4967	if (!nvme_wq)
   4968		goto out;
   4969
   4970	nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
   4971			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
   4972	if (!nvme_reset_wq)
   4973		goto destroy_wq;
   4974
   4975	nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
   4976			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
   4977	if (!nvme_delete_wq)
   4978		goto destroy_reset_wq;
   4979
   4980	result = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, 0,
   4981			NVME_MINORS, "nvme");
   4982	if (result < 0)
   4983		goto destroy_delete_wq;
   4984
   4985	nvme_class = class_create(THIS_MODULE, "nvme");
   4986	if (IS_ERR(nvme_class)) {
   4987		result = PTR_ERR(nvme_class);
   4988		goto unregister_chrdev;
   4989	}
   4990	nvme_class->dev_uevent = nvme_class_uevent;
   4991
   4992	nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
   4993	if (IS_ERR(nvme_subsys_class)) {
   4994		result = PTR_ERR(nvme_subsys_class);
   4995		goto destroy_class;
   4996	}
   4997
   4998	result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS,
   4999				     "nvme-generic");
   5000	if (result < 0)
   5001		goto destroy_subsys_class;
   5002
   5003	nvme_ns_chr_class = class_create(THIS_MODULE, "nvme-generic");
   5004	if (IS_ERR(nvme_ns_chr_class)) {
   5005		result = PTR_ERR(nvme_ns_chr_class);
   5006		goto unregister_generic_ns;
   5007	}
   5008
   5009	return 0;
   5010
   5011unregister_generic_ns:
   5012	unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
   5013destroy_subsys_class:
   5014	class_destroy(nvme_subsys_class);
   5015destroy_class:
   5016	class_destroy(nvme_class);
   5017unregister_chrdev:
   5018	unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
   5019destroy_delete_wq:
   5020	destroy_workqueue(nvme_delete_wq);
   5021destroy_reset_wq:
   5022	destroy_workqueue(nvme_reset_wq);
   5023destroy_wq:
   5024	destroy_workqueue(nvme_wq);
   5025out:
   5026	return result;
   5027}
   5028
   5029static void __exit nvme_core_exit(void)
   5030{
   5031	class_destroy(nvme_ns_chr_class);
   5032	class_destroy(nvme_subsys_class);
   5033	class_destroy(nvme_class);
   5034	unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
   5035	unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
   5036	destroy_workqueue(nvme_delete_wq);
   5037	destroy_workqueue(nvme_reset_wq);
   5038	destroy_workqueue(nvme_wq);
   5039	ida_destroy(&nvme_ns_chr_minor_ida);
   5040	ida_destroy(&nvme_instance_ida);
   5041}
   5042
   5043MODULE_LICENSE("GPL");
   5044MODULE_VERSION("1.0");
   5045module_init(nvme_core_init);
   5046module_exit(nvme_core_exit);