cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

core.c (40555B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Common code for the NVMe target.
      4 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
      5 */
      6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      7#include <linux/module.h>
      8#include <linux/random.h>
      9#include <linux/rculist.h>
     10#include <linux/pci-p2pdma.h>
     11#include <linux/scatterlist.h>
     12
     13#define CREATE_TRACE_POINTS
     14#include "trace.h"
     15
     16#include "nvmet.h"
     17
     18struct workqueue_struct *buffered_io_wq;
     19struct workqueue_struct *zbd_wq;
     20static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
     21static DEFINE_IDA(cntlid_ida);
     22
     23struct workqueue_struct *nvmet_wq;
     24EXPORT_SYMBOL_GPL(nvmet_wq);
     25
     26/*
     27 * This read/write semaphore is used to synchronize access to configuration
     28 * information on a target system, changes to which will result in a
     29 * discovery log page information change for at least one host.
     30 * The full list of resources protected by this semaphore is:
     31 *
     32 *  - subsystems list
     33 *  - per-subsystem allowed hosts list
     34 *  - allow_any_host subsystem attribute
     35 *  - nvmet_genctr
     36 *  - the nvmet_transports array
     37 *
     38 * When updating any of those lists/structures, the write lock should be
     39 * obtained, while readers (populating the discovery log page or checking a
     40 * host-subsystem link) take the read lock to allow concurrent reads.
     41 */
     42DECLARE_RWSEM(nvmet_config_sem);
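
The read side of this scheme looks roughly as follows; a minimal sketch, assuming a caller similar to the discovery-log and host-permission paths elsewhere in the target code (the function name is hypothetical):

static void example_walk_port_subsystems(struct nvmet_port *port)
{
	struct nvmet_subsys_link *p;

	/* Shared lock: configfs writers hold nvmet_config_sem for write. */
	down_read(&nvmet_config_sem);
	list_for_each_entry(p, &port->subsystems, entry)
		pr_debug("exporting subsys %s\n", p->subsys->subsysnqn);
	up_read(&nvmet_config_sem);
}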
     43
     44u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
     45u64 nvmet_ana_chgcnt;
     46DECLARE_RWSEM(nvmet_ana_sem);
     47
     48inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
     49{
     50	switch (errno) {
     51	case 0:
     52		return NVME_SC_SUCCESS;
     53	case -ENOSPC:
     54		req->error_loc = offsetof(struct nvme_rw_command, length);
     55		return NVME_SC_CAP_EXCEEDED | NVME_SC_DNR;
     56	case -EREMOTEIO:
     57		req->error_loc = offsetof(struct nvme_rw_command, slba);
     58		return  NVME_SC_LBA_RANGE | NVME_SC_DNR;
     59	case -EOPNOTSUPP:
     60		req->error_loc = offsetof(struct nvme_common_command, opcode);
     61		switch (req->cmd->common.opcode) {
     62		case nvme_cmd_dsm:
     63		case nvme_cmd_write_zeroes:
     64			return NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR;
     65		default:
     66			return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
     67		}
     68		break;
     69	case -ENODATA:
     70		req->error_loc = offsetof(struct nvme_rw_command, nsid);
     71		return NVME_SC_ACCESS_DENIED;
     72	case -EIO:
     73		fallthrough;
     74	default:
     75		req->error_loc = offsetof(struct nvme_common_command, opcode);
     76		return NVME_SC_INTERNAL | NVME_SC_DNR;
     77	}
     78}
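
A usage sketch of the helper above; this assumes a hypothetical backend completion callback, loosely modeled on how the bdev and file backends finish failed I/O:

static void example_backend_end_io(struct nvmet_req *req, int err)
{
	/* Map the backend errno to an NVMe status and complete the request. */
	nvmet_req_complete(req, errno_to_nvme_status(req, err));
}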
     79
     80u16 nvmet_report_invalid_opcode(struct nvmet_req *req)
     81{
     82	pr_debug("unhandled cmd %d on qid %d\n", req->cmd->common.opcode,
     83		 req->sq->qid);
     84
     85	req->error_loc = offsetof(struct nvme_common_command, opcode);
     86	return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
     87}
     88
     89static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
     90		const char *subsysnqn);
     91
     92u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
     93		size_t len)
     94{
     95	if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
     96		req->error_loc = offsetof(struct nvme_common_command, dptr);
     97		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
     98	}
     99	return 0;
    100}
    101
    102u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
    103{
    104	if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
    105		req->error_loc = offsetof(struct nvme_common_command, dptr);
    106		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
    107	}
    108	return 0;
    109}
    110
    111u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
    112{
    113	if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) {
    114		req->error_loc = offsetof(struct nvme_common_command, dptr);
    115		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
    116	}
    117	return 0;
    118}
    119
    120static u32 nvmet_max_nsid(struct nvmet_subsys *subsys)
    121{
    122	struct nvmet_ns *cur;
    123	unsigned long idx;
    124	u32 nsid = 0;
    125
    126	xa_for_each(&subsys->namespaces, idx, cur)
    127		nsid = cur->nsid;
    128
    129	return nsid;
    130}
    131
    132static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
    133{
    134	return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16);
    135}
    136
    137static void nvmet_async_events_failall(struct nvmet_ctrl *ctrl)
    138{
    139	struct nvmet_req *req;
    140
    141	mutex_lock(&ctrl->lock);
    142	while (ctrl->nr_async_event_cmds) {
    143		req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
    144		mutex_unlock(&ctrl->lock);
    145		nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_SC_DNR);
    146		mutex_lock(&ctrl->lock);
    147	}
    148	mutex_unlock(&ctrl->lock);
    149}
    150
    151static void nvmet_async_events_process(struct nvmet_ctrl *ctrl)
    152{
    153	struct nvmet_async_event *aen;
    154	struct nvmet_req *req;
    155
    156	mutex_lock(&ctrl->lock);
    157	while (ctrl->nr_async_event_cmds && !list_empty(&ctrl->async_events)) {
    158		aen = list_first_entry(&ctrl->async_events,
    159				       struct nvmet_async_event, entry);
    160		req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
    161		nvmet_set_result(req, nvmet_async_event_result(aen));
    162
    163		list_del(&aen->entry);
    164		kfree(aen);
    165
    166		mutex_unlock(&ctrl->lock);
    167		trace_nvmet_async_event(ctrl, req->cqe->result.u32);
    168		nvmet_req_complete(req, 0);
    169		mutex_lock(&ctrl->lock);
    170	}
    171	mutex_unlock(&ctrl->lock);
    172}
    173
    174static void nvmet_async_events_free(struct nvmet_ctrl *ctrl)
    175{
    176	struct nvmet_async_event *aen, *tmp;
    177
    178	mutex_lock(&ctrl->lock);
    179	list_for_each_entry_safe(aen, tmp, &ctrl->async_events, entry) {
    180		list_del(&aen->entry);
    181		kfree(aen);
    182	}
    183	mutex_unlock(&ctrl->lock);
    184}
    185
    186static void nvmet_async_event_work(struct work_struct *work)
    187{
    188	struct nvmet_ctrl *ctrl =
    189		container_of(work, struct nvmet_ctrl, async_event_work);
    190
    191	nvmet_async_events_process(ctrl);
    192}
    193
    194void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
    195		u8 event_info, u8 log_page)
    196{
    197	struct nvmet_async_event *aen;
    198
    199	aen = kmalloc(sizeof(*aen), GFP_KERNEL);
    200	if (!aen)
    201		return;
    202
    203	aen->event_type = event_type;
    204	aen->event_info = event_info;
    205	aen->log_page = log_page;
    206
    207	mutex_lock(&ctrl->lock);
    208	list_add_tail(&aen->entry, &ctrl->async_events);
    209	mutex_unlock(&ctrl->lock);
    210
    211	queue_work(nvmet_wq, &ctrl->async_event_work);
    212}
    213
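/*
 * Record a namespace in this controller's Changed Namespace List log.  Once
 * more than NVME_MAX_CHANGED_NAMESPACES distinct namespaces have changed,
 * the list collapses to the single entry 0xffffffff, telling the host to
 * rescan everything, and nr_changed_ns is parked at U32_MAX so later calls
 * return early until the host reads the log and the list is reset.
 */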
    214static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
    215{
    216	u32 i;
    217
    218	mutex_lock(&ctrl->lock);
    219	if (ctrl->nr_changed_ns > NVME_MAX_CHANGED_NAMESPACES)
    220		goto out_unlock;
    221
    222	for (i = 0; i < ctrl->nr_changed_ns; i++) {
    223		if (ctrl->changed_ns_list[i] == nsid)
    224			goto out_unlock;
    225	}
    226
    227	if (ctrl->nr_changed_ns == NVME_MAX_CHANGED_NAMESPACES) {
    228		ctrl->changed_ns_list[0] = cpu_to_le32(0xffffffff);
    229		ctrl->nr_changed_ns = U32_MAX;
    230		goto out_unlock;
    231	}
    232
    233	ctrl->changed_ns_list[ctrl->nr_changed_ns++] = nsid;
    234out_unlock:
    235	mutex_unlock(&ctrl->lock);
    236}
    237
    238void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
    239{
    240	struct nvmet_ctrl *ctrl;
    241
    242	lockdep_assert_held(&subsys->lock);
    243
    244	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
    245		nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid));
    246		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR))
    247			continue;
    248		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
    249				NVME_AER_NOTICE_NS_CHANGED,
    250				NVME_LOG_CHANGED_NS);
    251	}
    252}
    253
    254void nvmet_send_ana_event(struct nvmet_subsys *subsys,
    255		struct nvmet_port *port)
    256{
    257	struct nvmet_ctrl *ctrl;
    258
    259	mutex_lock(&subsys->lock);
    260	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
    261		if (port && ctrl->port != port)
    262			continue;
    263		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE))
    264			continue;
    265		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
    266				NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
    267	}
    268	mutex_unlock(&subsys->lock);
    269}
    270
    271void nvmet_port_send_ana_event(struct nvmet_port *port)
    272{
    273	struct nvmet_subsys_link *p;
    274
    275	down_read(&nvmet_config_sem);
    276	list_for_each_entry(p, &port->subsystems, entry)
    277		nvmet_send_ana_event(p->subsys, port);
    278	up_read(&nvmet_config_sem);
    279}
    280
    281int nvmet_register_transport(const struct nvmet_fabrics_ops *ops)
    282{
    283	int ret = 0;
    284
    285	down_write(&nvmet_config_sem);
    286	if (nvmet_transports[ops->type])
    287		ret = -EINVAL;
    288	else
    289		nvmet_transports[ops->type] = ops;
    290	up_write(&nvmet_config_sem);
    291
    292	return ret;
    293}
    294EXPORT_SYMBOL_GPL(nvmet_register_transport);
    295
    296void nvmet_unregister_transport(const struct nvmet_fabrics_ops *ops)
    297{
    298	down_write(&nvmet_config_sem);
    299	nvmet_transports[ops->type] = NULL;
    300	up_write(&nvmet_config_sem);
    301}
    302EXPORT_SYMBOL_GPL(nvmet_unregister_transport);
    303
    304void nvmet_port_del_ctrls(struct nvmet_port *port, struct nvmet_subsys *subsys)
    305{
    306	struct nvmet_ctrl *ctrl;
    307
    308	mutex_lock(&subsys->lock);
    309	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
    310		if (ctrl->port == port)
    311			ctrl->ops->delete_ctrl(ctrl);
    312	}
    313	mutex_unlock(&subsys->lock);
    314}
    315
    316int nvmet_enable_port(struct nvmet_port *port)
    317{
    318	const struct nvmet_fabrics_ops *ops;
    319	int ret;
    320
    321	lockdep_assert_held(&nvmet_config_sem);
    322
    323	ops = nvmet_transports[port->disc_addr.trtype];
    324	if (!ops) {
    325		up_write(&nvmet_config_sem);
    326		request_module("nvmet-transport-%d", port->disc_addr.trtype);
    327		down_write(&nvmet_config_sem);
    328		ops = nvmet_transports[port->disc_addr.trtype];
    329		if (!ops) {
    330			pr_err("transport type %d not supported\n",
    331				port->disc_addr.trtype);
    332			return -EINVAL;
    333		}
    334	}
    335
    336	if (!try_module_get(ops->owner))
    337		return -EINVAL;
    338
    339	/*
     340	 * If the user requested PI support and the transport isn't PI capable,
    341	 * don't enable the port.
    342	 */
    343	if (port->pi_enable && !(ops->flags & NVMF_METADATA_SUPPORTED)) {
    344		pr_err("T10-PI is not supported by transport type %d\n",
    345		       port->disc_addr.trtype);
    346		ret = -EINVAL;
    347		goto out_put;
    348	}
    349
    350	ret = ops->add_port(port);
    351	if (ret)
    352		goto out_put;
    353
    354	/* If the transport didn't set inline_data_size, then disable it. */
    355	if (port->inline_data_size < 0)
    356		port->inline_data_size = 0;
    357
    358	port->enabled = true;
    359	port->tr_ops = ops;
    360	return 0;
    361
    362out_put:
    363	module_put(ops->owner);
    364	return ret;
    365}
    366
    367void nvmet_disable_port(struct nvmet_port *port)
    368{
    369	const struct nvmet_fabrics_ops *ops;
    370
    371	lockdep_assert_held(&nvmet_config_sem);
    372
    373	port->enabled = false;
    374	port->tr_ops = NULL;
    375
    376	ops = nvmet_transports[port->disc_addr.trtype];
    377	ops->remove_port(port);
    378	module_put(ops->owner);
    379}
    380
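/*
 * Traffic-based keep-alive: ctrl->reset_tbkas is set whenever a command is
 * processed (see nvmet_req_init()) and when a queue is torn down (see
 * nvmet_sq_destroy()).  If any traffic was seen since the last expiry the
 * timer is simply re-armed; only a genuinely idle interval of KATO seconds
 * escalates to a controller fatal error.
 */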
    381static void nvmet_keep_alive_timer(struct work_struct *work)
    382{
    383	struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work),
    384			struct nvmet_ctrl, ka_work);
    385	bool reset_tbkas = ctrl->reset_tbkas;
    386
    387	ctrl->reset_tbkas = false;
    388	if (reset_tbkas) {
    389		pr_debug("ctrl %d reschedule traffic based keep-alive timer\n",
    390			ctrl->cntlid);
    391		queue_delayed_work(nvmet_wq, &ctrl->ka_work, ctrl->kato * HZ);
    392		return;
    393	}
    394
    395	pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n",
    396		ctrl->cntlid, ctrl->kato);
    397
    398	nvmet_ctrl_fatal_error(ctrl);
    399}
    400
    401void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
    402{
    403	if (unlikely(ctrl->kato == 0))
    404		return;
    405
    406	pr_debug("ctrl %d start keep-alive timer for %d secs\n",
    407		ctrl->cntlid, ctrl->kato);
    408
    409	queue_delayed_work(nvmet_wq, &ctrl->ka_work, ctrl->kato * HZ);
    410}
    411
    412void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
    413{
    414	if (unlikely(ctrl->kato == 0))
    415		return;
    416
    417	pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid);
    418
    419	cancel_delayed_work_sync(&ctrl->ka_work);
    420}
    421
    422u16 nvmet_req_find_ns(struct nvmet_req *req)
    423{
    424	u32 nsid = le32_to_cpu(req->cmd->common.nsid);
    425
    426	req->ns = xa_load(&nvmet_req_subsys(req)->namespaces, nsid);
    427	if (unlikely(!req->ns)) {
    428		req->error_loc = offsetof(struct nvme_common_command, nsid);
    429		return NVME_SC_INVALID_NS | NVME_SC_DNR;
    430	}
    431
    432	percpu_ref_get(&req->ns->ref);
    433	return NVME_SC_SUCCESS;
    434}
    435
    436static void nvmet_destroy_namespace(struct percpu_ref *ref)
    437{
    438	struct nvmet_ns *ns = container_of(ref, struct nvmet_ns, ref);
    439
    440	complete(&ns->disable_done);
    441}
    442
    443void nvmet_put_namespace(struct nvmet_ns *ns)
    444{
    445	percpu_ref_put(&ns->ref);
    446}
    447
    448static void nvmet_ns_dev_disable(struct nvmet_ns *ns)
    449{
    450	nvmet_bdev_ns_disable(ns);
    451	nvmet_file_ns_disable(ns);
    452}
    453
    454static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns)
    455{
    456	int ret;
    457	struct pci_dev *p2p_dev;
    458
    459	if (!ns->use_p2pmem)
    460		return 0;
    461
    462	if (!ns->bdev) {
    463		pr_err("peer-to-peer DMA is not supported by non-block device namespaces\n");
    464		return -EINVAL;
    465	}
    466
    467	if (!blk_queue_pci_p2pdma(ns->bdev->bd_disk->queue)) {
    468		pr_err("peer-to-peer DMA is not supported by the driver of %s\n",
    469		       ns->device_path);
    470		return -EINVAL;
    471	}
    472
    473	if (ns->p2p_dev) {
    474		ret = pci_p2pdma_distance(ns->p2p_dev, nvmet_ns_dev(ns), true);
    475		if (ret < 0)
    476			return -EINVAL;
    477	} else {
    478		/*
    479		 * Right now we just check that there is p2pmem available so
    480		 * we can report an error to the user right away if there
     481		 * is not. We'll find the actual device to use once we set up
     482		 * the controller, when the port's device is available.
    483		 */
    484
    485		p2p_dev = pci_p2pmem_find(nvmet_ns_dev(ns));
    486		if (!p2p_dev) {
    487			pr_err("no peer-to-peer memory is available for %s\n",
    488			       ns->device_path);
    489			return -EINVAL;
    490		}
    491
    492		pci_dev_put(p2p_dev);
    493	}
    494
    495	return 0;
    496}
    497
    498/*
    499 * Note: ctrl->subsys->lock should be held when calling this function
    500 */
    501static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl,
    502				    struct nvmet_ns *ns)
    503{
    504	struct device *clients[2];
    505	struct pci_dev *p2p_dev;
    506	int ret;
    507
    508	if (!ctrl->p2p_client || !ns->use_p2pmem)
    509		return;
    510
    511	if (ns->p2p_dev) {
    512		ret = pci_p2pdma_distance(ns->p2p_dev, ctrl->p2p_client, true);
    513		if (ret < 0)
    514			return;
    515
    516		p2p_dev = pci_dev_get(ns->p2p_dev);
    517	} else {
    518		clients[0] = ctrl->p2p_client;
    519		clients[1] = nvmet_ns_dev(ns);
    520
    521		p2p_dev = pci_p2pmem_find_many(clients, ARRAY_SIZE(clients));
    522		if (!p2p_dev) {
    523			pr_err("no peer-to-peer memory is available that's supported by %s and %s\n",
    524			       dev_name(ctrl->p2p_client), ns->device_path);
    525			return;
    526		}
    527	}
    528
    529	ret = radix_tree_insert(&ctrl->p2p_ns_map, ns->nsid, p2p_dev);
    530	if (ret < 0)
    531		pci_dev_put(p2p_dev);
    532
    533	pr_info("using p2pmem on %s for nsid %d\n", pci_name(p2p_dev),
    534		ns->nsid);
    535}
    536
    537bool nvmet_ns_revalidate(struct nvmet_ns *ns)
    538{
    539	loff_t oldsize = ns->size;
    540
    541	if (ns->bdev)
    542		nvmet_bdev_ns_revalidate(ns);
    543	else
    544		nvmet_file_ns_revalidate(ns);
    545
    546	return oldsize != ns->size;
    547}
    548
    549int nvmet_ns_enable(struct nvmet_ns *ns)
    550{
    551	struct nvmet_subsys *subsys = ns->subsys;
    552	struct nvmet_ctrl *ctrl;
    553	int ret;
    554
    555	mutex_lock(&subsys->lock);
    556	ret = 0;
    557
    558	if (nvmet_is_passthru_subsys(subsys)) {
     559		pr_info("cannot enable both passthru and regular namespaces for a single subsystem\n");
    560		goto out_unlock;
    561	}
    562
    563	if (ns->enabled)
    564		goto out_unlock;
    565
    566	ret = -EMFILE;
    567	if (subsys->nr_namespaces == NVMET_MAX_NAMESPACES)
    568		goto out_unlock;
    569
    570	ret = nvmet_bdev_ns_enable(ns);
    571	if (ret == -ENOTBLK)
    572		ret = nvmet_file_ns_enable(ns);
    573	if (ret)
    574		goto out_unlock;
    575
    576	ret = nvmet_p2pmem_ns_enable(ns);
    577	if (ret)
    578		goto out_dev_disable;
    579
    580	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
    581		nvmet_p2pmem_ns_add_p2p(ctrl, ns);
    582
    583	ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace,
    584				0, GFP_KERNEL);
    585	if (ret)
    586		goto out_dev_put;
    587
    588	if (ns->nsid > subsys->max_nsid)
    589		subsys->max_nsid = ns->nsid;
    590
    591	ret = xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL);
    592	if (ret)
    593		goto out_restore_subsys_maxnsid;
    594
    595	subsys->nr_namespaces++;
    596
    597	nvmet_ns_changed(subsys, ns->nsid);
    598	ns->enabled = true;
    599	ret = 0;
    600out_unlock:
    601	mutex_unlock(&subsys->lock);
    602	return ret;
    603
    604out_restore_subsys_maxnsid:
    605	subsys->max_nsid = nvmet_max_nsid(subsys);
    606	percpu_ref_exit(&ns->ref);
    607out_dev_put:
    608	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
    609		pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
    610out_dev_disable:
    611	nvmet_ns_dev_disable(ns);
    612	goto out_unlock;
    613}
    614
    615void nvmet_ns_disable(struct nvmet_ns *ns)
    616{
    617	struct nvmet_subsys *subsys = ns->subsys;
    618	struct nvmet_ctrl *ctrl;
    619
    620	mutex_lock(&subsys->lock);
    621	if (!ns->enabled)
    622		goto out_unlock;
    623
    624	ns->enabled = false;
    625	xa_erase(&ns->subsys->namespaces, ns->nsid);
    626	if (ns->nsid == subsys->max_nsid)
    627		subsys->max_nsid = nvmet_max_nsid(subsys);
    628
    629	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
    630		pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
    631
    632	mutex_unlock(&subsys->lock);
    633
    634	/*
     635	 * Now that we have removed the namespace from the lookup list, we
     636	 * can kill the per_cpu ref and wait for any remaining references
     637	 * to be dropped, as well as an RCU grace period for anyone only
     638	 * using the namespace under rcu_read_lock().  Note that we can't
     639	 * use call_rcu here as we need to ensure the namespaces have
    640	 * been fully destroyed before unloading the module.
    641	 */
    642	percpu_ref_kill(&ns->ref);
    643	synchronize_rcu();
    644	wait_for_completion(&ns->disable_done);
    645	percpu_ref_exit(&ns->ref);
    646
    647	mutex_lock(&subsys->lock);
    648
    649	subsys->nr_namespaces--;
    650	nvmet_ns_changed(subsys, ns->nsid);
    651	nvmet_ns_dev_disable(ns);
    652out_unlock:
    653	mutex_unlock(&subsys->lock);
    654}
    655
    656void nvmet_ns_free(struct nvmet_ns *ns)
    657{
    658	nvmet_ns_disable(ns);
    659
    660	down_write(&nvmet_ana_sem);
    661	nvmet_ana_group_enabled[ns->anagrpid]--;
    662	up_write(&nvmet_ana_sem);
    663
    664	kfree(ns->device_path);
    665	kfree(ns);
    666}
    667
    668struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
    669{
    670	struct nvmet_ns *ns;
    671
    672	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
    673	if (!ns)
    674		return NULL;
    675
    676	init_completion(&ns->disable_done);
    677
    678	ns->nsid = nsid;
    679	ns->subsys = subsys;
    680
    681	down_write(&nvmet_ana_sem);
    682	ns->anagrpid = NVMET_DEFAULT_ANA_GRPID;
    683	nvmet_ana_group_enabled[ns->anagrpid]++;
    684	up_write(&nvmet_ana_sem);
    685
    686	uuid_gen(&ns->uuid);
    687	ns->buffered_io = false;
    688	ns->csi = NVME_CSI_NVM;
    689
    690	return ns;
    691}
    692
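/*
 * Advance the submission queue head that is reported back to the host in the
 * completion entry.  The cmpxchg() loop makes the wrap-around increment safe
 * against concurrent completions without taking a lock.
 */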
    693static void nvmet_update_sq_head(struct nvmet_req *req)
    694{
    695	if (req->sq->size) {
    696		u32 old_sqhd, new_sqhd;
    697
    698		do {
    699			old_sqhd = req->sq->sqhd;
    700			new_sqhd = (old_sqhd + 1) % req->sq->size;
    701		} while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) !=
    702					old_sqhd);
    703	}
    704	req->cqe->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF);
    705}
    706
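/*
 * Record an error status in the CQE and, if we have a controller, in the
 * error log.  The status is shifted left by one because bit 0 of the 16-bit
 * CQE status field is the phase tag; bit 14 of that field is the "More" bit,
 * set below to indicate that the Error Information log holds further detail.
 */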
    707static void nvmet_set_error(struct nvmet_req *req, u16 status)
    708{
    709	struct nvmet_ctrl *ctrl = req->sq->ctrl;
    710	struct nvme_error_slot *new_error_slot;
    711	unsigned long flags;
    712
    713	req->cqe->status = cpu_to_le16(status << 1);
    714
    715	if (!ctrl || req->error_loc == NVMET_NO_ERROR_LOC)
    716		return;
    717
    718	spin_lock_irqsave(&ctrl->error_lock, flags);
    719	ctrl->err_counter++;
    720	new_error_slot =
    721		&ctrl->slots[ctrl->err_counter % NVMET_ERROR_LOG_SLOTS];
    722
    723	new_error_slot->error_count = cpu_to_le64(ctrl->err_counter);
    724	new_error_slot->sqid = cpu_to_le16(req->sq->qid);
    725	new_error_slot->cmdid = cpu_to_le16(req->cmd->common.command_id);
    726	new_error_slot->status_field = cpu_to_le16(status << 1);
    727	new_error_slot->param_error_location = cpu_to_le16(req->error_loc);
    728	new_error_slot->lba = cpu_to_le64(req->error_slba);
    729	new_error_slot->nsid = req->cmd->common.nsid;
    730	spin_unlock_irqrestore(&ctrl->error_lock, flags);
    731
    732	/* set the more bit for this request */
    733	req->cqe->status |= cpu_to_le16(1 << 14);
    734}
    735
    736static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
    737{
    738	if (!req->sq->sqhd_disabled)
    739		nvmet_update_sq_head(req);
    740	req->cqe->sq_id = cpu_to_le16(req->sq->qid);
    741	req->cqe->command_id = req->cmd->common.command_id;
    742
    743	if (unlikely(status))
    744		nvmet_set_error(req, status);
    745
    746	trace_nvmet_req_complete(req);
    747
    748	if (req->ns)
    749		nvmet_put_namespace(req->ns);
    750	req->ops->queue_response(req);
    751}
    752
    753void nvmet_req_complete(struct nvmet_req *req, u16 status)
    754{
    755	__nvmet_req_complete(req, status);
    756	percpu_ref_put(&req->sq->ref);
    757}
    758EXPORT_SYMBOL_GPL(nvmet_req_complete);
    759
    760void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
    761		u16 qid, u16 size)
    762{
    763	cq->qid = qid;
    764	cq->size = size;
    765}
    766
    767void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
    768		u16 qid, u16 size)
    769{
    770	sq->sqhd = 0;
    771	sq->qid = qid;
    772	sq->size = size;
    773
    774	ctrl->sqs[qid] = sq;
    775}
    776
    777static void nvmet_confirm_sq(struct percpu_ref *ref)
    778{
    779	struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);
    780
    781	complete(&sq->confirm_done);
    782}
    783
    784void nvmet_sq_destroy(struct nvmet_sq *sq)
    785{
    786	struct nvmet_ctrl *ctrl = sq->ctrl;
    787
    788	/*
    789	 * If this is the admin queue, complete all AERs so that our
    790	 * queue doesn't have outstanding requests on it.
    791	 */
    792	if (ctrl && ctrl->sqs && ctrl->sqs[0] == sq)
    793		nvmet_async_events_failall(ctrl);
    794	percpu_ref_kill_and_confirm(&sq->ref, nvmet_confirm_sq);
    795	wait_for_completion(&sq->confirm_done);
    796	wait_for_completion(&sq->free_done);
    797	percpu_ref_exit(&sq->ref);
    798
    799	if (ctrl) {
    800		/*
    801		 * The teardown flow may take some time, and the host may not
     802		 * send us a keep-alive during this period; hence, reset the
     803		 * traffic-based keep-alive timer so we don't trigger a
    804		 * controller teardown as a result of a keep-alive expiration.
    805		 */
    806		ctrl->reset_tbkas = true;
    807		sq->ctrl->sqs[sq->qid] = NULL;
    808		nvmet_ctrl_put(ctrl);
    809		sq->ctrl = NULL; /* allows reusing the queue later */
    810	}
    811}
    812EXPORT_SYMBOL_GPL(nvmet_sq_destroy);
    813
    814static void nvmet_sq_free(struct percpu_ref *ref)
    815{
    816	struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref);
    817
    818	complete(&sq->free_done);
    819}
    820
    821int nvmet_sq_init(struct nvmet_sq *sq)
    822{
    823	int ret;
    824
    825	ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL);
    826	if (ret) {
    827		pr_err("percpu_ref init failed!\n");
    828		return ret;
    829	}
    830	init_completion(&sq->free_done);
    831	init_completion(&sq->confirm_done);
    832
    833	return 0;
    834}
    835EXPORT_SYMBOL_GPL(nvmet_sq_init);
    836
    837static inline u16 nvmet_check_ana_state(struct nvmet_port *port,
    838		struct nvmet_ns *ns)
    839{
    840	enum nvme_ana_state state = port->ana_state[ns->anagrpid];
    841
    842	if (unlikely(state == NVME_ANA_INACCESSIBLE))
    843		return NVME_SC_ANA_INACCESSIBLE;
    844	if (unlikely(state == NVME_ANA_PERSISTENT_LOSS))
    845		return NVME_SC_ANA_PERSISTENT_LOSS;
    846	if (unlikely(state == NVME_ANA_CHANGE))
    847		return NVME_SC_ANA_TRANSITION;
    848	return 0;
    849}
    850
    851static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req)
    852{
    853	if (unlikely(req->ns->readonly)) {
    854		switch (req->cmd->common.opcode) {
    855		case nvme_cmd_read:
    856		case nvme_cmd_flush:
    857			break;
    858		default:
    859			return NVME_SC_NS_WRITE_PROTECTED;
    860		}
    861	}
    862
    863	return 0;
    864}
    865
    866static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
    867{
    868	u16 ret;
    869
    870	ret = nvmet_check_ctrl_status(req);
    871	if (unlikely(ret))
    872		return ret;
    873
    874	if (nvmet_is_passthru_req(req))
    875		return nvmet_parse_passthru_io_cmd(req);
    876
    877	ret = nvmet_req_find_ns(req);
    878	if (unlikely(ret))
    879		return ret;
    880
    881	ret = nvmet_check_ana_state(req->port, req->ns);
    882	if (unlikely(ret)) {
    883		req->error_loc = offsetof(struct nvme_common_command, nsid);
    884		return ret;
    885	}
    886	ret = nvmet_io_cmd_check_access(req);
    887	if (unlikely(ret)) {
    888		req->error_loc = offsetof(struct nvme_common_command, nsid);
    889		return ret;
    890	}
    891
    892	switch (req->ns->csi) {
    893	case NVME_CSI_NVM:
    894		if (req->ns->file)
    895			return nvmet_file_parse_io_cmd(req);
    896		return nvmet_bdev_parse_io_cmd(req);
    897	case NVME_CSI_ZNS:
    898		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
    899			return nvmet_bdev_zns_parse_io_cmd(req);
    900		return NVME_SC_INVALID_IO_CMD_SET;
    901	default:
    902		return NVME_SC_INVALID_IO_CMD_SET;
    903	}
    904}
    905
    906bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
    907		struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops)
    908{
    909	u8 flags = req->cmd->common.flags;
    910	u16 status;
    911
    912	req->cq = cq;
    913	req->sq = sq;
    914	req->ops = ops;
    915	req->sg = NULL;
    916	req->metadata_sg = NULL;
    917	req->sg_cnt = 0;
    918	req->metadata_sg_cnt = 0;
    919	req->transfer_len = 0;
    920	req->metadata_len = 0;
    921	req->cqe->status = 0;
    922	req->cqe->sq_head = 0;
    923	req->ns = NULL;
    924	req->error_loc = NVMET_NO_ERROR_LOC;
    925	req->error_slba = 0;
    926
    927	/* no support for fused commands yet */
    928	if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
    929		req->error_loc = offsetof(struct nvme_common_command, flags);
    930		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
    931		goto fail;
    932	}
    933
    934	/*
    935	 * For fabrics, PSDT field shall describe metadata pointer (MPTR) that
    936	 * contains an address of a single contiguous physical buffer that is
    937	 * byte aligned.
    938	 */
    939	if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) {
    940		req->error_loc = offsetof(struct nvme_common_command, flags);
    941		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
    942		goto fail;
    943	}
    944
    945	if (unlikely(!req->sq->ctrl))
    946		/* will return an error for any non-connect command: */
    947		status = nvmet_parse_connect_cmd(req);
    948	else if (likely(req->sq->qid != 0))
    949		status = nvmet_parse_io_cmd(req);
    950	else
    951		status = nvmet_parse_admin_cmd(req);
    952
    953	if (status)
    954		goto fail;
    955
    956	trace_nvmet_req_init(req, req->cmd);
    957
    958	if (unlikely(!percpu_ref_tryget_live(&sq->ref))) {
    959		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
    960		goto fail;
    961	}
    962
    963	if (sq->ctrl)
    964		sq->ctrl->reset_tbkas = true;
    965
    966	return true;
    967
    968fail:
    969	__nvmet_req_complete(req, status);
    970	return false;
    971}
    972EXPORT_SYMBOL_GPL(nvmet_req_init);
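
For orientation, this is roughly how a fabrics transport drives the request lifecycle around nvmet_req_init(); a minimal sketch, loosely modeled on the loop/RDMA transports, with a hypothetical helper name and data_len parameter:

static void example_handle_command(struct nvmet_req *req, struct nvmet_cq *cq,
		struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops,
		size_t data_len)
{
	/* Parses the command and installs req->execute; completes on error. */
	if (!nvmet_req_init(req, cq, sq, ops))
		return;

	/* Tell the core how much data the command carries, then get SGLs. */
	req->transfer_len = data_len;
	if (nvmet_req_alloc_sgls(req) < 0) {
		nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_SC_DNR);
		return;
	}

	/* ... transfer the host data into req->sg as needed ... */

	req->execute(req);
}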
    973
    974void nvmet_req_uninit(struct nvmet_req *req)
    975{
    976	percpu_ref_put(&req->sq->ref);
    977	if (req->ns)
    978		nvmet_put_namespace(req->ns);
    979}
    980EXPORT_SYMBOL_GPL(nvmet_req_uninit);
    981
    982bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len)
    983{
    984	if (unlikely(len != req->transfer_len)) {
    985		req->error_loc = offsetof(struct nvme_common_command, dptr);
    986		nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
    987		return false;
    988	}
    989
    990	return true;
    991}
    992EXPORT_SYMBOL_GPL(nvmet_check_transfer_len);
    993
    994bool nvmet_check_data_len_lte(struct nvmet_req *req, size_t data_len)
    995{
    996	if (unlikely(data_len > req->transfer_len)) {
    997		req->error_loc = offsetof(struct nvme_common_command, dptr);
    998		nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
    999		return false;
   1000	}
   1001
   1002	return true;
   1003}
   1004
   1005static unsigned int nvmet_data_transfer_len(struct nvmet_req *req)
   1006{
   1007	return req->transfer_len - req->metadata_len;
   1008}
   1009
   1010static int nvmet_req_alloc_p2pmem_sgls(struct pci_dev *p2p_dev,
   1011		struct nvmet_req *req)
   1012{
   1013	req->sg = pci_p2pmem_alloc_sgl(p2p_dev, &req->sg_cnt,
   1014			nvmet_data_transfer_len(req));
   1015	if (!req->sg)
   1016		goto out_err;
   1017
   1018	if (req->metadata_len) {
   1019		req->metadata_sg = pci_p2pmem_alloc_sgl(p2p_dev,
   1020				&req->metadata_sg_cnt, req->metadata_len);
   1021		if (!req->metadata_sg)
   1022			goto out_free_sg;
   1023	}
   1024
   1025	req->p2p_dev = p2p_dev;
   1026
   1027	return 0;
   1028out_free_sg:
   1029	pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
   1030out_err:
   1031	return -ENOMEM;
   1032}
   1033
   1034static struct pci_dev *nvmet_req_find_p2p_dev(struct nvmet_req *req)
   1035{
   1036	if (!IS_ENABLED(CONFIG_PCI_P2PDMA) ||
   1037	    !req->sq->ctrl || !req->sq->qid || !req->ns)
   1038		return NULL;
   1039	return radix_tree_lookup(&req->sq->ctrl->p2p_ns_map, req->ns->nsid);
   1040}
   1041
   1042int nvmet_req_alloc_sgls(struct nvmet_req *req)
   1043{
   1044	struct pci_dev *p2p_dev = nvmet_req_find_p2p_dev(req);
   1045
   1046	if (p2p_dev && !nvmet_req_alloc_p2pmem_sgls(p2p_dev, req))
   1047		return 0;
   1048
   1049	req->sg = sgl_alloc(nvmet_data_transfer_len(req), GFP_KERNEL,
   1050			    &req->sg_cnt);
   1051	if (unlikely(!req->sg))
   1052		goto out;
   1053
   1054	if (req->metadata_len) {
   1055		req->metadata_sg = sgl_alloc(req->metadata_len, GFP_KERNEL,
   1056					     &req->metadata_sg_cnt);
   1057		if (unlikely(!req->metadata_sg))
   1058			goto out_free;
   1059	}
   1060
   1061	return 0;
   1062out_free:
   1063	sgl_free(req->sg);
   1064out:
   1065	return -ENOMEM;
   1066}
   1067EXPORT_SYMBOL_GPL(nvmet_req_alloc_sgls);
   1068
   1069void nvmet_req_free_sgls(struct nvmet_req *req)
   1070{
   1071	if (req->p2p_dev) {
   1072		pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
   1073		if (req->metadata_sg)
   1074			pci_p2pmem_free_sgl(req->p2p_dev, req->metadata_sg);
   1075		req->p2p_dev = NULL;
   1076	} else {
   1077		sgl_free(req->sg);
   1078		if (req->metadata_sg)
   1079			sgl_free(req->metadata_sg);
   1080	}
   1081
   1082	req->sg = NULL;
   1083	req->metadata_sg = NULL;
   1084	req->sg_cnt = 0;
   1085	req->metadata_sg_cnt = 0;
   1086}
   1087EXPORT_SYMBOL_GPL(nvmet_req_free_sgls);
   1088
   1089static inline bool nvmet_cc_en(u32 cc)
   1090{
   1091	return (cc >> NVME_CC_EN_SHIFT) & 0x1;
   1092}
   1093
   1094static inline u8 nvmet_cc_css(u32 cc)
   1095{
   1096	return (cc >> NVME_CC_CSS_SHIFT) & 0x7;
   1097}
   1098
   1099static inline u8 nvmet_cc_mps(u32 cc)
   1100{
   1101	return (cc >> NVME_CC_MPS_SHIFT) & 0xf;
   1102}
   1103
   1104static inline u8 nvmet_cc_ams(u32 cc)
   1105{
   1106	return (cc >> NVME_CC_AMS_SHIFT) & 0x7;
   1107}
   1108
   1109static inline u8 nvmet_cc_shn(u32 cc)
   1110{
   1111	return (cc >> NVME_CC_SHN_SHIFT) & 0x3;
   1112}
   1113
   1114static inline u8 nvmet_cc_iosqes(u32 cc)
   1115{
   1116	return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf;
   1117}
   1118
   1119static inline u8 nvmet_cc_iocqes(u32 cc)
   1120{
   1121	return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf;
   1122}
   1123
   1124static inline bool nvmet_css_supported(u8 cc_css)
   1125{
   1126	switch (cc_css << NVME_CC_CSS_SHIFT) {
   1127	case NVME_CC_CSS_NVM:
   1128	case NVME_CC_CSS_CSI:
   1129		return true;
   1130	default:
   1131		return false;
   1132	}
   1133}
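/*
 * Worked example (per the NVMe CC register layout): a host that enables the
 * controller with the NVM command set, MPS 0 (4KiB pages), round-robin
 * arbitration, 64-byte SQ entries and 16-byte CQ entries writes
 * CC = 0x00460001, for which the helpers above return en=1, css=0, mps=0,
 * ams=0, shn=0, iosqes=6 and iocqes=4.
 */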
   1134
   1135static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
   1136{
   1137	lockdep_assert_held(&ctrl->lock);
   1138
   1139	/*
    1140	 * Only I/O controllers should verify iosqes and iocqes.
    1141	 * Strictly speaking, the spec says a discovery controller
    1142	 * should verify that iosqes and iocqes are zeroed; however, that
    1143	 * would break backwards compatibility, so don't enforce it.
   1144	 */
   1145	if (!nvmet_is_disc_subsys(ctrl->subsys) &&
   1146	    (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
   1147	     nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES)) {
   1148		ctrl->csts = NVME_CSTS_CFS;
   1149		return;
   1150	}
   1151
   1152	if (nvmet_cc_mps(ctrl->cc) != 0 ||
   1153	    nvmet_cc_ams(ctrl->cc) != 0 ||
   1154	    !nvmet_css_supported(nvmet_cc_css(ctrl->cc))) {
   1155		ctrl->csts = NVME_CSTS_CFS;
   1156		return;
   1157	}
   1158
   1159	ctrl->csts = NVME_CSTS_RDY;
   1160
   1161	/*
   1162	 * Controllers that are not yet enabled should not really enforce the
    1163	 * keep-alive timeout, but we still want to track a timeout and clean up
    1164	 * in case a host died before it enabled the controller.  Hence, simply
    1165	 * reset the keep-alive timer when the controller is enabled.
   1166	 */
   1167	if (ctrl->kato)
   1168		mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
   1169}
   1170
   1171static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
   1172{
   1173	lockdep_assert_held(&ctrl->lock);
   1174
   1175	/* XXX: tear down queues? */
   1176	ctrl->csts &= ~NVME_CSTS_RDY;
   1177	ctrl->cc = 0;
   1178}
   1179
   1180void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new)
   1181{
   1182	u32 old;
   1183
   1184	mutex_lock(&ctrl->lock);
   1185	old = ctrl->cc;
   1186	ctrl->cc = new;
   1187
   1188	if (nvmet_cc_en(new) && !nvmet_cc_en(old))
   1189		nvmet_start_ctrl(ctrl);
   1190	if (!nvmet_cc_en(new) && nvmet_cc_en(old))
   1191		nvmet_clear_ctrl(ctrl);
   1192	if (nvmet_cc_shn(new) && !nvmet_cc_shn(old)) {
   1193		nvmet_clear_ctrl(ctrl);
   1194		ctrl->csts |= NVME_CSTS_SHST_CMPLT;
   1195	}
   1196	if (!nvmet_cc_shn(new) && nvmet_cc_shn(old))
   1197		ctrl->csts &= ~NVME_CSTS_SHST_CMPLT;
   1198	mutex_unlock(&ctrl->lock);
   1199}
   1200
   1201static void nvmet_init_cap(struct nvmet_ctrl *ctrl)
   1202{
   1203	/* command sets supported: NVMe command set: */
   1204	ctrl->cap = (1ULL << 37);
   1205	/* Controller supports one or more I/O Command Sets */
   1206	ctrl->cap |= (1ULL << 43);
   1207	/* CC.EN timeout in 500msec units: */
   1208	ctrl->cap |= (15ULL << 24);
   1209	/* maximum queue entries supported: */
   1210	if (ctrl->ops->get_max_queue_size)
   1211		ctrl->cap |= ctrl->ops->get_max_queue_size(ctrl) - 1;
   1212	else
   1213		ctrl->cap |= NVMET_QUEUE_SIZE - 1;
   1214
   1215	if (nvmet_is_passthru_subsys(ctrl->subsys))
   1216		nvmet_passthrough_override_cap(ctrl);
   1217}
   1218
   1219struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn,
   1220				       const char *hostnqn, u16 cntlid,
   1221				       struct nvmet_req *req)
   1222{
   1223	struct nvmet_ctrl *ctrl = NULL;
   1224	struct nvmet_subsys *subsys;
   1225
   1226	subsys = nvmet_find_get_subsys(req->port, subsysnqn);
   1227	if (!subsys) {
   1228		pr_warn("connect request for invalid subsystem %s!\n",
   1229			subsysnqn);
   1230		req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
   1231		goto out;
   1232	}
   1233
   1234	mutex_lock(&subsys->lock);
   1235	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
   1236		if (ctrl->cntlid == cntlid) {
   1237			if (strncmp(hostnqn, ctrl->hostnqn, NVMF_NQN_SIZE)) {
   1238				pr_warn("hostnqn mismatch.\n");
   1239				continue;
   1240			}
   1241			if (!kref_get_unless_zero(&ctrl->ref))
   1242				continue;
   1243
   1244			/* ctrl found */
   1245			goto found;
   1246		}
   1247	}
   1248
   1249	ctrl = NULL; /* ctrl not found */
   1250	pr_warn("could not find controller %d for subsys %s / host %s\n",
   1251		cntlid, subsysnqn, hostnqn);
   1252	req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid);
   1253
   1254found:
   1255	mutex_unlock(&subsys->lock);
   1256	nvmet_subsys_put(subsys);
   1257out:
   1258	return ctrl;
   1259}
   1260
   1261u16 nvmet_check_ctrl_status(struct nvmet_req *req)
   1262{
   1263	if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
   1264		pr_err("got cmd %d while CC.EN == 0 on qid = %d\n",
   1265		       req->cmd->common.opcode, req->sq->qid);
   1266		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
   1267	}
   1268
   1269	if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
   1270		pr_err("got cmd %d while CSTS.RDY == 0 on qid = %d\n",
   1271		       req->cmd->common.opcode, req->sq->qid);
   1272		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
   1273	}
   1274	return 0;
   1275}
   1276
   1277bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn)
   1278{
   1279	struct nvmet_host_link *p;
   1280
   1281	lockdep_assert_held(&nvmet_config_sem);
   1282
   1283	if (subsys->allow_any_host)
   1284		return true;
   1285
   1286	if (nvmet_is_disc_subsys(subsys)) /* allow all access to disc subsys */
   1287		return true;
   1288
   1289	list_for_each_entry(p, &subsys->hosts, entry) {
   1290		if (!strcmp(nvmet_host_name(p->host), hostnqn))
   1291			return true;
   1292	}
   1293
   1294	return false;
   1295}
   1296
   1297/*
   1298 * Note: ctrl->subsys->lock should be held when calling this function
   1299 */
   1300static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
   1301		struct nvmet_req *req)
   1302{
   1303	struct nvmet_ns *ns;
   1304	unsigned long idx;
   1305
   1306	if (!req->p2p_client)
   1307		return;
   1308
   1309	ctrl->p2p_client = get_device(req->p2p_client);
   1310
   1311	xa_for_each(&ctrl->subsys->namespaces, idx, ns)
   1312		nvmet_p2pmem_ns_add_p2p(ctrl, ns);
   1313}
   1314
   1315/*
   1316 * Note: ctrl->subsys->lock should be held when calling this function
   1317 */
   1318static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl)
   1319{
   1320	struct radix_tree_iter iter;
   1321	void __rcu **slot;
   1322
   1323	radix_tree_for_each_slot(slot, &ctrl->p2p_ns_map, &iter, 0)
   1324		pci_dev_put(radix_tree_deref_slot(slot));
   1325
   1326	put_device(ctrl->p2p_client);
   1327}
   1328
   1329static void nvmet_fatal_error_handler(struct work_struct *work)
   1330{
   1331	struct nvmet_ctrl *ctrl =
   1332			container_of(work, struct nvmet_ctrl, fatal_err_work);
   1333
   1334	pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid);
   1335	ctrl->ops->delete_ctrl(ctrl);
   1336}
   1337
   1338u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
   1339		struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
   1340{
   1341	struct nvmet_subsys *subsys;
   1342	struct nvmet_ctrl *ctrl;
   1343	int ret;
   1344	u16 status;
   1345
   1346	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
   1347	subsys = nvmet_find_get_subsys(req->port, subsysnqn);
   1348	if (!subsys) {
   1349		pr_warn("connect request for invalid subsystem %s!\n",
   1350			subsysnqn);
   1351		req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn);
   1352		req->error_loc = offsetof(struct nvme_common_command, dptr);
   1353		goto out;
   1354	}
   1355
   1356	down_read(&nvmet_config_sem);
   1357	if (!nvmet_host_allowed(subsys, hostnqn)) {
   1358		pr_info("connect by host %s for subsystem %s not allowed\n",
   1359			hostnqn, subsysnqn);
   1360		req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn);
   1361		up_read(&nvmet_config_sem);
   1362		status = NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR;
   1363		req->error_loc = offsetof(struct nvme_common_command, dptr);
   1364		goto out_put_subsystem;
   1365	}
   1366	up_read(&nvmet_config_sem);
   1367
   1368	status = NVME_SC_INTERNAL;
   1369	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
   1370	if (!ctrl)
   1371		goto out_put_subsystem;
   1372	mutex_init(&ctrl->lock);
   1373
   1374	ctrl->port = req->port;
   1375	ctrl->ops = req->ops;
   1376
   1377#ifdef CONFIG_NVME_TARGET_PASSTHRU
    1378	/* Loop targets clear namespace IDs (clear_ids) by default */
   1379	if (ctrl->port->disc_addr.trtype == NVMF_TRTYPE_LOOP)
   1380		subsys->clear_ids = 1;
   1381#endif
   1382
   1383	INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
   1384	INIT_LIST_HEAD(&ctrl->async_events);
   1385	INIT_RADIX_TREE(&ctrl->p2p_ns_map, GFP_KERNEL);
   1386	INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler);
   1387	INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer);
   1388
   1389	memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
   1390	memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
   1391
   1392	kref_init(&ctrl->ref);
   1393	ctrl->subsys = subsys;
   1394	nvmet_init_cap(ctrl);
   1395	WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL);
   1396
   1397	ctrl->changed_ns_list = kmalloc_array(NVME_MAX_CHANGED_NAMESPACES,
   1398			sizeof(__le32), GFP_KERNEL);
   1399	if (!ctrl->changed_ns_list)
   1400		goto out_free_ctrl;
   1401
   1402	ctrl->sqs = kcalloc(subsys->max_qid + 1,
   1403			sizeof(struct nvmet_sq *),
   1404			GFP_KERNEL);
   1405	if (!ctrl->sqs)
   1406		goto out_free_changed_ns_list;
   1407
   1408	if (subsys->cntlid_min > subsys->cntlid_max)
   1409		goto out_free_sqs;
   1410
   1411	ret = ida_alloc_range(&cntlid_ida,
   1412			     subsys->cntlid_min, subsys->cntlid_max,
   1413			     GFP_KERNEL);
   1414	if (ret < 0) {
   1415		status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
   1416		goto out_free_sqs;
   1417	}
   1418	ctrl->cntlid = ret;
   1419
   1420	/*
   1421	 * Discovery controllers may use some arbitrary high value
    1422	 * in order to clean up stale discovery sessions
   1423	 */
   1424	if (nvmet_is_disc_subsys(ctrl->subsys) && !kato)
   1425		kato = NVMET_DISC_KATO_MS;
   1426
   1427	/* keep-alive timeout in seconds */
   1428	ctrl->kato = DIV_ROUND_UP(kato, 1000);
   1429
   1430	ctrl->err_counter = 0;
   1431	spin_lock_init(&ctrl->error_lock);
   1432
   1433	nvmet_start_keep_alive_timer(ctrl);
   1434
   1435	mutex_lock(&subsys->lock);
   1436	list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
   1437	nvmet_setup_p2p_ns_map(ctrl, req);
   1438	mutex_unlock(&subsys->lock);
   1439
   1440	*ctrlp = ctrl;
   1441	return 0;
   1442
   1443out_free_sqs:
   1444	kfree(ctrl->sqs);
   1445out_free_changed_ns_list:
   1446	kfree(ctrl->changed_ns_list);
   1447out_free_ctrl:
   1448	kfree(ctrl);
   1449out_put_subsystem:
   1450	nvmet_subsys_put(subsys);
   1451out:
   1452	return status;
   1453}
   1454
   1455static void nvmet_ctrl_free(struct kref *ref)
   1456{
   1457	struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref);
   1458	struct nvmet_subsys *subsys = ctrl->subsys;
   1459
   1460	mutex_lock(&subsys->lock);
   1461	nvmet_release_p2p_ns_map(ctrl);
   1462	list_del(&ctrl->subsys_entry);
   1463	mutex_unlock(&subsys->lock);
   1464
   1465	nvmet_stop_keep_alive_timer(ctrl);
   1466
   1467	flush_work(&ctrl->async_event_work);
   1468	cancel_work_sync(&ctrl->fatal_err_work);
   1469
   1470	ida_free(&cntlid_ida, ctrl->cntlid);
   1471
   1472	nvmet_async_events_free(ctrl);
   1473	kfree(ctrl->sqs);
   1474	kfree(ctrl->changed_ns_list);
   1475	kfree(ctrl);
   1476
   1477	nvmet_subsys_put(subsys);
   1478}
   1479
   1480void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
   1481{
   1482	kref_put(&ctrl->ref, nvmet_ctrl_free);
   1483}
   1484
   1485void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl)
   1486{
   1487	mutex_lock(&ctrl->lock);
   1488	if (!(ctrl->csts & NVME_CSTS_CFS)) {
   1489		ctrl->csts |= NVME_CSTS_CFS;
   1490		queue_work(nvmet_wq, &ctrl->fatal_err_work);
   1491	}
   1492	mutex_unlock(&ctrl->lock);
   1493}
   1494EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error);
   1495
   1496static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
   1497		const char *subsysnqn)
   1498{
   1499	struct nvmet_subsys_link *p;
   1500
   1501	if (!port)
   1502		return NULL;
   1503
   1504	if (!strcmp(NVME_DISC_SUBSYS_NAME, subsysnqn)) {
   1505		if (!kref_get_unless_zero(&nvmet_disc_subsys->ref))
   1506			return NULL;
   1507		return nvmet_disc_subsys;
   1508	}
   1509
   1510	down_read(&nvmet_config_sem);
   1511	list_for_each_entry(p, &port->subsystems, entry) {
   1512		if (!strncmp(p->subsys->subsysnqn, subsysnqn,
   1513				NVMF_NQN_SIZE)) {
   1514			if (!kref_get_unless_zero(&p->subsys->ref))
   1515				break;
   1516			up_read(&nvmet_config_sem);
   1517			return p->subsys;
   1518		}
   1519	}
   1520	up_read(&nvmet_config_sem);
   1521	return NULL;
   1522}
   1523
   1524struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
   1525		enum nvme_subsys_type type)
   1526{
   1527	struct nvmet_subsys *subsys;
   1528	char serial[NVMET_SN_MAX_SIZE / 2];
   1529	int ret;
   1530
   1531	subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
   1532	if (!subsys)
   1533		return ERR_PTR(-ENOMEM);
   1534
   1535	subsys->ver = NVMET_DEFAULT_VS;
   1536	/* generate a random serial number as our controllers are ephemeral: */
   1537	get_random_bytes(&serial, sizeof(serial));
   1538	bin2hex(subsys->serial, &serial, sizeof(serial));
   1539
   1540	subsys->model_number = kstrdup(NVMET_DEFAULT_CTRL_MODEL, GFP_KERNEL);
   1541	if (!subsys->model_number) {
   1542		ret = -ENOMEM;
   1543		goto free_subsys;
   1544	}
   1545
   1546	switch (type) {
   1547	case NVME_NQN_NVME:
   1548		subsys->max_qid = NVMET_NR_QUEUES;
   1549		break;
   1550	case NVME_NQN_DISC:
   1551	case NVME_NQN_CURR:
   1552		subsys->max_qid = 0;
   1553		break;
   1554	default:
   1555		pr_err("%s: Unknown Subsystem type - %d\n", __func__, type);
   1556		ret = -EINVAL;
   1557		goto free_mn;
   1558	}
   1559	subsys->type = type;
   1560	subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE,
   1561			GFP_KERNEL);
   1562	if (!subsys->subsysnqn) {
   1563		ret = -ENOMEM;
   1564		goto free_mn;
   1565	}
   1566	subsys->cntlid_min = NVME_CNTLID_MIN;
   1567	subsys->cntlid_max = NVME_CNTLID_MAX;
   1568	kref_init(&subsys->ref);
   1569
   1570	mutex_init(&subsys->lock);
   1571	xa_init(&subsys->namespaces);
   1572	INIT_LIST_HEAD(&subsys->ctrls);
   1573	INIT_LIST_HEAD(&subsys->hosts);
   1574
   1575	return subsys;
   1576
   1577free_mn:
   1578	kfree(subsys->model_number);
   1579free_subsys:
   1580	kfree(subsys);
   1581	return ERR_PTR(ret);
   1582}
   1583
   1584static void nvmet_subsys_free(struct kref *ref)
   1585{
   1586	struct nvmet_subsys *subsys =
   1587		container_of(ref, struct nvmet_subsys, ref);
   1588
   1589	WARN_ON_ONCE(!xa_empty(&subsys->namespaces));
   1590
   1591	xa_destroy(&subsys->namespaces);
   1592	nvmet_passthru_subsys_free(subsys);
   1593
   1594	kfree(subsys->subsysnqn);
   1595	kfree(subsys->model_number);
   1596	kfree(subsys);
   1597}
   1598
   1599void nvmet_subsys_del_ctrls(struct nvmet_subsys *subsys)
   1600{
   1601	struct nvmet_ctrl *ctrl;
   1602
   1603	mutex_lock(&subsys->lock);
   1604	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
   1605		ctrl->ops->delete_ctrl(ctrl);
   1606	mutex_unlock(&subsys->lock);
   1607}
   1608
   1609void nvmet_subsys_put(struct nvmet_subsys *subsys)
   1610{
   1611	kref_put(&subsys->ref, nvmet_subsys_free);
   1612}
   1613
   1614static int __init nvmet_init(void)
   1615{
   1616	int error;
   1617
   1618	nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;
   1619
   1620	zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM, 0);
   1621	if (!zbd_wq)
   1622		return -ENOMEM;
   1623
   1624	buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
   1625			WQ_MEM_RECLAIM, 0);
   1626	if (!buffered_io_wq) {
   1627		error = -ENOMEM;
   1628		goto out_free_zbd_work_queue;
   1629	}
   1630
   1631	nvmet_wq = alloc_workqueue("nvmet-wq", WQ_MEM_RECLAIM, 0);
   1632	if (!nvmet_wq) {
   1633		error = -ENOMEM;
   1634		goto out_free_buffered_work_queue;
   1635	}
   1636
   1637	error = nvmet_init_discovery();
   1638	if (error)
   1639		goto out_free_nvmet_work_queue;
   1640
   1641	error = nvmet_init_configfs();
   1642	if (error)
   1643		goto out_exit_discovery;
   1644	return 0;
   1645
   1646out_exit_discovery:
   1647	nvmet_exit_discovery();
   1648out_free_nvmet_work_queue:
   1649	destroy_workqueue(nvmet_wq);
   1650out_free_buffered_work_queue:
   1651	destroy_workqueue(buffered_io_wq);
   1652out_free_zbd_work_queue:
   1653	destroy_workqueue(zbd_wq);
   1654	return error;
   1655}
   1656
   1657static void __exit nvmet_exit(void)
   1658{
   1659	nvmet_exit_configfs();
   1660	nvmet_exit_discovery();
   1661	ida_destroy(&cntlid_ida);
   1662	destroy_workqueue(nvmet_wq);
   1663	destroy_workqueue(buffered_io_wq);
   1664	destroy_workqueue(zbd_wq);
   1665
   1666	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
   1667	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
   1668}
   1669
   1670module_init(nvmet_init);
   1671module_exit(nvmet_exit);
   1672
   1673MODULE_LICENSE("GPL v2");