cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

pci.c (95663B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * NVM Express device driver
      4 * Copyright (c) 2011-2014, Intel Corporation.
      5 */
      6
      7#include <linux/acpi.h>
      8#include <linux/aer.h>
      9#include <linux/async.h>
     10#include <linux/blkdev.h>
     11#include <linux/blk-mq.h>
     12#include <linux/blk-mq-pci.h>
     13#include <linux/blk-integrity.h>
     14#include <linux/dmi.h>
     15#include <linux/init.h>
     16#include <linux/interrupt.h>
     17#include <linux/io.h>
     18#include <linux/memremap.h>
     19#include <linux/mm.h>
     20#include <linux/module.h>
     21#include <linux/mutex.h>
     22#include <linux/once.h>
     23#include <linux/pci.h>
     24#include <linux/suspend.h>
     25#include <linux/t10-pi.h>
     26#include <linux/types.h>
     27#include <linux/io-64-nonatomic-lo-hi.h>
     28#include <linux/io-64-nonatomic-hi-lo.h>
     29#include <linux/sed-opal.h>
     30#include <linux/pci-p2pdma.h>
     31
     32#include "trace.h"
     33#include "nvme.h"
     34
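       /*
        * SQ_SIZE/CQ_SIZE give the size in bytes of a queue's command ring:
        * q_depth entries, where ->sqes is the log2 of the submission queue
        * entry size (64-byte entries for most controllers, larger via
        * dev->io_sqes for quirked ones) and completion entries are a fixed
        * sizeof(struct nvme_completion).
        */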
     35#define SQ_SIZE(q)	((q)->q_depth << (q)->sqes)
     36#define CQ_SIZE(q)	((q)->q_depth * sizeof(struct nvme_completion))
     37
     38#define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))
     39
     40/*
     41 * These can be higher, but we need to ensure that no command requires
     42 * an sg allocation that needs more than a page of data.
     43 */
     44#define NVME_MAX_KB_SZ	4096
     45#define NVME_MAX_SEGS	127
     46
     47static int use_threaded_interrupts;
     48module_param(use_threaded_interrupts, int, 0444);
     49
     50static bool use_cmb_sqes = true;
     51module_param(use_cmb_sqes, bool, 0444);
     52MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
     53
     54static unsigned int max_host_mem_size_mb = 128;
     55module_param(max_host_mem_size_mb, uint, 0444);
     56MODULE_PARM_DESC(max_host_mem_size_mb,
     57	"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");
     58
     59static unsigned int sgl_threshold = SZ_32K;
     60module_param(sgl_threshold, uint, 0644);
     61MODULE_PARM_DESC(sgl_threshold,
     62		"Use SGLs when average request segment size is larger than or equal to "
     63		"this size. Use 0 to disable SGLs.");
     64
     65#define NVME_PCI_MIN_QUEUE_SIZE 2
     66#define NVME_PCI_MAX_QUEUE_SIZE 4095
     67static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
     68static const struct kernel_param_ops io_queue_depth_ops = {
     69	.set = io_queue_depth_set,
     70	.get = param_get_uint,
     71};
     72
     73static unsigned int io_queue_depth = 1024;
     74module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
     75MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should be >= 2 and < 4096");
     76
     77static int io_queue_count_set(const char *val, const struct kernel_param *kp)
     78{
     79	unsigned int n;
     80	int ret;
     81
     82	ret = kstrtouint(val, 10, &n);
     83	if (ret != 0 || n > num_possible_cpus())
     84		return -EINVAL;
     85	return param_set_uint(val, kp);
     86}
     87
     88static const struct kernel_param_ops io_queue_count_ops = {
     89	.set = io_queue_count_set,
     90	.get = param_get_uint,
     91};
     92
     93static unsigned int write_queues;
     94module_param_cb(write_queues, &io_queue_count_ops, &write_queues, 0644);
     95MODULE_PARM_DESC(write_queues,
     96	"Number of queues to use for writes. If not set, reads and writes "
     97	"will share a queue set.");
     98
     99static unsigned int poll_queues;
    100module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644);
    101MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
    102
    103static bool noacpi;
    104module_param(noacpi, bool, 0444);
    105MODULE_PARM_DESC(noacpi, "disable acpi bios quirks");
    106
    107struct nvme_dev;
    108struct nvme_queue;
    109
    110static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
    111static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
    112
    113/*
    114 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
    115 */
    116struct nvme_dev {
    117	struct nvme_queue *queues;
    118	struct blk_mq_tag_set tagset;
    119	struct blk_mq_tag_set admin_tagset;
    120	u32 __iomem *dbs;
    121	struct device *dev;
    122	struct dma_pool *prp_page_pool;
    123	struct dma_pool *prp_small_pool;
    124	unsigned online_queues;
    125	unsigned max_qid;
    126	unsigned io_queues[HCTX_MAX_TYPES];
    127	unsigned int num_vecs;
    128	u32 q_depth;
    129	int io_sqes;
    130	u32 db_stride;
    131	void __iomem *bar;
    132	unsigned long bar_mapped_size;
    133	struct work_struct remove_work;
    134	struct mutex shutdown_lock;
    135	bool subsystem;
    136	u64 cmb_size;
    137	bool cmb_use_sqes;
    138	u32 cmbsz;
    139	u32 cmbloc;
    140	struct nvme_ctrl ctrl;
    141	u32 last_ps;
    142	bool hmb;
    143
    144	mempool_t *iod_mempool;
    145
    146	/* shadow doorbell buffer support: */
    147	u32 *dbbuf_dbs;
    148	dma_addr_t dbbuf_dbs_dma_addr;
    149	u32 *dbbuf_eis;
    150	dma_addr_t dbbuf_eis_dma_addr;
    151
    152	/* host memory buffer support: */
    153	u64 host_mem_size;
    154	u32 nr_host_mem_descs;
    155	dma_addr_t host_mem_descs_dma;
    156	struct nvme_host_mem_buf_desc *host_mem_descs;
    157	void **host_mem_desc_bufs;
    158	unsigned int nr_allocated_queues;
    159	unsigned int nr_write_queues;
    160	unsigned int nr_poll_queues;
    161
    162	bool attrs_added;
    163};
    164
    165static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
    166{
    167	return param_set_uint_minmax(val, kp, NVME_PCI_MIN_QUEUE_SIZE,
    168			NVME_PCI_MAX_QUEUE_SIZE);
    169}
    170
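       /*
        * Doorbell register and shadow doorbell buffer layout, indexed in
        * 32-bit words and scaled by the doorbell stride: queue qid's SQ
        * tail doorbell lives at word 2*qid*stride and its CQ head doorbell
        * at word (2*qid+1)*stride, e.g. with a stride of 1:
        *
        *   [SQ0 tail][CQ0 head][SQ1 tail][CQ1 head]...
        */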
    171static inline unsigned int sq_idx(unsigned int qid, u32 stride)
    172{
    173	return qid * 2 * stride;
    174}
    175
    176static inline unsigned int cq_idx(unsigned int qid, u32 stride)
    177{
    178	return (qid * 2 + 1) * stride;
    179}
    180
    181static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
    182{
    183	return container_of(ctrl, struct nvme_dev, ctrl);
    184}
    185
    186/*
    187 * An NVM Express queue.  Each device has at least two (one for admin
    188 * commands and one for I/O commands).
    189 */
    190struct nvme_queue {
    191	struct nvme_dev *dev;
    192	spinlock_t sq_lock;
    193	void *sq_cmds;
    194	 /* only used for poll queues: */
    195	spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
    196	struct nvme_completion *cqes;
    197	dma_addr_t sq_dma_addr;
    198	dma_addr_t cq_dma_addr;
    199	u32 __iomem *q_db;
    200	u32 q_depth;
    201	u16 cq_vector;
    202	u16 sq_tail;
    203	u16 last_sq_tail;
    204	u16 cq_head;
    205	u16 qid;
    206	u8 cq_phase;
    207	u8 sqes;
    208	unsigned long flags;
    209#define NVMEQ_ENABLED		0
    210#define NVMEQ_SQ_CMB		1
    211#define NVMEQ_DELETE_ERROR	2
    212#define NVMEQ_POLLED		3
    213	u32 *dbbuf_sq_db;
    214	u32 *dbbuf_cq_db;
    215	u32 *dbbuf_sq_ei;
    216	u32 *dbbuf_cq_ei;
    217	struct completion delete_done;
    218};
    219
    220/*
    221 * The nvme_iod describes the data in an I/O.
    222 *
    223 * The sg pointer contains the list of PRP/SGL chunk allocations in addition
    224 * to the actual struct scatterlist.
    225 */
    226struct nvme_iod {
    227	struct nvme_request req;
    228	struct nvme_command cmd;
    229	struct nvme_queue *nvmeq;
    230	bool use_sgl;
    231	int aborted;
    232	int npages;		/* In the PRP list. 0 means small pool in use */
    233	int nents;		/* Used in scatterlist */
    234	dma_addr_t first_dma;
    235	unsigned int dma_len;	/* length of single DMA segment mapping */
    236	dma_addr_t meta_dma;
    237	struct scatterlist *sg;
    238};
    239
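       /*
        * Size of one shadow doorbell (or event index) buffer: every
        * allocated queue needs two 32-bit slots (SQ tail and CQ head),
        * each scaled by the doorbell stride, hence 8 * db_stride bytes
        * per queue.  The same size is used for both the dbbuf_dbs and
        * dbbuf_eis allocations below.
        */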
    240static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
    241{
    242	return dev->nr_allocated_queues * 8 * dev->db_stride;
    243}
    244
    245static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
    246{
    247	unsigned int mem_size = nvme_dbbuf_size(dev);
    248
    249	if (dev->dbbuf_dbs) {
    250		/*
    251		 * Clear the dbbuf memory so the driver doesn't observe stale
    252		 * values from the previous instantiation.
    253		 */
    254		memset(dev->dbbuf_dbs, 0, mem_size);
    255		memset(dev->dbbuf_eis, 0, mem_size);
    256		return 0;
    257	}
    258
    259	dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
    260					    &dev->dbbuf_dbs_dma_addr,
    261					    GFP_KERNEL);
    262	if (!dev->dbbuf_dbs)
    263		return -ENOMEM;
    264	dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
    265					    &dev->dbbuf_eis_dma_addr,
    266					    GFP_KERNEL);
    267	if (!dev->dbbuf_eis) {
    268		dma_free_coherent(dev->dev, mem_size,
    269				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
    270		dev->dbbuf_dbs = NULL;
    271		return -ENOMEM;
    272	}
    273
    274	return 0;
    275}
    276
    277static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
    278{
    279	unsigned int mem_size = nvme_dbbuf_size(dev);
    280
    281	if (dev->dbbuf_dbs) {
    282		dma_free_coherent(dev->dev, mem_size,
    283				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
    284		dev->dbbuf_dbs = NULL;
    285	}
    286	if (dev->dbbuf_eis) {
    287		dma_free_coherent(dev->dev, mem_size,
    288				  dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
    289		dev->dbbuf_eis = NULL;
    290	}
    291}
    292
    293static void nvme_dbbuf_init(struct nvme_dev *dev,
    294			    struct nvme_queue *nvmeq, int qid)
    295{
    296	if (!dev->dbbuf_dbs || !qid)
    297		return;
    298
    299	nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
    300	nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
    301	nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
    302	nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
    303}
    304
    305static void nvme_dbbuf_free(struct nvme_queue *nvmeq)
    306{
    307	if (!nvmeq->qid)
    308		return;
    309
    310	nvmeq->dbbuf_sq_db = NULL;
    311	nvmeq->dbbuf_cq_db = NULL;
    312	nvmeq->dbbuf_sq_ei = NULL;
    313	nvmeq->dbbuf_cq_ei = NULL;
    314}
    315
    316static void nvme_dbbuf_set(struct nvme_dev *dev)
    317{
    318	struct nvme_command c = { };
    319	unsigned int i;
    320
    321	if (!dev->dbbuf_dbs)
    322		return;
    323
    324	c.dbbuf.opcode = nvme_admin_dbbuf;
    325	c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
    326	c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);
    327
    328	if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
    329		dev_warn(dev->ctrl.device, "unable to set dbbuf\n");
    330		/* Free memory and continue on */
    331		nvme_dbbuf_dma_free(dev);
    332
    333		for (i = 1; i <= dev->online_queues; i++)
    334			nvme_dbbuf_free(&dev->queues[i]);
    335	}
    336}
    337
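       /*
        * Essentially the same window check virtio's vring_need_event() uses:
        * returns true when moving the doorbell from @old to @new_idx
        * (mod 2^16) steps past @event_idx, i.e. event_idx lies in the
        * half-open window [old, new_idx).  Example: old=10, new_idx=15 ->
        * an MMIO write is only needed if the controller set its event
        * index to 10..14.
        */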
    338static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
    339{
    340	return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
    341}
    342
    343/* Update dbbuf and return true if an MMIO is required */
    344static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
    345					      volatile u32 *dbbuf_ei)
    346{
    347	if (dbbuf_db) {
    348		u16 old_value;
    349
    350		/*
    351		 * Ensure that the queue is written before updating
    352		 * the doorbell in memory
    353		 */
    354		wmb();
    355
    356		old_value = *dbbuf_db;
    357		*dbbuf_db = value;
    358
    359		/*
    360		 * Ensure that the doorbell is updated before reading the event
    361		 * index from memory.  The controller needs to provide similar
    362		 * ordering to ensure the event index is updated before reading
    363		 * the doorbell.
    364		 */
    365		mb();
    366
    367		if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
    368			return false;
    369	}
    370
    371	return true;
    372}
    373
    374/*
    375 * Will slightly overestimate the number of pages needed.  This is OK
    376 * as it only leads to a small amount of wasted memory for the lifetime of
    377 * the I/O.
    378 */
    379static int nvme_pci_npages_prp(void)
    380{
    381	unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE;
    382	unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE);
    383	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
    384}
    385
    386/*
    387 * Calculates the number of pages needed for the SGL segments. For example a 4k
    388 * page can accommodate 256 SGL descriptors.
    389 */
    390static int nvme_pci_npages_sgl(void)
    391{
    392	return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc),
    393			PAGE_SIZE);
    394}
    395
    396static size_t nvme_pci_iod_alloc_size(void)
    397{
    398	size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
    399
    400	return sizeof(__le64 *) * npages +
    401		sizeof(struct scatterlist) * NVME_MAX_SEGS;
    402}
    403
    404static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
    405				unsigned int hctx_idx)
    406{
    407	struct nvme_dev *dev = data;
    408	struct nvme_queue *nvmeq = &dev->queues[0];
    409
    410	WARN_ON(hctx_idx != 0);
    411	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
    412
    413	hctx->driver_data = nvmeq;
    414	return 0;
    415}
    416
    417static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
    418			  unsigned int hctx_idx)
    419{
    420	struct nvme_dev *dev = data;
    421	struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
    422
    423	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
    424	hctx->driver_data = nvmeq;
    425	return 0;
    426}
    427
    428static int nvme_pci_init_request(struct blk_mq_tag_set *set,
    429		struct request *req, unsigned int hctx_idx,
    430		unsigned int numa_node)
    431{
    432	struct nvme_dev *dev = set->driver_data;
    433	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
    434	int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
    435	struct nvme_queue *nvmeq = &dev->queues[queue_idx];
    436
    437	BUG_ON(!nvmeq);
    438	iod->nvmeq = nvmeq;
    439
    440	nvme_req(req)->ctrl = &dev->ctrl;
    441	nvme_req(req)->cmd = &iod->cmd;
    442	return 0;
    443}
    444
    445static int queue_irq_offset(struct nvme_dev *dev)
    446{
    447	/* if we have more than 1 vec, admin queue offsets us by 1 */
    448	if (dev->num_vecs > 1)
    449		return 1;
    450
    451	return 0;
    452}
    453
    454static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
    455{
    456	struct nvme_dev *dev = set->driver_data;
    457	int i, qoff, offset;
    458
    459	offset = queue_irq_offset(dev);
    460	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
    461		struct blk_mq_queue_map *map = &set->map[i];
    462
    463		map->nr_queues = dev->io_queues[i];
    464		if (!map->nr_queues) {
    465			BUG_ON(i == HCTX_TYPE_DEFAULT);
    466			continue;
    467		}
    468
    469		/*
    470		 * The poll queue(s) doesn't have an IRQ (and hence IRQ
    471		 * affinity), so use the regular blk-mq cpu mapping
    472		 */
    473		map->queue_offset = qoff;
    474		if (i != HCTX_TYPE_POLL && offset)
    475			blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
    476		else
    477			blk_mq_map_queues(map);
    478		qoff += map->nr_queues;
    479		offset += map->nr_queues;
    480	}
    481
    482	return 0;
    483}
    484
    485/*
    486 * Write sq tail if we are asked to, or if the next command would wrap.
    487 */
    488static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
    489{
    490	if (!write_sq) {
    491		u16 next_tail = nvmeq->sq_tail + 1;
    492
    493		if (next_tail == nvmeq->q_depth)
    494			next_tail = 0;
    495		if (next_tail != nvmeq->last_sq_tail)
    496			return;
    497	}
    498
    499	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
    500			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
    501		writel(nvmeq->sq_tail, nvmeq->q_db);
    502	nvmeq->last_sq_tail = nvmeq->sq_tail;
    503}
    504
    505static inline void nvme_sq_copy_cmd(struct nvme_queue *nvmeq,
    506				    struct nvme_command *cmd)
    507{
    508	memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
    509		absolute_pointer(cmd), sizeof(*cmd));
    510	if (++nvmeq->sq_tail == nvmeq->q_depth)
    511		nvmeq->sq_tail = 0;
    512}
    513
    514static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
    515{
    516	struct nvme_queue *nvmeq = hctx->driver_data;
    517
    518	spin_lock(&nvmeq->sq_lock);
    519	if (nvmeq->sq_tail != nvmeq->last_sq_tail)
    520		nvme_write_sq_db(nvmeq, true);
    521	spin_unlock(&nvmeq->sq_lock);
    522}
    523
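       /*
        * The array of pointers to PRP/SGL list pages is stored right after
        * the request's scatterlist entries, inside the buffer allocated
        * from dev->iod_mempool (sized by nvme_pci_iod_alloc_size()).
        */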
    524static void **nvme_pci_iod_list(struct request *req)
    525{
    526	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
    527	return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
    528}
    529
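       /*
        * Decide between PRPs and SGLs for a request: SGLs are only used on
        * I/O queues, only if the controller reports SGL support, and only
        * when the average segment size is at least sgl_threshold (0
        * disables SGLs entirely).
        */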
    530static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
    531{
    532	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
    533	int nseg = blk_rq_nr_phys_segments(req);
    534	unsigned int avg_seg_size;
    535
    536	avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
    537
    538	if (!nvme_ctrl_sgl_supported(&dev->ctrl))
    539		return false;
    540	if (!iod->nvmeq->qid)
    541		return false;
    542	if (!sgl_threshold || avg_seg_size < sgl_threshold)
    543		return false;
    544	return true;
    545}
    546
    547static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
    548{
    549	const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
    550	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
    551	dma_addr_t dma_addr = iod->first_dma;
    552	int i;
    553
    554	for (i = 0; i < iod->npages; i++) {
    555		__le64 *prp_list = nvme_pci_iod_list(req)[i];
    556		dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
    557
    558		dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
    559		dma_addr = next_dma_addr;
    560	}
    561}
    562
    563static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
    564{
    565	const int last_sg = SGES_PER_PAGE - 1;
    566	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
    567	dma_addr_t dma_addr = iod->first_dma;
    568	int i;
    569
    570	for (i = 0; i < iod->npages; i++) {
    571		struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i];
    572		dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr);
    573
    574		dma_pool_free(dev->prp_page_pool, sg_list, dma_addr);
    575		dma_addr = next_dma_addr;
    576	}
    577}
    578
    579static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
    580{
    581	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
    582
    583	if (is_pci_p2pdma_page(sg_page(iod->sg)))
    584		pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
    585				    rq_dma_dir(req));
    586	else
    587		dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
    588}
    589
    590static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
    591{
    592	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
    593
    594	if (iod->dma_len) {
    595		dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
    596			       rq_dma_dir(req));
    597		return;
    598	}
    599
    600	WARN_ON_ONCE(!iod->nents);
    601
    602	nvme_unmap_sg(dev, req);
    603	if (iod->npages == 0)
    604		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
    605			      iod->first_dma);
    606	else if (iod->use_sgl)
    607		nvme_free_sgls(dev, req);
    608	else
    609		nvme_free_prps(dev, req);
    610	mempool_free(iod->sg, dev->iod_mempool);
    611}
    612
    613static void nvme_print_sgl(struct scatterlist *sgl, int nents)
    614{
    615	int i;
    616	struct scatterlist *sg;
    617
    618	for_each_sg(sgl, sg, nents, i) {
    619		dma_addr_t phys = sg_phys(sg);
    620		pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
    621			"dma_address:%pad dma_length:%d\n",
    622			i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
    623			sg_dma_len(sg));
    624	}
    625}
    626
    627static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
    628		struct request *req, struct nvme_rw_command *cmnd)
    629{
    630	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
    631	struct dma_pool *pool;
    632	int length = blk_rq_payload_bytes(req);
    633	struct scatterlist *sg = iod->sg;
    634	int dma_len = sg_dma_len(sg);
    635	u64 dma_addr = sg_dma_address(sg);
    636	int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
    637	__le64 *prp_list;
    638	void **list = nvme_pci_iod_list(req);
    639	dma_addr_t prp_dma;
    640	int nprps, i;
    641
    642	length -= (NVME_CTRL_PAGE_SIZE - offset);
    643	if (length <= 0) {
    644		iod->first_dma = 0;
    645		goto done;
    646	}
    647
    648	dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
    649	if (dma_len) {
    650		dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
    651	} else {
    652		sg = sg_next(sg);
    653		dma_addr = sg_dma_address(sg);
    654		dma_len = sg_dma_len(sg);
    655	}
    656
    657	if (length <= NVME_CTRL_PAGE_SIZE) {
    658		iod->first_dma = dma_addr;
    659		goto done;
    660	}
    661
    662	nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
    663	if (nprps <= (256 / 8)) {
    664		pool = dev->prp_small_pool;
    665		iod->npages = 0;
    666	} else {
    667		pool = dev->prp_page_pool;
    668		iod->npages = 1;
    669	}
    670
    671	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
    672	if (!prp_list) {
    673		iod->first_dma = dma_addr;
    674		iod->npages = -1;
    675		return BLK_STS_RESOURCE;
    676	}
    677	list[0] = prp_list;
    678	iod->first_dma = prp_dma;
    679	i = 0;
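       	/*
       	 * Build the PRP list: each list page holds NVME_CTRL_PAGE_SIZE / 8
       	 * entries, and when a page fills up its last slot is re-used as a
       	 * pointer to the next list page, which is how nvme_free_prps()
       	 * later walks the chain.
       	 */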
    680	for (;;) {
    681		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
    682			__le64 *old_prp_list = prp_list;
    683			prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
    684			if (!prp_list)
    685				goto free_prps;
    686			list[iod->npages++] = prp_list;
    687			prp_list[0] = old_prp_list[i - 1];
    688			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
    689			i = 1;
    690		}
    691		prp_list[i++] = cpu_to_le64(dma_addr);
    692		dma_len -= NVME_CTRL_PAGE_SIZE;
    693		dma_addr += NVME_CTRL_PAGE_SIZE;
    694		length -= NVME_CTRL_PAGE_SIZE;
    695		if (length <= 0)
    696			break;
    697		if (dma_len > 0)
    698			continue;
    699		if (unlikely(dma_len < 0))
    700			goto bad_sgl;
    701		sg = sg_next(sg);
    702		dma_addr = sg_dma_address(sg);
    703		dma_len = sg_dma_len(sg);
    704	}
    705done:
    706	cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
    707	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
    708	return BLK_STS_OK;
    709free_prps:
    710	nvme_free_prps(dev, req);
    711	return BLK_STS_RESOURCE;
    712bad_sgl:
    713	WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents),
    714			"Invalid SGL for payload:%d nents:%d\n",
    715			blk_rq_payload_bytes(req), iod->nents);
    716	return BLK_STS_IOERR;
    717}
    718
    719static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
    720		struct scatterlist *sg)
    721{
    722	sge->addr = cpu_to_le64(sg_dma_address(sg));
    723	sge->length = cpu_to_le32(sg_dma_len(sg));
    724	sge->type = NVME_SGL_FMT_DATA_DESC << 4;
    725}
    726
    727static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
    728		dma_addr_t dma_addr, int entries)
    729{
    730	sge->addr = cpu_to_le64(dma_addr);
    731	if (entries < SGES_PER_PAGE) {
    732		sge->length = cpu_to_le32(entries * sizeof(*sge));
    733		sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
    734	} else {
    735		sge->length = cpu_to_le32(PAGE_SIZE);
    736		sge->type = NVME_SGL_FMT_SEG_DESC << 4;
    737	}
    738}
    739
    740static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
    741		struct request *req, struct nvme_rw_command *cmd, int entries)
    742{
    743	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
    744	struct dma_pool *pool;
    745	struct nvme_sgl_desc *sg_list;
    746	struct scatterlist *sg = iod->sg;
    747	dma_addr_t sgl_dma;
    748	int i = 0;
    749
    750	/* setting the transfer type as SGL */
    751	cmd->flags = NVME_CMD_SGL_METABUF;
    752
    753	if (entries == 1) {
    754		nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
    755		return BLK_STS_OK;
    756	}
    757
    758	if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
    759		pool = dev->prp_small_pool;
    760		iod->npages = 0;
    761	} else {
    762		pool = dev->prp_page_pool;
    763		iod->npages = 1;
    764	}
    765
    766	sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
    767	if (!sg_list) {
    768		iod->npages = -1;
    769		return BLK_STS_RESOURCE;
    770	}
    771
    772	nvme_pci_iod_list(req)[0] = sg_list;
    773	iod->first_dma = sgl_dma;
    774
    775	nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
    776
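       	/*
       	 * Fill in the data descriptors.  When a descriptor page fills up,
       	 * the last descriptor of the old page is moved to the start of a
       	 * newly allocated page and its old slot is rewritten as a segment
       	 * descriptor chaining to that new page.
       	 */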
    777	do {
    778		if (i == SGES_PER_PAGE) {
    779			struct nvme_sgl_desc *old_sg_desc = sg_list;
    780			struct nvme_sgl_desc *link = &old_sg_desc[i - 1];
    781
    782			sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
    783			if (!sg_list)
    784				goto free_sgls;
    785
    786			i = 0;
    787			nvme_pci_iod_list(req)[iod->npages++] = sg_list;
    788			sg_list[i++] = *link;
    789			nvme_pci_sgl_set_seg(link, sgl_dma, entries);
    790		}
    791
    792		nvme_pci_sgl_set_data(&sg_list[i++], sg);
    793		sg = sg_next(sg);
    794	} while (--entries > 0);
    795
    796	return BLK_STS_OK;
    797free_sgls:
    798	nvme_free_sgls(dev, req);
    799	return BLK_STS_RESOURCE;
    800}
    801
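       /*
        * Fast path for requests with a single physical segment that fits in
        * at most two controller pages: map the bio_vec directly and describe
        * it with PRP1 (plus PRP2 if it crosses a controller page boundary),
        * with no PRP list and no scatterlist allocation.
        */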
    802static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
    803		struct request *req, struct nvme_rw_command *cmnd,
    804		struct bio_vec *bv)
    805{
    806	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
    807	unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
    808	unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
    809
    810	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
    811	if (dma_mapping_error(dev->dev, iod->first_dma))
    812		return BLK_STS_RESOURCE;
    813	iod->dma_len = bv->bv_len;
    814
    815	cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
    816	if (bv->bv_len > first_prp_len)
    817		cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
    818	return BLK_STS_OK;
    819}
    820
    821static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
    822		struct request *req, struct nvme_rw_command *cmnd,
    823		struct bio_vec *bv)
    824{
    825	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
    826
    827	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
    828	if (dma_mapping_error(dev->dev, iod->first_dma))
    829		return BLK_STS_RESOURCE;
    830	iod->dma_len = bv->bv_len;
    831
    832	cmnd->flags = NVME_CMD_SGL_METABUF;
    833	cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
    834	cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
    835	cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
    836	return BLK_STS_OK;
    837}
    838
    839static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
    840		struct nvme_command *cmnd)
    841{
    842	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
    843	blk_status_t ret = BLK_STS_RESOURCE;
    844	int nr_mapped;
    845
    846	if (blk_rq_nr_phys_segments(req) == 1) {
    847		struct bio_vec bv = req_bvec(req);
    848
    849		if (!is_pci_p2pdma_page(bv.bv_page)) {
    850			if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
    851				return nvme_setup_prp_simple(dev, req,
    852							     &cmnd->rw, &bv);
    853
    854			if (iod->nvmeq->qid && sgl_threshold &&
    855			    nvme_ctrl_sgl_supported(&dev->ctrl))
    856				return nvme_setup_sgl_simple(dev, req,
    857							     &cmnd->rw, &bv);
    858		}
    859	}
    860
    861	iod->dma_len = 0;
    862	iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
    863	if (!iod->sg)
    864		return BLK_STS_RESOURCE;
    865	sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
    866	iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
    867	if (!iod->nents)
    868		goto out_free_sg;
    869
    870	if (is_pci_p2pdma_page(sg_page(iod->sg)))
    871		nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg,
    872				iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN);
    873	else
    874		nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
    875					     rq_dma_dir(req), DMA_ATTR_NO_WARN);
    876	if (!nr_mapped)
    877		goto out_free_sg;
    878
    879	iod->use_sgl = nvme_pci_use_sgls(dev, req);
    880	if (iod->use_sgl)
    881		ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
    882	else
    883		ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
    884	if (ret != BLK_STS_OK)
    885		goto out_unmap_sg;
    886	return BLK_STS_OK;
    887
    888out_unmap_sg:
    889	nvme_unmap_sg(dev, req);
    890out_free_sg:
    891	mempool_free(iod->sg, dev->iod_mempool);
    892	return ret;
    893}
    894
    895static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
    896		struct nvme_command *cmnd)
    897{
    898	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
    899
    900	iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
    901			rq_dma_dir(req), 0);
    902	if (dma_mapping_error(dev->dev, iod->meta_dma))
    903		return BLK_STS_IOERR;
    904	cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
    905	return BLK_STS_OK;
    906}
    907
    908static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
    909{
    910	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
    911	blk_status_t ret;
    912
    913	iod->aborted = 0;
    914	iod->npages = -1;
    915	iod->nents = 0;
    916
    917	ret = nvme_setup_cmd(req->q->queuedata, req);
    918	if (ret)
    919		return ret;
    920
    921	if (blk_rq_nr_phys_segments(req)) {
    922		ret = nvme_map_data(dev, req, &iod->cmd);
    923		if (ret)
    924			goto out_free_cmd;
    925	}
    926
    927	if (blk_integrity_rq(req)) {
    928		ret = nvme_map_metadata(dev, req, &iod->cmd);
    929		if (ret)
    930			goto out_unmap_data;
    931	}
    932
    933	blk_mq_start_request(req);
    934	return BLK_STS_OK;
    935out_unmap_data:
    936	nvme_unmap_data(dev, req);
    937out_free_cmd:
    938	nvme_cleanup_cmd(req);
    939	return ret;
    940}
    941
    942/*
    943 * NOTE: ns is NULL when called on the admin queue.
    944 */
    945static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
    946			 const struct blk_mq_queue_data *bd)
    947{
    948	struct nvme_queue *nvmeq = hctx->driver_data;
    949	struct nvme_dev *dev = nvmeq->dev;
    950	struct request *req = bd->rq;
    951	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
    952	blk_status_t ret;
    953
    954	/*
    955	 * We should not need to do this, but we're still using this to
    956	 * ensure we can drain requests on a dying queue.
    957	 */
    958	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
    959		return BLK_STS_IOERR;
    960
    961	if (unlikely(!nvme_check_ready(&dev->ctrl, req, true)))
    962		return nvme_fail_nonready_command(&dev->ctrl, req);
    963
    964	ret = nvme_prep_rq(dev, req);
    965	if (unlikely(ret))
    966		return ret;
    967	spin_lock(&nvmeq->sq_lock);
    968	nvme_sq_copy_cmd(nvmeq, &iod->cmd);
    969	nvme_write_sq_db(nvmeq, bd->last);
    970	spin_unlock(&nvmeq->sq_lock);
    971	return BLK_STS_OK;
    972}
    973
    974static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
    975{
    976	spin_lock(&nvmeq->sq_lock);
    977	while (!rq_list_empty(*rqlist)) {
    978		struct request *req = rq_list_pop(rqlist);
    979		struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
    980
    981		nvme_sq_copy_cmd(nvmeq, &iod->cmd);
    982	}
    983	nvme_write_sq_db(nvmeq, true);
    984	spin_unlock(&nvmeq->sq_lock);
    985}
    986
    987static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
    988{
    989	/*
    990	 * We should not need to do this, but we're still using this to
    991	 * ensure we can drain requests on a dying queue.
    992	 */
    993	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
    994		return false;
    995	if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
    996		return false;
    997
    998	req->mq_hctx->tags->rqs[req->tag] = req;
    999	return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
   1000}
   1001
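       /*
        * Batch submission entry point (->queue_rqs): walk the plugged list
        * and submit each run of consecutive requests that map to the same
        * hardware queue with a single SQ doorbell write.  Requests that fail
        * preparation are collected on a separate list and handed back via
        * *rqlist so the block layer falls back to its one-at-a-time path.
        */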
   1002static void nvme_queue_rqs(struct request **rqlist)
   1003{
   1004	struct request *req, *next, *prev = NULL;
   1005	struct request *requeue_list = NULL;
   1006
   1007	rq_list_for_each_safe(rqlist, req, next) {
   1008		struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
   1009
   1010		if (!nvme_prep_rq_batch(nvmeq, req)) {
   1011			/* detach 'req' and add to remainder list */
   1012			rq_list_move(rqlist, &requeue_list, req, prev);
   1013
   1014			req = prev;
   1015			if (!req)
   1016				continue;
   1017		}
   1018
   1019		if (!next || req->mq_hctx != next->mq_hctx) {
   1020			/* detach rest of list, and submit */
   1021			req->rq_next = NULL;
   1022			nvme_submit_cmds(nvmeq, rqlist);
   1023			*rqlist = next;
   1024			prev = NULL;
   1025		} else
   1026			prev = req;
   1027	}
   1028
   1029	*rqlist = requeue_list;
   1030}
   1031
   1032static __always_inline void nvme_pci_unmap_rq(struct request *req)
   1033{
   1034	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
   1035	struct nvme_dev *dev = iod->nvmeq->dev;
   1036
   1037	if (blk_integrity_rq(req))
   1038		dma_unmap_page(dev->dev, iod->meta_dma,
   1039			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
   1040	if (blk_rq_nr_phys_segments(req))
   1041		nvme_unmap_data(dev, req);
   1042}
   1043
   1044static void nvme_pci_complete_rq(struct request *req)
   1045{
   1046	nvme_pci_unmap_rq(req);
   1047	nvme_complete_rq(req);
   1048}
   1049
   1050static void nvme_pci_complete_batch(struct io_comp_batch *iob)
   1051{
   1052	nvme_complete_batch(iob, nvme_pci_unmap_rq);
   1053}
   1054
   1055/* We read the CQE phase first to check if the rest of the entry is valid */
   1056static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
   1057{
   1058	struct nvme_completion *hcqe = &nvmeq->cqes[nvmeq->cq_head];
   1059
   1060	return (le16_to_cpu(READ_ONCE(hcqe->status)) & 1) == nvmeq->cq_phase;
   1061}
   1062
   1063static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
   1064{
   1065	u16 head = nvmeq->cq_head;
   1066
   1067	if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
   1068					      nvmeq->dbbuf_cq_ei))
   1069		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
   1070}
   1071
   1072static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
   1073{
   1074	if (!nvmeq->qid)
   1075		return nvmeq->dev->admin_tagset.tags[0];
   1076	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
   1077}
   1078
   1079static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
   1080				   struct io_comp_batch *iob, u16 idx)
   1081{
   1082	struct nvme_completion *cqe = &nvmeq->cqes[idx];
   1083	__u16 command_id = READ_ONCE(cqe->command_id);
   1084	struct request *req;
   1085
   1086	/*
   1087	 * AEN requests are special as they don't time out and can
   1088	 * survive any kind of queue freeze and often don't respond to
   1089	 * aborts.  We don't even bother to allocate a struct request
   1090	 * for them but rather special case them here.
   1091	 */
   1092	if (unlikely(nvme_is_aen_req(nvmeq->qid, command_id))) {
   1093		nvme_complete_async_event(&nvmeq->dev->ctrl,
   1094				cqe->status, &cqe->result);
   1095		return;
   1096	}
   1097
   1098	req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id);
   1099	if (unlikely(!req)) {
   1100		dev_warn(nvmeq->dev->ctrl.device,
   1101			"invalid id %d completed on queue %d\n",
   1102			command_id, le16_to_cpu(cqe->sq_id));
   1103		return;
   1104	}
   1105
   1106	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
   1107	if (!nvme_try_complete_req(req, cqe->status, cqe->result) &&
   1108	    !blk_mq_add_to_batch(req, iob, nvme_req(req)->status,
   1109					nvme_pci_complete_batch))
   1110		nvme_pci_complete_rq(req);
   1111}
   1112
   1113static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
   1114{
   1115	u32 tmp = nvmeq->cq_head + 1;
   1116
   1117	if (tmp == nvmeq->q_depth) {
   1118		nvmeq->cq_head = 0;
   1119		nvmeq->cq_phase ^= 1;
   1120	} else {
   1121		nvmeq->cq_head = tmp;
   1122	}
   1123}
   1124
   1125static inline int nvme_poll_cq(struct nvme_queue *nvmeq,
   1126			       struct io_comp_batch *iob)
   1127{
   1128	int found = 0;
   1129
   1130	while (nvme_cqe_pending(nvmeq)) {
   1131		found++;
   1132		/*
   1133		 * load-load control dependency between phase and the rest of
   1134		 * the cqe requires a full read memory barrier
   1135		 */
   1136		dma_rmb();
   1137		nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head);
   1138		nvme_update_cq_head(nvmeq);
   1139	}
   1140
   1141	if (found)
   1142		nvme_ring_cq_doorbell(nvmeq);
   1143	return found;
   1144}
   1145
   1146static irqreturn_t nvme_irq(int irq, void *data)
   1147{
   1148	struct nvme_queue *nvmeq = data;
   1149	DEFINE_IO_COMP_BATCH(iob);
   1150
   1151	if (nvme_poll_cq(nvmeq, &iob)) {
   1152		if (!rq_list_empty(iob.req_list))
   1153			nvme_pci_complete_batch(&iob);
   1154		return IRQ_HANDLED;
   1155	}
   1156	return IRQ_NONE;
   1157}
   1158
   1159static irqreturn_t nvme_irq_check(int irq, void *data)
   1160{
   1161	struct nvme_queue *nvmeq = data;
   1162
   1163	if (nvme_cqe_pending(nvmeq))
   1164		return IRQ_WAKE_THREAD;
   1165	return IRQ_NONE;
   1166}
   1167
   1168/*
   1169 * Poll for completions for any interrupt driven queue
   1170 * Can be called from any context.
   1171 */
   1172static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
   1173{
   1174	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
   1175
   1176	WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags));
   1177
   1178	disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
   1179	nvme_poll_cq(nvmeq, NULL);
   1180	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
   1181}
   1182
   1183static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
   1184{
   1185	struct nvme_queue *nvmeq = hctx->driver_data;
   1186	bool found;
   1187
   1188	if (!nvme_cqe_pending(nvmeq))
   1189		return 0;
   1190
   1191	spin_lock(&nvmeq->cq_poll_lock);
   1192	found = nvme_poll_cq(nvmeq, iob);
   1193	spin_unlock(&nvmeq->cq_poll_lock);
   1194
   1195	return found;
   1196}
   1197
   1198static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
   1199{
   1200	struct nvme_dev *dev = to_nvme_dev(ctrl);
   1201	struct nvme_queue *nvmeq = &dev->queues[0];
   1202	struct nvme_command c = { };
   1203
   1204	c.common.opcode = nvme_admin_async_event;
   1205	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
   1206
   1207	spin_lock(&nvmeq->sq_lock);
   1208	nvme_sq_copy_cmd(nvmeq, &c);
   1209	nvme_write_sq_db(nvmeq, true);
   1210	spin_unlock(&nvmeq->sq_lock);
   1211}
   1212
   1213static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
   1214{
   1215	struct nvme_command c = { };
   1216
   1217	c.delete_queue.opcode = opcode;
   1218	c.delete_queue.qid = cpu_to_le16(id);
   1219
   1220	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
   1221}
   1222
   1223static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
   1224		struct nvme_queue *nvmeq, s16 vector)
   1225{
   1226	struct nvme_command c = { };
   1227	int flags = NVME_QUEUE_PHYS_CONTIG;
   1228
   1229	if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
   1230		flags |= NVME_CQ_IRQ_ENABLED;
   1231
   1232	/*
   1233	 * Note: we (ab)use the fact that the prp fields survive if no data
   1234	 * is attached to the request.
   1235	 */
   1236	c.create_cq.opcode = nvme_admin_create_cq;
   1237	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
   1238	c.create_cq.cqid = cpu_to_le16(qid);
   1239	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
   1240	c.create_cq.cq_flags = cpu_to_le16(flags);
   1241	c.create_cq.irq_vector = cpu_to_le16(vector);
   1242
   1243	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
   1244}
   1245
   1246static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
   1247						struct nvme_queue *nvmeq)
   1248{
   1249	struct nvme_ctrl *ctrl = &dev->ctrl;
   1250	struct nvme_command c = { };
   1251	int flags = NVME_QUEUE_PHYS_CONTIG;
   1252
   1253	/*
   1254	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
   1255	 * set. Since the URGENT priority value is zero, this makes all
   1256	 * queues URGENT.
   1257	 */
   1258	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
   1259		flags |= NVME_SQ_PRIO_MEDIUM;
   1260
   1261	/*
   1262	 * Note: we (ab)use the fact that the prp fields survive if no data
   1263	 * is attached to the request.
   1264	 */
   1265	c.create_sq.opcode = nvme_admin_create_sq;
   1266	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
   1267	c.create_sq.sqid = cpu_to_le16(qid);
   1268	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
   1269	c.create_sq.sq_flags = cpu_to_le16(flags);
   1270	c.create_sq.cqid = cpu_to_le16(qid);
   1271
   1272	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
   1273}
   1274
   1275static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
   1276{
   1277	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
   1278}
   1279
   1280static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
   1281{
   1282	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
   1283}
   1284
   1285static void abort_endio(struct request *req, blk_status_t error)
   1286{
   1287	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
   1288	struct nvme_queue *nvmeq = iod->nvmeq;
   1289
   1290	dev_warn(nvmeq->dev->ctrl.device,
   1291		 "Abort status: 0x%x", nvme_req(req)->status);
   1292	atomic_inc(&nvmeq->dev->ctrl.abort_limit);
   1293	blk_mq_free_request(req);
   1294}
   1295
   1296static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
   1297{
   1298	/* If true, indicates loss of adapter communication, possibly by an
   1299	 * NVMe Subsystem reset.
   1300	 */
   1301	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
   1302
   1303	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
   1304	switch (dev->ctrl.state) {
   1305	case NVME_CTRL_RESETTING:
   1306	case NVME_CTRL_CONNECTING:
   1307		return false;
   1308	default:
   1309		break;
   1310	}
   1311
   1312	/* We shouldn't reset unless the controller is in a fatal error state
   1313	 * _or_ if we lost the communication with it.
   1314	 */
   1315	if (!(csts & NVME_CSTS_CFS) && !nssro)
   1316		return false;
   1317
   1318	return true;
   1319}
   1320
   1321static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
   1322{
   1323	/* Read a config register to help see what died. */
   1324	u16 pci_status;
   1325	int result;
   1326
   1327	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
   1328				      &pci_status);
   1329	if (result == PCIBIOS_SUCCESSFUL)
   1330		dev_warn(dev->ctrl.device,
   1331			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
   1332			 csts, pci_status);
   1333	else
   1334		dev_warn(dev->ctrl.device,
   1335			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
   1336			 csts, result);
   1337
   1338	if (csts != ~0)
   1339		return;
   1340
   1341	dev_warn(dev->ctrl.device,
   1342		 "Does your device have a faulty power saving mode enabled?\n");
   1343	dev_warn(dev->ctrl.device,
   1344		 "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off\" and report a bug\n");
   1345}
   1346
   1347static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
   1348{
   1349	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
   1350	struct nvme_queue *nvmeq = iod->nvmeq;
   1351	struct nvme_dev *dev = nvmeq->dev;
   1352	struct request *abort_req;
   1353	struct nvme_command cmd = { };
   1354	u32 csts = readl(dev->bar + NVME_REG_CSTS);
   1355
   1356	/* If PCI error recovery process is happening, we cannot reset or
   1357	 * the recovery mechanism will surely fail.
   1358	 */
   1359	mb();
   1360	if (pci_channel_offline(to_pci_dev(dev->dev)))
   1361		return BLK_EH_RESET_TIMER;
   1362
   1363	/*
   1364	 * Reset immediately if the controller has failed
   1365	 */
   1366	if (nvme_should_reset(dev, csts)) {
   1367		nvme_warn_reset(dev, csts);
   1368		nvme_dev_disable(dev, false);
   1369		nvme_reset_ctrl(&dev->ctrl);
   1370		return BLK_EH_DONE;
   1371	}
   1372
   1373	/*
   1374	 * Did we miss an interrupt?
   1375	 */
   1376	if (test_bit(NVMEQ_POLLED, &nvmeq->flags))
   1377		nvme_poll(req->mq_hctx, NULL);
   1378	else
   1379		nvme_poll_irqdisable(nvmeq);
   1380
   1381	if (blk_mq_request_completed(req)) {
   1382		dev_warn(dev->ctrl.device,
   1383			 "I/O %d QID %d timeout, completion polled\n",
   1384			 req->tag, nvmeq->qid);
   1385		return BLK_EH_DONE;
   1386	}
   1387
   1388	/*
   1389	 * Shut down immediately if the controller times out while starting. The
   1390	 * reset work will see the pci device disabled when it gets the forced
   1391	 * cancellation error. All outstanding requests are completed on
   1392	 * shutdown, so we return BLK_EH_DONE.
   1393	 */
   1394	switch (dev->ctrl.state) {
   1395	case NVME_CTRL_CONNECTING:
   1396		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
   1397		fallthrough;
   1398	case NVME_CTRL_DELETING:
   1399		dev_warn_ratelimited(dev->ctrl.device,
   1400			 "I/O %d QID %d timeout, disable controller\n",
   1401			 req->tag, nvmeq->qid);
   1402		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
   1403		nvme_dev_disable(dev, true);
   1404		return BLK_EH_DONE;
   1405	case NVME_CTRL_RESETTING:
   1406		return BLK_EH_RESET_TIMER;
   1407	default:
   1408		break;
   1409	}
   1410
   1411	/*
   1412	 * Shut down the controller immediately and schedule a reset if the
   1413	 * command was already aborted once before and still hasn't been
   1414	 * returned to the driver, or if this is the admin queue.
   1415	 */
   1416	if (!nvmeq->qid || iod->aborted) {
   1417		dev_warn(dev->ctrl.device,
   1418			 "I/O %d QID %d timeout, reset controller\n",
   1419			 req->tag, nvmeq->qid);
   1420		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
   1421		nvme_dev_disable(dev, false);
   1422		nvme_reset_ctrl(&dev->ctrl);
   1423
   1424		return BLK_EH_DONE;
   1425	}
   1426
   1427	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
   1428		atomic_inc(&dev->ctrl.abort_limit);
   1429		return BLK_EH_RESET_TIMER;
   1430	}
   1431	iod->aborted = 1;
   1432
   1433	cmd.abort.opcode = nvme_admin_abort_cmd;
   1434	cmd.abort.cid = nvme_cid(req);
   1435	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
   1436
   1437	dev_warn(nvmeq->dev->ctrl.device,
   1438		"I/O %d QID %d timeout, aborting\n",
   1439		 req->tag, nvmeq->qid);
   1440
   1441	abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
   1442					 BLK_MQ_REQ_NOWAIT);
   1443	if (IS_ERR(abort_req)) {
   1444		atomic_inc(&dev->ctrl.abort_limit);
   1445		return BLK_EH_RESET_TIMER;
   1446	}
   1447	nvme_init_request(abort_req, &cmd);
   1448
   1449	abort_req->end_io = abort_endio;
   1450	abort_req->end_io_data = NULL;
   1451	abort_req->rq_flags |= RQF_QUIET;
   1452	blk_execute_rq_nowait(abort_req, false);
   1453
   1454	/*
   1455	 * The aborted request will be completed when the abort command
   1456	 * completes.  Re-arm the timer: if the timeout fires a second time,
   1457	 * the controller is reset, as the device is then in a faulty state.
   1458	 */
   1459	return BLK_EH_RESET_TIMER;
   1460}
   1461
   1462static void nvme_free_queue(struct nvme_queue *nvmeq)
   1463{
   1464	dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
   1465				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
   1466	if (!nvmeq->sq_cmds)
   1467		return;
   1468
   1469	if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
   1470		pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
   1471				nvmeq->sq_cmds, SQ_SIZE(nvmeq));
   1472	} else {
   1473		dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq),
   1474				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
   1475	}
   1476}
   1477
   1478static void nvme_free_queues(struct nvme_dev *dev, int lowest)
   1479{
   1480	int i;
   1481
   1482	for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
   1483		dev->ctrl.queue_count--;
   1484		nvme_free_queue(&dev->queues[i]);
   1485	}
   1486}
   1487
   1488/**
   1489 * nvme_suspend_queue - put queue into suspended state
   1490 * @nvmeq: queue to suspend
   1491 */
   1492static int nvme_suspend_queue(struct nvme_queue *nvmeq)
   1493{
   1494	if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
   1495		return 1;
   1496
   1497	/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
   1498	mb();
   1499
   1500	nvmeq->dev->online_queues--;
   1501	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
   1502		nvme_stop_admin_queue(&nvmeq->dev->ctrl);
   1503	if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
   1504		pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
   1505	return 0;
   1506}
   1507
   1508static void nvme_suspend_io_queues(struct nvme_dev *dev)
   1509{
   1510	int i;
   1511
   1512	for (i = dev->ctrl.queue_count - 1; i > 0; i--)
   1513		nvme_suspend_queue(&dev->queues[i]);
   1514}
   1515
   1516static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
   1517{
   1518	struct nvme_queue *nvmeq = &dev->queues[0];
   1519
   1520	if (shutdown)
   1521		nvme_shutdown_ctrl(&dev->ctrl);
   1522	else
   1523		nvme_disable_ctrl(&dev->ctrl);
   1524
   1525	nvme_poll_irqdisable(nvmeq);
   1526}
   1527
   1528/*
   1529 * Called only on a device that has been disabled and after all other threads
   1530 * that can check this device's completion queues have synced, except
   1531 * nvme_poll(). This is the last chance for the driver to see a natural
   1532 * completion before nvme_cancel_request() terminates all incomplete requests.
   1533 */
   1534static void nvme_reap_pending_cqes(struct nvme_dev *dev)
   1535{
   1536	int i;
   1537
   1538	for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
   1539		spin_lock(&dev->queues[i].cq_poll_lock);
   1540		nvme_poll_cq(&dev->queues[i], NULL);
   1541		spin_unlock(&dev->queues[i].cq_poll_lock);
   1542	}
   1543}
   1544
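       /*
        * Compute a reduced queue depth such that nr_io_queues submission
        * queues of the given entry size fit in the controller memory buffer;
        * returns -ENOMEM if the reduced depth would drop below 64, in which
        * case queues are better left in host memory at the original depth.
        */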
   1545static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
   1546				int entry_size)
   1547{
   1548	int q_depth = dev->q_depth;
   1549	unsigned q_size_aligned = roundup(q_depth * entry_size,
   1550					  NVME_CTRL_PAGE_SIZE);
   1551
   1552	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
   1553		u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
   1554
   1555		mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE);
   1556		q_depth = div_u64(mem_per_q, entry_size);
   1557
   1558		/*
   1559		 * Ensure the reduced q_depth is above some threshold where it
   1560		 * would be better to map queues in system memory with the
   1561		 * original depth
   1562		 */
   1563		if (q_depth < 64)
   1564			return -ENOMEM;
   1565	}
   1566
   1567	return q_depth;
   1568}
   1569
   1570static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
   1571				int qid)
   1572{
   1573	struct pci_dev *pdev = to_pci_dev(dev->dev);
   1574
   1575	if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
   1576		nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq));
   1577		if (nvmeq->sq_cmds) {
   1578			nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
   1579							nvmeq->sq_cmds);
   1580			if (nvmeq->sq_dma_addr) {
   1581				set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
   1582				return 0;
   1583			}
   1584
   1585			pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq));
   1586		}
   1587	}
   1588
   1589	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq),
   1590				&nvmeq->sq_dma_addr, GFP_KERNEL);
   1591	if (!nvmeq->sq_cmds)
   1592		return -ENOMEM;
   1593	return 0;
   1594}
   1595
   1596static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
   1597{
   1598	struct nvme_queue *nvmeq = &dev->queues[qid];
   1599
   1600	if (dev->ctrl.queue_count > qid)
   1601		return 0;
   1602
   1603	nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
   1604	nvmeq->q_depth = depth;
   1605	nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
   1606					 &nvmeq->cq_dma_addr, GFP_KERNEL);
   1607	if (!nvmeq->cqes)
   1608		goto free_nvmeq;
   1609
   1610	if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
   1611		goto free_cqdma;
   1612
   1613	nvmeq->dev = dev;
   1614	spin_lock_init(&nvmeq->sq_lock);
   1615	spin_lock_init(&nvmeq->cq_poll_lock);
   1616	nvmeq->cq_head = 0;
   1617	nvmeq->cq_phase = 1;
   1618	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
   1619	nvmeq->qid = qid;
   1620	dev->ctrl.queue_count++;
   1621
   1622	return 0;
   1623
   1624 free_cqdma:
   1625	dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
   1626			  nvmeq->cq_dma_addr);
   1627 free_nvmeq:
   1628	return -ENOMEM;
   1629}
   1630
   1631static int queue_request_irq(struct nvme_queue *nvmeq)
   1632{
   1633	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
   1634	int nr = nvmeq->dev->ctrl.instance;
   1635
   1636	if (use_threaded_interrupts) {
   1637		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
   1638				nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
   1639	} else {
   1640		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
   1641				NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
   1642	}
   1643}
   1644
   1645static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
   1646{
   1647	struct nvme_dev *dev = nvmeq->dev;
   1648
   1649	nvmeq->sq_tail = 0;
   1650	nvmeq->last_sq_tail = 0;
   1651	nvmeq->cq_head = 0;
   1652	nvmeq->cq_phase = 1;
   1653	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
   1654	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
   1655	nvme_dbbuf_init(dev, nvmeq, qid);
   1656	dev->online_queues++;
   1657	wmb(); /* ensure the first interrupt sees the initialization */
   1658}
   1659
   1660/*
   1661 * Try getting shutdown_lock while setting up IO queues.
   1662 */
   1663static int nvme_setup_io_queues_trylock(struct nvme_dev *dev)
   1664{
   1665	/*
   1666	 * Give up if the lock is being held by nvme_dev_disable.
   1667	 */
   1668	if (!mutex_trylock(&dev->shutdown_lock))
   1669		return -ENODEV;
   1670
   1671	/*
   1672	 * Controller is in wrong state, fail early.
   1673	 */
   1674	if (dev->ctrl.state != NVME_CTRL_CONNECTING) {
   1675		mutex_unlock(&dev->shutdown_lock);
   1676		return -ENODEV;
   1677	}
   1678
   1679	return 0;
   1680}
   1681
   1682static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
   1683{
   1684	struct nvme_dev *dev = nvmeq->dev;
   1685	int result;
   1686	u16 vector = 0;
   1687
   1688	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
   1689
   1690	/*
   1691	 * A queue's vector matches the queue identifier unless the controller
   1692	 * has only one vector available.
   1693	 */
   1694	if (!polled)
   1695		vector = dev->num_vecs == 1 ? 0 : qid;
   1696	else
   1697		set_bit(NVMEQ_POLLED, &nvmeq->flags);
   1698
   1699	result = adapter_alloc_cq(dev, qid, nvmeq, vector);
   1700	if (result)
   1701		return result;
   1702
   1703	result = adapter_alloc_sq(dev, qid, nvmeq);
   1704	if (result < 0)
   1705		return result;
   1706	if (result)
   1707		goto release_cq;
   1708
   1709	nvmeq->cq_vector = vector;
   1710
   1711	result = nvme_setup_io_queues_trylock(dev);
   1712	if (result)
   1713		return result;
   1714	nvme_init_queue(nvmeq, qid);
   1715	if (!polled) {
   1716		result = queue_request_irq(nvmeq);
   1717		if (result < 0)
   1718			goto release_sq;
   1719	}
   1720
   1721	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
   1722	mutex_unlock(&dev->shutdown_lock);
   1723	return result;
   1724
   1725release_sq:
   1726	dev->online_queues--;
   1727	mutex_unlock(&dev->shutdown_lock);
   1728	adapter_delete_sq(dev, qid);
   1729release_cq:
   1730	adapter_delete_cq(dev, qid);
   1731	return result;
   1732}
   1733
   1734static const struct blk_mq_ops nvme_mq_admin_ops = {
   1735	.queue_rq	= nvme_queue_rq,
   1736	.complete	= nvme_pci_complete_rq,
   1737	.init_hctx	= nvme_admin_init_hctx,
   1738	.init_request	= nvme_pci_init_request,
   1739	.timeout	= nvme_timeout,
   1740};
   1741
   1742static const struct blk_mq_ops nvme_mq_ops = {
   1743	.queue_rq	= nvme_queue_rq,
   1744	.queue_rqs	= nvme_queue_rqs,
   1745	.complete	= nvme_pci_complete_rq,
   1746	.commit_rqs	= nvme_commit_rqs,
   1747	.init_hctx	= nvme_init_hctx,
   1748	.init_request	= nvme_pci_init_request,
   1749	.map_queues	= nvme_pci_map_queues,
   1750	.timeout	= nvme_timeout,
   1751	.poll		= nvme_poll,
   1752};
   1753
   1754static void nvme_dev_remove_admin(struct nvme_dev *dev)
   1755{
   1756	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
   1757		/*
   1758		 * If the controller was reset during removal, it's possible
   1759		 * user requests may be waiting on a stopped queue. Start the
   1760		 * queue to flush these to completion.
   1761		 */
   1762		nvme_start_admin_queue(&dev->ctrl);
   1763		blk_cleanup_queue(dev->ctrl.admin_q);
   1764		blk_mq_free_tag_set(&dev->admin_tagset);
   1765	}
   1766}
   1767
   1768static int nvme_alloc_admin_tags(struct nvme_dev *dev)
   1769{
   1770	if (!dev->ctrl.admin_q) {
   1771		dev->admin_tagset.ops = &nvme_mq_admin_ops;
   1772		dev->admin_tagset.nr_hw_queues = 1;
   1773
   1774		dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
   1775		dev->admin_tagset.timeout = NVME_ADMIN_TIMEOUT;
   1776		dev->admin_tagset.numa_node = dev->ctrl.numa_node;
   1777		dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
   1778		dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
   1779		dev->admin_tagset.driver_data = dev;
   1780
   1781		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
   1782			return -ENOMEM;
   1783		dev->ctrl.admin_tagset = &dev->admin_tagset;
   1784
   1785		dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
   1786		if (IS_ERR(dev->ctrl.admin_q)) {
   1787			blk_mq_free_tag_set(&dev->admin_tagset);
   1788			dev->ctrl.admin_q = NULL;
   1789			return -ENOMEM;
   1790		}
   1791		if (!blk_get_queue(dev->ctrl.admin_q)) {
   1792			nvme_dev_remove_admin(dev);
   1793			dev->ctrl.admin_q = NULL;
   1794			return -ENODEV;
   1795		}
   1796	} else
   1797		nvme_start_admin_queue(&dev->ctrl);
   1798
   1799	return 0;
   1800}
   1801
   1802static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
   1803{
   1804	return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
   1805}
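/*
 * Worked example for db_bar_size() (values are illustrative): with the
 * default doorbell stride (CAP.DSTRD == 0, so dev->db_stride == 1), every
 * queue pair needs two 4-byte doorbells (SQ tail + CQ head), i.e. 8 bytes.
 * Mapping the admin queue plus 3 I/O queues therefore needs
 * NVME_REG_DBS + (3 + 1) * 8 * 1 = 0x1000 + 32 = 0x1020 bytes of BAR0.
 */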
   1806
   1807static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
   1808{
   1809	struct pci_dev *pdev = to_pci_dev(dev->dev);
   1810
   1811	if (size <= dev->bar_mapped_size)
   1812		return 0;
   1813	if (size > pci_resource_len(pdev, 0))
   1814		return -ENOMEM;
   1815	if (dev->bar)
   1816		iounmap(dev->bar);
   1817	dev->bar = ioremap(pci_resource_start(pdev, 0), size);
   1818	if (!dev->bar) {
   1819		dev->bar_mapped_size = 0;
   1820		return -ENOMEM;
   1821	}
   1822	dev->bar_mapped_size = size;
   1823	dev->dbs = dev->bar + NVME_REG_DBS;
   1824
   1825	return 0;
   1826}
   1827
   1828static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
   1829{
   1830	int result;
   1831	u32 aqa;
   1832	struct nvme_queue *nvmeq;
   1833
   1834	result = nvme_remap_bar(dev, db_bar_size(dev, 0));
   1835	if (result < 0)
   1836		return result;
   1837
   1838	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
   1839				NVME_CAP_NSSRC(dev->ctrl.cap) : 0;
   1840
   1841	if (dev->subsystem &&
   1842	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
   1843		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
   1844
   1845	result = nvme_disable_ctrl(&dev->ctrl);
   1846	if (result < 0)
   1847		return result;
   1848
   1849	result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
   1850	if (result)
   1851		return result;
   1852
   1853	dev->ctrl.numa_node = dev_to_node(dev->dev);
   1854
   1855	nvmeq = &dev->queues[0];
   1856	aqa = nvmeq->q_depth - 1;
   1857	aqa |= aqa << 16;
   1858
   1859	writel(aqa, dev->bar + NVME_REG_AQA);
   1860	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
   1861	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
   1862
   1863	result = nvme_enable_ctrl(&dev->ctrl);
   1864	if (result)
   1865		return result;
   1866
   1867	nvmeq->cq_vector = 0;
   1868	nvme_init_queue(nvmeq, 0);
   1869	result = queue_request_irq(nvmeq);
   1870	if (result) {
   1871		dev->online_queues--;
   1872		return result;
   1873	}
   1874
   1875	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
   1876	return result;
   1877}
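/*
 * Illustration of the AQA write above (assuming the standard register
 * layout, ASQS in bits 11:0 and ACQS in bits 27:16, both 0's based):
 * with NVME_AQ_DEPTH == 32 the code stores aqa = 31 | (31 << 16) ==
 * 0x001f001f, i.e. a 32-entry admin submission and completion queue.
 */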
   1878
   1879static int nvme_create_io_queues(struct nvme_dev *dev)
   1880{
   1881	unsigned i, max, rw_queues;
   1882	int ret = 0;
   1883
   1884	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
   1885		if (nvme_alloc_queue(dev, i, dev->q_depth)) {
   1886			ret = -ENOMEM;
   1887			break;
   1888		}
   1889	}
   1890
   1891	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
   1892	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
   1893		rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
   1894				dev->io_queues[HCTX_TYPE_READ];
   1895	} else {
   1896		rw_queues = max;
   1897	}
   1898
   1899	for (i = dev->online_queues; i <= max; i++) {
   1900		bool polled = i > rw_queues;
   1901
   1902		ret = nvme_create_queue(&dev->queues[i], i, polled);
   1903		if (ret)
   1904			break;
   1905	}
   1906
   1907	/*
   1908	 * Ignore failing Create SQ/CQ commands; we can continue with fewer
   1909	 * than the desired number of queues, and even a controller without
   1910	 * I/O queues can still be used to issue admin commands.  This might
   1911	 * be useful to upgrade buggy firmware, for example.
   1912	 */
   1913	return ret >= 0 ? 0 : ret;
   1914}
   1915
   1916static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
   1917{
   1918	u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;
   1919
   1920	return 1ULL << (12 + 4 * szu);
   1921}
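/*
 * The size unit grows by a factor of 16 per SZU step (hence the
 * "12 + 4 * szu" shift): SZU 0 -> 4 KiB, 1 -> 64 KiB, 2 -> 1 MiB,
 * 3 -> 16 MiB, and so on. The usable CMB size is nvme_cmb_size()
 * multiples of this unit, clamped to the BAR below.
 */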
   1922
   1923static u32 nvme_cmb_size(struct nvme_dev *dev)
   1924{
   1925	return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
   1926}
   1927
   1928static void nvme_map_cmb(struct nvme_dev *dev)
   1929{
   1930	u64 size, offset;
   1931	resource_size_t bar_size;
   1932	struct pci_dev *pdev = to_pci_dev(dev->dev);
   1933	int bar;
   1934
   1935	if (dev->cmb_size)
   1936		return;
   1937
   1938	if (NVME_CAP_CMBS(dev->ctrl.cap))
   1939		writel(NVME_CMBMSC_CRE, dev->bar + NVME_REG_CMBMSC);
   1940
   1941	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
   1942	if (!dev->cmbsz)
   1943		return;
   1944	dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
   1945
   1946	size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
   1947	offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
   1948	bar = NVME_CMB_BIR(dev->cmbloc);
   1949	bar_size = pci_resource_len(pdev, bar);
   1950
   1951	if (offset > bar_size)
   1952		return;
   1953
   1954	/*
   1955	 * Tell the controller about the host side address mapping the CMB,
   1956	 * and enable CMB decoding for the NVMe 1.4+ scheme:
   1957	 */
   1958	if (NVME_CAP_CMBS(dev->ctrl.cap)) {
   1959		hi_lo_writeq(NVME_CMBMSC_CRE | NVME_CMBMSC_CMSE |
   1960			     (pci_bus_address(pdev, bar) + offset),
   1961			     dev->bar + NVME_REG_CMBMSC);
   1962	}
   1963
   1964	/*
   1965	 * Controllers may support a CMB size larger than their BAR,
   1966	 * for example, due to being behind a bridge. Reduce the CMB to
   1967	 * the reported size of the BAR.
   1968	 */
   1969	if (size > bar_size - offset)
   1970		size = bar_size - offset;
   1971
   1972	if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
   1973		dev_warn(dev->ctrl.device,
   1974			 "failed to register the CMB\n");
   1975		return;
   1976	}
   1977
   1978	dev->cmb_size = size;
   1979	dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS);
   1980
   1981	if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
   1982			(NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
   1983		pci_p2pmem_publish(pdev, true);
   1984}
   1985
   1986static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
   1987{
   1988	u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT;
   1989	u64 dma_addr = dev->host_mem_descs_dma;
   1990	struct nvme_command c = { };
   1991	int ret;
   1992
   1993	c.features.opcode	= nvme_admin_set_features;
   1994	c.features.fid		= cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
   1995	c.features.dword11	= cpu_to_le32(bits);
   1996	c.features.dword12	= cpu_to_le32(host_mem_size);
   1997	c.features.dword13	= cpu_to_le32(lower_32_bits(dma_addr));
   1998	c.features.dword14	= cpu_to_le32(upper_32_bits(dma_addr));
   1999	c.features.dword15	= cpu_to_le32(dev->nr_host_mem_descs);
   2000
   2001	ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
   2002	if (ret) {
   2003		dev_warn(dev->ctrl.device,
   2004			 "failed to set host mem (err %d, flags %#x).\n",
   2005			 ret, bits);
   2006	} else
   2007		dev->hmb = bits & NVME_HOST_MEM_ENABLE;
   2008
   2009	return ret;
   2010}
   2011
   2012static void nvme_free_host_mem(struct nvme_dev *dev)
   2013{
   2014	int i;
   2015
   2016	for (i = 0; i < dev->nr_host_mem_descs; i++) {
   2017		struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
   2018		size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE;
   2019
   2020		dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
   2021			       le64_to_cpu(desc->addr),
   2022			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
   2023	}
   2024
   2025	kfree(dev->host_mem_desc_bufs);
   2026	dev->host_mem_desc_bufs = NULL;
   2027	dma_free_coherent(dev->dev,
   2028			dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
   2029			dev->host_mem_descs, dev->host_mem_descs_dma);
   2030	dev->host_mem_descs = NULL;
   2031	dev->nr_host_mem_descs = 0;
   2032}
   2033
   2034static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
   2035		u32 chunk_size)
   2036{
   2037	struct nvme_host_mem_buf_desc *descs;
   2038	u32 max_entries, len;
   2039	dma_addr_t descs_dma;
   2040	int i = 0;
   2041	void **bufs;
   2042	u64 size, tmp;
   2043
   2044	tmp = (preferred + chunk_size - 1);
   2045	do_div(tmp, chunk_size);
   2046	max_entries = tmp;
   2047
   2048	if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
   2049		max_entries = dev->ctrl.hmmaxd;
   2050
   2051	descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs),
   2052				   &descs_dma, GFP_KERNEL);
   2053	if (!descs)
   2054		goto out;
   2055
   2056	bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
   2057	if (!bufs)
   2058		goto out_free_descs;
   2059
   2060	for (size = 0; size < preferred && i < max_entries; size += len) {
   2061		dma_addr_t dma_addr;
   2062
   2063		len = min_t(u64, chunk_size, preferred - size);
   2064		bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL,
   2065				DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
   2066		if (!bufs[i])
   2067			break;
   2068
   2069		descs[i].addr = cpu_to_le64(dma_addr);
   2070		descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE);
   2071		i++;
   2072	}
   2073
   2074	if (!size)
   2075		goto out_free_bufs;
   2076
   2077	dev->nr_host_mem_descs = i;
   2078	dev->host_mem_size = size;
   2079	dev->host_mem_descs = descs;
   2080	dev->host_mem_descs_dma = descs_dma;
   2081	dev->host_mem_desc_bufs = bufs;
   2082	return 0;
   2083
   2084out_free_bufs:
   2085	while (--i >= 0) {
   2086		size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE;
   2087
   2088		dma_free_attrs(dev->dev, size, bufs[i],
   2089			       le64_to_cpu(descs[i].addr),
   2090			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
   2091	}
   2092
   2093	kfree(bufs);
   2094out_free_descs:
   2095	dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
   2096			descs_dma);
   2097out:
   2098	dev->host_mem_descs = NULL;
   2099	return -ENOMEM;
   2100}
   2101
   2102static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
   2103{
   2104	u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
   2105	u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
   2106	u64 chunk_size;
   2107
   2108	/* start big and work our way down */
   2109	for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) {
   2110		if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
   2111			if (!min || dev->host_mem_size >= min)
   2112				return 0;
   2113			nvme_free_host_mem(dev);
   2114		}
   2115	}
   2116
   2117	return -ENOMEM;
   2118}
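/*
 * Worked example (numbers are illustrative): with preferred == 256 MiB,
 * PAGE_SIZE * MAX_ORDER_NR_PAGES == 4 MiB and hmminds == 8 KiB, the loop
 * first asks __nvme_alloc_host_mem() for up to 64 chunks of 4 MiB. If the
 * allocator cannot provide enough of them to reach 'min', the partial
 * buffer is freed and the loop retries with 2 MiB chunks, then 1 MiB,
 * halving until it succeeds or chunk_size drops below hmminds, in which
 * case -ENOMEM is returned.
 */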
   2119
   2120static int nvme_setup_host_mem(struct nvme_dev *dev)
   2121{
   2122	u64 max = (u64)max_host_mem_size_mb * SZ_1M;
   2123	u64 preferred = (u64)dev->ctrl.hmpre * 4096;
   2124	u64 min = (u64)dev->ctrl.hmmin * 4096;
   2125	u32 enable_bits = NVME_HOST_MEM_ENABLE;
   2126	int ret;
   2127
   2128	preferred = min(preferred, max);
   2129	if (min > max) {
   2130		dev_warn(dev->ctrl.device,
   2131			"min host memory (%lld MiB) above limit (%d MiB).\n",
   2132			min >> ilog2(SZ_1M), max_host_mem_size_mb);
   2133		nvme_free_host_mem(dev);
   2134		return 0;
   2135	}
   2136
   2137	/*
   2138	 * If we already have a buffer allocated, check if we can reuse it.
   2139	 */
   2140	if (dev->host_mem_descs) {
   2141		if (dev->host_mem_size >= min)
   2142			enable_bits |= NVME_HOST_MEM_RETURN;
   2143		else
   2144			nvme_free_host_mem(dev);
   2145	}
   2146
   2147	if (!dev->host_mem_descs) {
   2148		if (nvme_alloc_host_mem(dev, min, preferred)) {
   2149			dev_warn(dev->ctrl.device,
   2150				"failed to allocate host memory buffer.\n");
   2151			return 0; /* controller must work without HMB */
   2152		}
   2153
   2154		dev_info(dev->ctrl.device,
   2155			"allocated %lld MiB host memory buffer.\n",
   2156			dev->host_mem_size >> ilog2(SZ_1M));
   2157	}
   2158
   2159	ret = nvme_set_host_mem(dev, enable_bits);
   2160	if (ret)
   2161		nvme_free_host_mem(dev);
   2162	return ret;
   2163}
   2164
   2165static ssize_t cmb_show(struct device *dev, struct device_attribute *attr,
   2166		char *buf)
   2167{
   2168	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
   2169
   2170	return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz  : x%08x\n",
   2171		       ndev->cmbloc, ndev->cmbsz);
   2172}
   2173static DEVICE_ATTR_RO(cmb);
   2174
   2175static ssize_t cmbloc_show(struct device *dev, struct device_attribute *attr,
   2176		char *buf)
   2177{
   2178	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
   2179
   2180	return sysfs_emit(buf, "%u\n", ndev->cmbloc);
   2181}
   2182static DEVICE_ATTR_RO(cmbloc);
   2183
   2184static ssize_t cmbsz_show(struct device *dev, struct device_attribute *attr,
   2185		char *buf)
   2186{
   2187	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
   2188
   2189	return sysfs_emit(buf, "%u\n", ndev->cmbsz);
   2190}
   2191static DEVICE_ATTR_RO(cmbsz);
   2192
   2193static ssize_t hmb_show(struct device *dev, struct device_attribute *attr,
   2194			char *buf)
   2195{
   2196	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
   2197
   2198	return sysfs_emit(buf, "%d\n", ndev->hmb);
   2199}
   2200
   2201static ssize_t hmb_store(struct device *dev, struct device_attribute *attr,
   2202			 const char *buf, size_t count)
   2203{
   2204	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
   2205	bool new;
   2206	int ret;
   2207
   2208	if (strtobool(buf, &new) < 0)
   2209		return -EINVAL;
   2210
   2211	if (new == ndev->hmb)
   2212		return count;
   2213
   2214	if (new) {
   2215		ret = nvme_setup_host_mem(ndev);
   2216	} else {
   2217		ret = nvme_set_host_mem(ndev, 0);
   2218		if (!ret)
   2219			nvme_free_host_mem(ndev);
   2220	}
   2221
   2222	if (ret < 0)
   2223		return ret;
   2224
   2225	return count;
   2226}
   2227static DEVICE_ATTR_RW(hmb);
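/*
 * Example usage of the hmb attribute (the path assumes the usual nvme
 * class-device layout for controller nvme0):
 *
 *   cat /sys/class/nvme/nvme0/hmb       # 1 while the HMB is enabled
 *   echo 0 > /sys/class/nvme/nvme0/hmb  # disable and free the HMB
 *   echo 1 > /sys/class/nvme/nvme0/hmb  # reallocate and enable it again
 */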
   2228
   2229static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj,
   2230		struct attribute *a, int n)
   2231{
   2232	struct nvme_ctrl *ctrl =
   2233		dev_get_drvdata(container_of(kobj, struct device, kobj));
   2234	struct nvme_dev *dev = to_nvme_dev(ctrl);
   2235
   2236	if (a == &dev_attr_cmb.attr ||
   2237	    a == &dev_attr_cmbloc.attr ||
   2238	    a == &dev_attr_cmbsz.attr) {
   2239		if (!dev->cmbsz)
   2240			return 0;
   2241	}
   2242	if (a == &dev_attr_hmb.attr && !ctrl->hmpre)
   2243		return 0;
   2244
   2245	return a->mode;
   2246}
   2247
   2248static struct attribute *nvme_pci_attrs[] = {
   2249	&dev_attr_cmb.attr,
   2250	&dev_attr_cmbloc.attr,
   2251	&dev_attr_cmbsz.attr,
   2252	&dev_attr_hmb.attr,
   2253	NULL,
   2254};
   2255
   2256static const struct attribute_group nvme_pci_attr_group = {
   2257	.attrs		= nvme_pci_attrs,
   2258	.is_visible	= nvme_pci_attrs_are_visible,
   2259};
   2260
   2261/*
   2262 * nrirqs is the number of interrupts available for write and read
   2263 * queues. The core already reserved an interrupt for the admin queue.
   2264 */
   2265static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
   2266{
   2267	struct nvme_dev *dev = affd->priv;
   2268	unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues;
   2269
   2270	/*
   2271	 * If there is no interrupt available for queues, ensure that
   2272	 * the default queue is set to 1. The affinity set size is
   2273	 * also set to one, but the irq core ignores it for this case.
   2274	 *
   2275	 * If only one interrupt is available or 'write_queues' == 0, combine
   2276	 * write and read queues.
   2277	 *
   2278	 * If 'write_queues' > 0, ensure it leaves room for at least one read
   2279	 * queue.
   2280	 */
   2281	if (!nrirqs) {
   2282		nrirqs = 1;
   2283		nr_read_queues = 0;
   2284	} else if (nrirqs == 1 || !nr_write_queues) {
   2285		nr_read_queues = 0;
   2286	} else if (nr_write_queues >= nrirqs) {
   2287		nr_read_queues = 1;
   2288	} else {
   2289		nr_read_queues = nrirqs - nr_write_queues;
   2290	}
   2291
   2292	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
   2293	affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
   2294	dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
   2295	affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
   2296	affd->nr_sets = nr_read_queues ? 2 : 1;
   2297}
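/*
 * Worked example (illustrative): with nrirqs == 8 and the write_queues
 * module parameter set to 2, nr_read_queues becomes 8 - 2 = 6, so the
 * DEFAULT (write) set gets 2 vectors and the READ set gets 6. With
 * write_queues == 0 (the default), all 8 vectors go into a single
 * DEFAULT set and no separate read set is created.
 */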
   2298
   2299static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
   2300{
   2301	struct pci_dev *pdev = to_pci_dev(dev->dev);
   2302	struct irq_affinity affd = {
   2303		.pre_vectors	= 1,
   2304		.calc_sets	= nvme_calc_irq_sets,
   2305		.priv		= dev,
   2306	};
   2307	unsigned int irq_queues, poll_queues;
   2308
   2309	/*
   2310	 * Poll queues don't need interrupts, but we need at least one I/O queue
   2311	 * left over for non-polled I/O.
   2312	 */
   2313	poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1);
   2314	dev->io_queues[HCTX_TYPE_POLL] = poll_queues;
   2315
   2316	/*
   2317	 * Initialize for the single interrupt case; this will be updated in
   2318	 * nvme_calc_irq_sets().
   2319	 */
   2320	dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
   2321	dev->io_queues[HCTX_TYPE_READ] = 0;
   2322
   2323	/*
   2324	 * We need interrupts for the admin queue and each non-polled I/O queue,
   2325	 * but some Apple controllers require all queues to use the first
   2326	 * vector.
   2327	 */
   2328	irq_queues = 1;
   2329	if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR))
   2330		irq_queues += (nr_io_queues - poll_queues);
   2331	return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
   2332			      PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
   2333}
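/*
 * Example of the vector budget above (illustrative): with
 * nr_io_queues == 8 and nr_poll_queues == 2, poll_queues is clamped to
 * min(2, 8 - 1) == 2 and irq_queues becomes 1 + (8 - 2) == 7, i.e. the
 * pre-reserved admin vector plus one vector per non-polled I/O queue.
 * Controllers with NVME_QUIRK_SINGLE_VECTOR are limited to
 * irq_queues == 1 instead.
 */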
   2334
   2335static void nvme_disable_io_queues(struct nvme_dev *dev)
   2336{
   2337	if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq))
   2338		__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
   2339}
   2340
   2341static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
   2342{
   2343	/*
   2344	 * If tags are shared with admin queue (Apple bug), then
   2345	 * make sure we only use one IO queue.
   2346	 */
   2347	if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
   2348		return 1;
   2349	return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
   2350}
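/*
 * Example: on a 16-CPU system booted with write_queues=2 and
 * poll_queues=2, the driver sizes its queue array for up to
 * 16 + 2 + 2 = 20 I/O queues (plus the admin queue), unless the
 * shared-tags quirk caps it at a single I/O queue.
 */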
   2351
   2352static int nvme_setup_io_queues(struct nvme_dev *dev)
   2353{
   2354	struct nvme_queue *adminq = &dev->queues[0];
   2355	struct pci_dev *pdev = to_pci_dev(dev->dev);
   2356	unsigned int nr_io_queues;
   2357	unsigned long size;
   2358	int result;
   2359
   2360	/*
   2361	 * Sample the module parameters once at reset time so that we have
   2362	 * stable values to work with.
   2363	 */
   2364	dev->nr_write_queues = write_queues;
   2365	dev->nr_poll_queues = poll_queues;
   2366
   2367	nr_io_queues = dev->nr_allocated_queues - 1;
   2368	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
   2369	if (result < 0)
   2370		return result;
   2371
   2372	if (nr_io_queues == 0)
   2373		return 0;
   2374
   2375	/*
   2376	 * Free IRQ resources as soon as NVMEQ_ENABLED bit transitions
   2377	 * from set to unset. If there is a window before it is truly freed,
   2378	 * pci_free_irq_vectors() jumping into this window will crash.
   2379	 * Also take the lock to avoid racing with pci_free_irq_vectors() in
   2380	 * nvme_dev_disable() path.
   2381	 */
   2382	result = nvme_setup_io_queues_trylock(dev);
   2383	if (result)
   2384		return result;
   2385	if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags))
   2386		pci_free_irq(pdev, 0, adminq);
   2387
   2388	if (dev->cmb_use_sqes) {
   2389		result = nvme_cmb_qdepth(dev, nr_io_queues,
   2390				sizeof(struct nvme_command));
   2391		if (result > 0)
   2392			dev->q_depth = result;
   2393		else
   2394			dev->cmb_use_sqes = false;
   2395	}
   2396
   2397	do {
   2398		size = db_bar_size(dev, nr_io_queues);
   2399		result = nvme_remap_bar(dev, size);
   2400		if (!result)
   2401			break;
   2402		if (!--nr_io_queues) {
   2403			result = -ENOMEM;
   2404			goto out_unlock;
   2405		}
   2406	} while (1);
   2407	adminq->q_db = dev->dbs;
   2408
   2409 retry:
   2410	/* Deregister the admin queue's interrupt */
   2411	if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags))
   2412		pci_free_irq(pdev, 0, adminq);
   2413
   2414	/*
   2415	 * If we enabled MSI-X early because INTx is not available, disable it
   2416	 * again before setting up the full range we need.
   2417	 */
   2418	pci_free_irq_vectors(pdev);
   2419
   2420	result = nvme_setup_irqs(dev, nr_io_queues);
   2421	if (result <= 0) {
   2422		result = -EIO;
   2423		goto out_unlock;
   2424	}
   2425
   2426	dev->num_vecs = result;
   2427	result = max(result - 1, 1);
   2428	dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
   2429
   2430	/*
   2431	 * Should investigate if there's a performance win from allocating
   2432	 * more queues than interrupt vectors; it might allow the submission
   2433	 * path to scale better, even if the receive path is limited by the
   2434	 * number of interrupts.
   2435	 */
   2436	result = queue_request_irq(adminq);
   2437	if (result)
   2438		goto out_unlock;
   2439	set_bit(NVMEQ_ENABLED, &adminq->flags);
   2440	mutex_unlock(&dev->shutdown_lock);
   2441
   2442	result = nvme_create_io_queues(dev);
   2443	if (result || dev->online_queues < 2)
   2444		return result;
   2445
   2446	if (dev->online_queues - 1 < dev->max_qid) {
   2447		nr_io_queues = dev->online_queues - 1;
   2448		nvme_disable_io_queues(dev);
   2449		result = nvme_setup_io_queues_trylock(dev);
   2450		if (result)
   2451			return result;
   2452		nvme_suspend_io_queues(dev);
   2453		goto retry;
   2454	}
   2455	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
   2456					dev->io_queues[HCTX_TYPE_DEFAULT],
   2457					dev->io_queues[HCTX_TYPE_READ],
   2458					dev->io_queues[HCTX_TYPE_POLL]);
   2459	return 0;
   2460out_unlock:
   2461	mutex_unlock(&dev->shutdown_lock);
   2462	return result;
   2463}
   2464
   2465static void nvme_del_queue_end(struct request *req, blk_status_t error)
   2466{
   2467	struct nvme_queue *nvmeq = req->end_io_data;
   2468
   2469	blk_mq_free_request(req);
   2470	complete(&nvmeq->delete_done);
   2471}
   2472
   2473static void nvme_del_cq_end(struct request *req, blk_status_t error)
   2474{
   2475	struct nvme_queue *nvmeq = req->end_io_data;
   2476
   2477	if (error)
   2478		set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
   2479
   2480	nvme_del_queue_end(req, error);
   2481}
   2482
   2483static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
   2484{
   2485	struct request_queue *q = nvmeq->dev->ctrl.admin_q;
   2486	struct request *req;
   2487	struct nvme_command cmd = { };
   2488
   2489	cmd.delete_queue.opcode = opcode;
   2490	cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);
   2491
   2492	req = blk_mq_alloc_request(q, nvme_req_op(&cmd), BLK_MQ_REQ_NOWAIT);
   2493	if (IS_ERR(req))
   2494		return PTR_ERR(req);
   2495	nvme_init_request(req, &cmd);
   2496
   2497	if (opcode == nvme_admin_delete_cq)
   2498		req->end_io = nvme_del_cq_end;
   2499	else
   2500		req->end_io = nvme_del_queue_end;
   2501	req->end_io_data = nvmeq;
   2502
   2503	init_completion(&nvmeq->delete_done);
   2504	req->rq_flags |= RQF_QUIET;
   2505	blk_execute_rq_nowait(req, false);
   2506	return 0;
   2507}
   2508
   2509static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
   2510{
   2511	int nr_queues = dev->online_queues - 1, sent = 0;
   2512	unsigned long timeout;
   2513
   2514 retry:
   2515	timeout = NVME_ADMIN_TIMEOUT;
   2516	while (nr_queues > 0) {
   2517		if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
   2518			break;
   2519		nr_queues--;
   2520		sent++;
   2521	}
   2522	while (sent) {
   2523		struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];
   2524
   2525		timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
   2526				timeout);
   2527		if (timeout == 0)
   2528			return false;
   2529
   2530		sent--;
   2531		if (nr_queues)
   2532			goto retry;
   2533	}
   2534	return true;
   2535}
   2536
   2537static void nvme_dev_add(struct nvme_dev *dev)
   2538{
   2539	int ret;
   2540
   2541	if (!dev->ctrl.tagset) {
   2542		dev->tagset.ops = &nvme_mq_ops;
   2543		dev->tagset.nr_hw_queues = dev->online_queues - 1;
   2544		dev->tagset.nr_maps = 2; /* default + read */
   2545		if (dev->io_queues[HCTX_TYPE_POLL])
   2546			dev->tagset.nr_maps++;
   2547		dev->tagset.timeout = NVME_IO_TIMEOUT;
   2548		dev->tagset.numa_node = dev->ctrl.numa_node;
   2549		dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
   2550						BLK_MQ_MAX_DEPTH) - 1;
   2551		dev->tagset.cmd_size = sizeof(struct nvme_iod);
   2552		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
   2553		dev->tagset.driver_data = dev;
   2554
   2555		/*
   2556		 * Some Apple controllers require tags to be unique
   2557		 * across the admin and IO queues, so reserve the first 32
   2558		 * tags of the IO queue.
   2559		 */
   2560		if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
   2561			dev->tagset.reserved_tags = NVME_AQ_DEPTH;
   2562
   2563		ret = blk_mq_alloc_tag_set(&dev->tagset);
   2564		if (ret) {
   2565			dev_warn(dev->ctrl.device,
   2566				"IO queues tagset allocation failed %d\n", ret);
   2567			return;
   2568		}
   2569		dev->ctrl.tagset = &dev->tagset;
   2570	} else {
   2571		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
   2572
   2573		/* Free previously allocated queues that are no longer usable */
   2574		nvme_free_queues(dev, dev->online_queues);
   2575	}
   2576
   2577	nvme_dbbuf_set(dev);
   2578}
   2579
   2580static int nvme_pci_enable(struct nvme_dev *dev)
   2581{
   2582	int result = -ENOMEM;
   2583	struct pci_dev *pdev = to_pci_dev(dev->dev);
   2584	int dma_address_bits = 64;
   2585
   2586	if (pci_enable_device_mem(pdev))
   2587		return result;
   2588
   2589	pci_set_master(pdev);
   2590
   2591	if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48)
   2592		dma_address_bits = 48;
   2593	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(dma_address_bits)))
   2594		goto disable;
   2595
   2596	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
   2597		result = -ENODEV;
   2598		goto disable;
   2599	}
   2600
   2601	/*
   2602	 * Some devices and/or platforms don't advertise or work with INTx
   2603	 * interrupts. Pre-enable a single MSI-X or MSI vector for setup. We'll
   2604	 * adjust this later.
   2605	 */
   2606	result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
   2607	if (result < 0)
   2608		return result;
   2609
   2610	dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
   2611
   2612	dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1,
   2613				io_queue_depth);
   2614	dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
   2615	dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
   2616	dev->dbs = dev->bar + 4096;
   2617
   2618	/*
   2619	 * Some Apple controllers require a non-standard SQE size.
   2620	 * Interestingly they also seem to ignore the CC:IOSQES register
   2621	 * so we don't bother updating it here.
   2622	 */
   2623	if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
   2624		dev->io_sqes = 7;
   2625	else
   2626		dev->io_sqes = NVME_NVM_IOSQES;
   2627
   2628	/*
   2629	 * Temporary fix for the Apple controller found in the MacBook8,1 and
   2630	 * some MacBook7,1 to avoid controller resets and data loss.
   2631	 */
   2632	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
   2633		dev->q_depth = 2;
   2634		dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
   2635			"set queue depth=%u to work around controller resets\n",
   2636			dev->q_depth);
   2637	} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
   2638		   (pdev->device == 0xa821 || pdev->device == 0xa822) &&
   2639		   NVME_CAP_MQES(dev->ctrl.cap) == 0) {
   2640		dev->q_depth = 64;
   2641		dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, "
   2642			"set queue depth=%u\n", dev->q_depth);
   2643	}
   2644
   2645	/*
   2646	 * Controllers with the shared tags quirk need the IO queue to be
   2647	 * big enough so that we get 32 tags for the admin queue.
   2648	 */
   2649	if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
   2650	    (dev->q_depth < (NVME_AQ_DEPTH + 2))) {
   2651		dev->q_depth = NVME_AQ_DEPTH + 2;
   2652		dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
   2653			 dev->q_depth);
   2654	}
   2655
   2656
   2657	nvme_map_cmb(dev);
   2658
   2659	pci_enable_pcie_error_reporting(pdev);
   2660	pci_save_state(pdev);
   2661	return 0;
   2662
   2663 disable:
   2664	pci_disable_device(pdev);
   2665	return result;
   2666}
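/*
 * Illustration of the queue-depth setup in nvme_pci_enable() (example
 * values): a controller reporting CAP.MQES == 1023 supports 1024-entry
 * queues, so with the default io_queue_depth of 1024 this yields
 * dev->q_depth == 1024 and ctrl.sqsize == 1023 (0's based). A smaller
 * CAP.MQES always wins over the module parameter.
 */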
   2667
   2668static void nvme_dev_unmap(struct nvme_dev *dev)
   2669{
   2670	if (dev->bar)
   2671		iounmap(dev->bar);
   2672	pci_release_mem_regions(to_pci_dev(dev->dev));
   2673}
   2674
   2675static void nvme_pci_disable(struct nvme_dev *dev)
   2676{
   2677	struct pci_dev *pdev = to_pci_dev(dev->dev);
   2678
   2679	pci_free_irq_vectors(pdev);
   2680
   2681	if (pci_is_enabled(pdev)) {
   2682		pci_disable_pcie_error_reporting(pdev);
   2683		pci_disable_device(pdev);
   2684	}
   2685}
   2686
   2687static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
   2688{
   2689	bool dead = true, freeze = false;
   2690	struct pci_dev *pdev = to_pci_dev(dev->dev);
   2691
   2692	mutex_lock(&dev->shutdown_lock);
   2693	if (pci_device_is_present(pdev) && pci_is_enabled(pdev)) {
   2694		u32 csts = readl(dev->bar + NVME_REG_CSTS);
   2695
   2696		if (dev->ctrl.state == NVME_CTRL_LIVE ||
   2697		    dev->ctrl.state == NVME_CTRL_RESETTING) {
   2698			freeze = true;
   2699			nvme_start_freeze(&dev->ctrl);
   2700		}
   2701		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
   2702			pdev->error_state != pci_channel_io_normal);
   2703	}
   2704
   2705	/*
   2706	 * Give the controller a chance to complete all entered requests if
   2707	 * doing a safe shutdown.
   2708	 */
   2709	if (!dead && shutdown && freeze)
   2710		nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
   2711
   2712	nvme_stop_queues(&dev->ctrl);
   2713
   2714	if (!dead && dev->ctrl.queue_count > 0) {
   2715		nvme_disable_io_queues(dev);
   2716		nvme_disable_admin_queue(dev, shutdown);
   2717	}
   2718	nvme_suspend_io_queues(dev);
   2719	nvme_suspend_queue(&dev->queues[0]);
   2720	nvme_pci_disable(dev);
   2721	nvme_reap_pending_cqes(dev);
   2722
   2723	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
   2724	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
   2725	blk_mq_tagset_wait_completed_request(&dev->tagset);
   2726	blk_mq_tagset_wait_completed_request(&dev->admin_tagset);
   2727
   2728	/*
   2729	 * The driver will not be starting up queues again if shutting down, so
   2730	 * we must flush all entered requests to their failed completion to
   2731	 * avoid deadlocking the blk-mq hot-cpu notifier.
   2732	 */
   2733	if (shutdown) {
   2734		nvme_start_queues(&dev->ctrl);
   2735		if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
   2736			nvme_start_admin_queue(&dev->ctrl);
   2737	}
   2738	mutex_unlock(&dev->shutdown_lock);
   2739}
   2740
   2741static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
   2742{
   2743	if (!nvme_wait_reset(&dev->ctrl))
   2744		return -EBUSY;
   2745	nvme_dev_disable(dev, shutdown);
   2746	return 0;
   2747}
   2748
   2749static int nvme_setup_prp_pools(struct nvme_dev *dev)
   2750{
   2751	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
   2752						NVME_CTRL_PAGE_SIZE,
   2753						NVME_CTRL_PAGE_SIZE, 0);
   2754	if (!dev->prp_page_pool)
   2755		return -ENOMEM;
   2756
   2757	/* Optimisation for I/Os between 4k and 128k */
   2758	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
   2759						256, 256, 0);
   2760	if (!dev->prp_small_pool) {
   2761		dma_pool_destroy(dev->prp_page_pool);
   2762		return -ENOMEM;
   2763	}
   2764	return 0;
   2765}
   2766
   2767static void nvme_release_prp_pools(struct nvme_dev *dev)
   2768{
   2769	dma_pool_destroy(dev->prp_page_pool);
   2770	dma_pool_destroy(dev->prp_small_pool);
   2771}
   2772
   2773static void nvme_free_tagset(struct nvme_dev *dev)
   2774{
   2775	if (dev->tagset.tags)
   2776		blk_mq_free_tag_set(&dev->tagset);
   2777	dev->ctrl.tagset = NULL;
   2778}
   2779
   2780static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
   2781{
   2782	struct nvme_dev *dev = to_nvme_dev(ctrl);
   2783
   2784	nvme_dbbuf_dma_free(dev);
   2785	nvme_free_tagset(dev);
   2786	if (dev->ctrl.admin_q)
   2787		blk_put_queue(dev->ctrl.admin_q);
   2788	free_opal_dev(dev->ctrl.opal_dev);
   2789	mempool_destroy(dev->iod_mempool);
   2790	put_device(dev->dev);
   2791	kfree(dev->queues);
   2792	kfree(dev);
   2793}
   2794
   2795static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
   2796{
   2797	/*
   2798	 * Set state to deleting now to avoid blocking nvme_wait_reset(), which
   2799	 * may be holding this pci_dev's device lock.
   2800	 */
   2801	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
   2802	nvme_get_ctrl(&dev->ctrl);
   2803	nvme_dev_disable(dev, false);
   2804	nvme_kill_queues(&dev->ctrl);
   2805	if (!queue_work(nvme_wq, &dev->remove_work))
   2806		nvme_put_ctrl(&dev->ctrl);
   2807}
   2808
   2809static void nvme_reset_work(struct work_struct *work)
   2810{
   2811	struct nvme_dev *dev =
   2812		container_of(work, struct nvme_dev, ctrl.reset_work);
   2813	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
   2814	int result;
   2815
   2816	if (dev->ctrl.state != NVME_CTRL_RESETTING) {
   2817		dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
   2818			 dev->ctrl.state);
   2819		result = -ENODEV;
   2820		goto out;
   2821	}
   2822
   2823	/*
   2824	 * If we're called to reset a live controller, first shut it down
   2825	 * before moving on.
   2826	 */
   2827	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
   2828		nvme_dev_disable(dev, false);
   2829	nvme_sync_queues(&dev->ctrl);
   2830
   2831	mutex_lock(&dev->shutdown_lock);
   2832	result = nvme_pci_enable(dev);
   2833	if (result)
   2834		goto out_unlock;
   2835
   2836	result = nvme_pci_configure_admin_queue(dev);
   2837	if (result)
   2838		goto out_unlock;
   2839
   2840	result = nvme_alloc_admin_tags(dev);
   2841	if (result)
   2842		goto out_unlock;
   2843
   2844	/*
   2845	 * Limit the max command size to prevent iod->sg allocations going
   2846	 * over a single page.
   2847	 */
   2848	dev->ctrl.max_hw_sectors = min_t(u32,
   2849		NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9);
   2850	dev->ctrl.max_segments = NVME_MAX_SEGS;
   2851
   2852	/*
   2853	 * Don't limit the IOMMU merged segment size.
   2854	 */
   2855	dma_set_max_seg_size(dev->dev, 0xffffffff);
   2856	dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1);
   2857
   2858	mutex_unlock(&dev->shutdown_lock);
   2859
   2860	/*
   2861	 * Mark the controller CONNECTING (a state introduced by the nvme-fc/rdma
   2862	 * transports) for the initialization procedure here.
   2863	 */
   2864	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
   2865		dev_warn(dev->ctrl.device,
   2866			"failed to mark controller CONNECTING\n");
   2867		result = -EBUSY;
   2868		goto out;
   2869	}
   2870
   2871	/*
   2872	 * We do not support an SGL for metadata (yet), so we are limited to a
   2873	 * single integrity segment for the separate metadata pointer.
   2874	 */
   2875	dev->ctrl.max_integrity_segments = 1;
   2876
   2877	result = nvme_init_ctrl_finish(&dev->ctrl);
   2878	if (result)
   2879		goto out;
   2880
   2881	if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
   2882		if (!dev->ctrl.opal_dev)
   2883			dev->ctrl.opal_dev =
   2884				init_opal_dev(&dev->ctrl, &nvme_sec_submit);
   2885		else if (was_suspend)
   2886			opal_unlock_from_suspend(dev->ctrl.opal_dev);
   2887	} else {
   2888		free_opal_dev(dev->ctrl.opal_dev);
   2889		dev->ctrl.opal_dev = NULL;
   2890	}
   2891
   2892	if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
   2893		result = nvme_dbbuf_dma_alloc(dev);
   2894		if (result)
   2895			dev_warn(dev->dev,
   2896				 "unable to allocate dma for dbbuf\n");
   2897	}
   2898
   2899	if (dev->ctrl.hmpre) {
   2900		result = nvme_setup_host_mem(dev);
   2901		if (result < 0)
   2902			goto out;
   2903	}
   2904
   2905	result = nvme_setup_io_queues(dev);
   2906	if (result)
   2907		goto out;
   2908
   2909	/*
   2910	 * Keep the controller around but remove all namespaces if we don't have
   2911	 * any working I/O queue.
   2912	 */
   2913	if (dev->online_queues < 2) {
   2914		dev_warn(dev->ctrl.device, "IO queues not created\n");
   2915		nvme_kill_queues(&dev->ctrl);
   2916		nvme_remove_namespaces(&dev->ctrl);
   2917		nvme_free_tagset(dev);
   2918	} else {
   2919		nvme_start_queues(&dev->ctrl);
   2920		nvme_wait_freeze(&dev->ctrl);
   2921		nvme_dev_add(dev);
   2922		nvme_unfreeze(&dev->ctrl);
   2923	}
   2924
   2925	/*
   2926	 * If only the admin queue is live, keep it for further investigation
   2927	 * or recovery.
   2928	 */
   2929	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
   2930		dev_warn(dev->ctrl.device,
   2931			"failed to mark controller live state\n");
   2932		result = -ENODEV;
   2933		goto out;
   2934	}
   2935
   2936	if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj,
   2937			&nvme_pci_attr_group))
   2938		dev->attrs_added = true;
   2939
   2940	nvme_start_ctrl(&dev->ctrl);
   2941	return;
   2942
   2943 out_unlock:
   2944	mutex_unlock(&dev->shutdown_lock);
   2945 out:
   2946	if (result)
   2947		dev_warn(dev->ctrl.device,
   2948			 "Removing after probe failure status: %d\n", result);
   2949	nvme_remove_dead_ctrl(dev);
   2950}
   2951
   2952static void nvme_remove_dead_ctrl_work(struct work_struct *work)
   2953{
   2954	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
   2955	struct pci_dev *pdev = to_pci_dev(dev->dev);
   2956
   2957	if (pci_get_drvdata(pdev))
   2958		device_release_driver(&pdev->dev);
   2959	nvme_put_ctrl(&dev->ctrl);
   2960}
   2961
   2962static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
   2963{
   2964	*val = readl(to_nvme_dev(ctrl)->bar + off);
   2965	return 0;
   2966}
   2967
   2968static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
   2969{
   2970	writel(val, to_nvme_dev(ctrl)->bar + off);
   2971	return 0;
   2972}
   2973
   2974static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
   2975{
   2976	*val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off);
   2977	return 0;
   2978}
   2979
   2980static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
   2981{
   2982	struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);
   2983
   2984	return snprintf(buf, size, "%s\n", dev_name(&pdev->dev));
   2985}
   2986
   2987
   2988static void nvme_pci_print_device_info(struct nvme_ctrl *ctrl)
   2989{
   2990	struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);
   2991	struct nvme_subsystem *subsys = ctrl->subsys;
   2992
   2993	dev_err(ctrl->device,
   2994		"VID:DID %04x:%04x model:%.*s firmware:%.*s\n",
   2995		pdev->vendor, pdev->device,
   2996		nvme_strlen(subsys->model, sizeof(subsys->model)),
   2997		subsys->model, nvme_strlen(subsys->firmware_rev,
   2998					   sizeof(subsys->firmware_rev)),
   2999		subsys->firmware_rev);
   3000}
   3001
   3002static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
   3003	.name			= "pcie",
   3004	.module			= THIS_MODULE,
   3005	.flags			= NVME_F_METADATA_SUPPORTED |
   3006				  NVME_F_PCI_P2PDMA,
   3007	.reg_read32		= nvme_pci_reg_read32,
   3008	.reg_write32		= nvme_pci_reg_write32,
   3009	.reg_read64		= nvme_pci_reg_read64,
   3010	.free_ctrl		= nvme_pci_free_ctrl,
   3011	.submit_async_event	= nvme_pci_submit_async_event,
   3012	.get_address		= nvme_pci_get_address,
   3013	.print_device_info	= nvme_pci_print_device_info,
   3014};
   3015
   3016static int nvme_dev_map(struct nvme_dev *dev)
   3017{
   3018	struct pci_dev *pdev = to_pci_dev(dev->dev);
   3019
   3020	if (pci_request_mem_regions(pdev, "nvme"))
   3021		return -ENODEV;
   3022
   3023	if (nvme_remap_bar(dev, NVME_REG_DBS + 4096))
   3024		goto release;
   3025
   3026	return 0;
   3027  release:
   3028	pci_release_mem_regions(pdev);
   3029	return -ENODEV;
   3030}
   3031
   3032static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
   3033{
   3034	if (pdev->vendor == 0x144d && pdev->device == 0xa802) {
   3035		/*
   3036		 * Several Samsung devices seem to drop off the PCIe bus
   3037		 * randomly when APST is on and uses the deepest sleep state.
   3038		 * This has been observed on a Samsung "SM951 NVMe SAMSUNG
   3039		 * 256GB", a "PM951 NVMe SAMSUNG 512GB", and a "Samsung SSD
   3040		 * 950 PRO 256GB", but it seems to be restricted to two Dell
   3041		 * laptops.
   3042		 */
   3043		if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") &&
   3044		    (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") ||
   3045		     dmi_match(DMI_PRODUCT_NAME, "Precision 5510")))
   3046			return NVME_QUIRK_NO_DEEPEST_PS;
   3047	} else if (pdev->vendor == 0x144d && pdev->device == 0xa804) {
   3048		/*
   3049		 * Samsung SSD 960 EVO drops off the PCIe bus after system
   3050		 * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as
   3051		 * within a few minutes after bootup on a Coffee Lake board -
   3052		 * ASUS PRIME Z370-A.
   3053		 */
   3054		if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
   3055		    (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") ||
   3056		     dmi_match(DMI_BOARD_NAME, "PRIME Z370-A")))
   3057			return NVME_QUIRK_NO_APST;
   3058	} else if ((pdev->vendor == 0x144d && (pdev->device == 0xa801 ||
   3059		    pdev->device == 0xa808 || pdev->device == 0xa809)) ||
   3060		   (pdev->vendor == 0x1e0f && pdev->device == 0x0001)) {
   3061		/*
   3062		 * Force host managed nvme power settings for lowest idle
   3063		 * power with quick resume latency on Samsung and Toshiba
   3064		 * SSDs, based on the suspend behavior observed on a Coffee
   3065		 * Lake board in the LENOVO C640.
   3066		 */
   3067		if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) &&
   3068		     dmi_match(DMI_BOARD_NAME, "LNVNB161216"))
   3069			return NVME_QUIRK_SIMPLE_SUSPEND;
   3070	}
   3071
   3072	return 0;
   3073}
   3074
   3075static void nvme_async_probe(void *data, async_cookie_t cookie)
   3076{
   3077	struct nvme_dev *dev = data;
   3078
   3079	flush_work(&dev->ctrl.reset_work);
   3080	flush_work(&dev->ctrl.scan_work);
   3081	nvme_put_ctrl(&dev->ctrl);
   3082}
   3083
   3084static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
   3085{
   3086	int node, result = -ENOMEM;
   3087	struct nvme_dev *dev;
   3088	unsigned long quirks = id->driver_data;
   3089	size_t alloc_size;
   3090
   3091	node = dev_to_node(&pdev->dev);
   3092	if (node == NUMA_NO_NODE)
   3093		set_dev_node(&pdev->dev, first_memory_node);
   3094
   3095	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
   3096	if (!dev)
   3097		return -ENOMEM;
   3098
   3099	dev->nr_write_queues = write_queues;
   3100	dev->nr_poll_queues = poll_queues;
   3101	dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1;
   3102	dev->queues = kcalloc_node(dev->nr_allocated_queues,
   3103			sizeof(struct nvme_queue), GFP_KERNEL, node);
   3104	if (!dev->queues)
   3105		goto free;
   3106
   3107	dev->dev = get_device(&pdev->dev);
   3108	pci_set_drvdata(pdev, dev);
   3109
   3110	result = nvme_dev_map(dev);
   3111	if (result)
   3112		goto put_pci;
   3113
   3114	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
   3115	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
   3116	mutex_init(&dev->shutdown_lock);
   3117
   3118	result = nvme_setup_prp_pools(dev);
   3119	if (result)
   3120		goto unmap;
   3121
   3122	quirks |= check_vendor_combination_bug(pdev);
   3123
   3124	if (!noacpi && acpi_storage_d3(&pdev->dev)) {
   3125		/*
   3126		 * Some systems use a BIOS workaround to ask for D3 on
   3127		 * platforms that support kernel-managed suspend.
   3128		 */
   3129		dev_info(&pdev->dev,
   3130			 "platform quirk: setting simple suspend\n");
   3131		quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
   3132	}
   3133
   3134	/*
   3135	 * Double check that our mempool alloc size will cover the biggest
   3136	 * command we support.
   3137	 */
   3138	alloc_size = nvme_pci_iod_alloc_size();
   3139	WARN_ON_ONCE(alloc_size > PAGE_SIZE);
   3140
   3141	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
   3142						mempool_kfree,
   3143						(void *) alloc_size,
   3144						GFP_KERNEL, node);
   3145	if (!dev->iod_mempool) {
   3146		result = -ENOMEM;
   3147		goto release_pools;
   3148	}
   3149
   3150	result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
   3151			quirks);
   3152	if (result)
   3153		goto release_mempool;
   3154
   3155	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
   3156
   3157	nvme_reset_ctrl(&dev->ctrl);
   3158	async_schedule(nvme_async_probe, dev);
   3159
   3160	return 0;
   3161
   3162 release_mempool:
   3163	mempool_destroy(dev->iod_mempool);
   3164 release_pools:
   3165	nvme_release_prp_pools(dev);
   3166 unmap:
   3167	nvme_dev_unmap(dev);
   3168 put_pci:
   3169	put_device(dev->dev);
   3170 free:
   3171	kfree(dev->queues);
   3172	kfree(dev);
   3173	return result;
   3174}
   3175
   3176static void nvme_reset_prepare(struct pci_dev *pdev)
   3177{
   3178	struct nvme_dev *dev = pci_get_drvdata(pdev);
   3179
   3180	/*
   3181	 * We don't need to check the return value from waiting for the reset
   3182	 * state as pci_dev device lock is held, making it impossible to race
   3183	 * with ->remove().
   3184	 */
   3185	nvme_disable_prepare_reset(dev, false);
   3186	nvme_sync_queues(&dev->ctrl);
   3187}
   3188
   3189static void nvme_reset_done(struct pci_dev *pdev)
   3190{
   3191	struct nvme_dev *dev = pci_get_drvdata(pdev);
   3192
   3193	if (!nvme_try_sched_reset(&dev->ctrl))
   3194		flush_work(&dev->ctrl.reset_work);
   3195}
   3196
   3197static void nvme_shutdown(struct pci_dev *pdev)
   3198{
   3199	struct nvme_dev *dev = pci_get_drvdata(pdev);
   3200
   3201	nvme_disable_prepare_reset(dev, true);
   3202}
   3203
   3204static void nvme_remove_attrs(struct nvme_dev *dev)
   3205{
   3206	if (dev->attrs_added)
   3207		sysfs_remove_group(&dev->ctrl.device->kobj,
   3208				   &nvme_pci_attr_group);
   3209}
   3210
   3211/*
   3212 * The driver's remove may be called on a device in a partially initialized
   3213 * state. This function must not have any dependencies on the device state in
   3214 * order to proceed.
   3215 */
   3216static void nvme_remove(struct pci_dev *pdev)
   3217{
   3218	struct nvme_dev *dev = pci_get_drvdata(pdev);
   3219
   3220	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
   3221	pci_set_drvdata(pdev, NULL);
   3222
   3223	if (!pci_device_is_present(pdev)) {
   3224		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
   3225		nvme_dev_disable(dev, true);
   3226	}
   3227
   3228	flush_work(&dev->ctrl.reset_work);
   3229	nvme_stop_ctrl(&dev->ctrl);
   3230	nvme_remove_namespaces(&dev->ctrl);
   3231	nvme_dev_disable(dev, true);
   3232	nvme_remove_attrs(dev);
   3233	nvme_free_host_mem(dev);
   3234	nvme_dev_remove_admin(dev);
   3235	nvme_free_queues(dev, 0);
   3236	nvme_release_prp_pools(dev);
   3237	nvme_dev_unmap(dev);
   3238	nvme_uninit_ctrl(&dev->ctrl);
   3239}
   3240
   3241#ifdef CONFIG_PM_SLEEP
   3242static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps)
   3243{
   3244	return nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps);
   3245}
   3246
   3247static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps)
   3248{
   3249	return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL);
   3250}
   3251
   3252static int nvme_resume(struct device *dev)
   3253{
   3254	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
   3255	struct nvme_ctrl *ctrl = &ndev->ctrl;
   3256
   3257	if (ndev->last_ps == U32_MAX ||
   3258	    nvme_set_power_state(ctrl, ndev->last_ps) != 0)
   3259		goto reset;
   3260	if (ctrl->hmpre && nvme_setup_host_mem(ndev))
   3261		goto reset;
   3262
   3263	return 0;
   3264reset:
   3265	return nvme_try_sched_reset(ctrl);
   3266}
   3267
   3268static int nvme_suspend(struct device *dev)
   3269{
   3270	struct pci_dev *pdev = to_pci_dev(dev);
   3271	struct nvme_dev *ndev = pci_get_drvdata(pdev);
   3272	struct nvme_ctrl *ctrl = &ndev->ctrl;
   3273	int ret = -EBUSY;
   3274
   3275	ndev->last_ps = U32_MAX;
   3276
   3277	/*
   3278	 * The platform does not remove power for a kernel-managed suspend, so
   3279	 * use host-managed nvme power settings for lowest idle power if
   3280	 * possible. This should have quicker resume latency than a full device
   3281	 * shutdown.  But if the firmware is involved after the suspend or the
   3282	 * device does not support any non-default power states, shut down the
   3283	 * device fully.
   3284	 *
   3285	 * If ASPM is not enabled for the device, shut down the device and allow
   3286	 * the PCI bus layer to put it into D3 in order to take the PCIe link
   3287	 * down, so as to allow the platform to achieve its minimum low-power
   3288	 * state (which may not be possible if the link is up).
   3289	 */
   3290	if (pm_suspend_via_firmware() || !ctrl->npss ||
   3291	    !pcie_aspm_enabled(pdev) ||
   3292	    (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
   3293		return nvme_disable_prepare_reset(ndev, true);
   3294
   3295	nvme_start_freeze(ctrl);
   3296	nvme_wait_freeze(ctrl);
   3297	nvme_sync_queues(ctrl);
   3298
   3299	if (ctrl->state != NVME_CTRL_LIVE)
   3300		goto unfreeze;
   3301
   3302	/*
   3303	 * Host memory access may not be successful in a system suspend state,
   3304	 * but the specification allows the controller to access memory in a
   3305	 * non-operational power state.
   3306	 */
   3307	if (ndev->hmb) {
   3308		ret = nvme_set_host_mem(ndev, 0);
   3309		if (ret < 0)
   3310			goto unfreeze;
   3311	}
   3312
   3313	ret = nvme_get_power_state(ctrl, &ndev->last_ps);
   3314	if (ret < 0)
   3315		goto unfreeze;
   3316
   3317	/*
   3318	 * A saved state prevents PCI PM from generically controlling the
   3319	 * device's power. If we're using protocol-specific settings, we don't
   3320	 * want PCI interfering.
   3321	 */
   3322	pci_save_state(pdev);
   3323
   3324	ret = nvme_set_power_state(ctrl, ctrl->npss);
   3325	if (ret < 0)
   3326		goto unfreeze;
   3327
   3328	if (ret) {
   3329		/* discard the saved state */
   3330		pci_load_saved_state(pdev, NULL);
   3331
   3332		/*
   3333		 * Clearing npss forces a controller reset on resume. The
   3334		 * correct value will be rediscovered then.
   3335		 */
   3336		ret = nvme_disable_prepare_reset(ndev, true);
   3337		ctrl->npss = 0;
   3338	}
   3339unfreeze:
   3340	nvme_unfreeze(ctrl);
   3341	return ret;
   3342}
   3343
   3344static int nvme_simple_suspend(struct device *dev)
   3345{
   3346	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
   3347
   3348	return nvme_disable_prepare_reset(ndev, true);
   3349}
   3350
   3351static int nvme_simple_resume(struct device *dev)
   3352{
   3353	struct pci_dev *pdev = to_pci_dev(dev);
   3354	struct nvme_dev *ndev = pci_get_drvdata(pdev);
   3355
   3356	return nvme_try_sched_reset(&ndev->ctrl);
   3357}
   3358
   3359static const struct dev_pm_ops nvme_dev_pm_ops = {
   3360	.suspend	= nvme_suspend,
   3361	.resume		= nvme_resume,
   3362	.freeze		= nvme_simple_suspend,
   3363	.thaw		= nvme_simple_resume,
   3364	.poweroff	= nvme_simple_suspend,
   3365	.restore	= nvme_simple_resume,
   3366};
   3367#endif /* CONFIG_PM_SLEEP */
   3368
   3369static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
   3370						pci_channel_state_t state)
   3371{
   3372	struct nvme_dev *dev = pci_get_drvdata(pdev);
   3373
   3374	/*
   3375	 * A frozen channel requires a reset. When detected, this method will
   3376	 * shut down the controller to quiesce. The controller will be restarted
   3377	 * after the slot reset through the driver's slot_reset callback.
   3378	 */
   3379	switch (state) {
   3380	case pci_channel_io_normal:
   3381		return PCI_ERS_RESULT_CAN_RECOVER;
   3382	case pci_channel_io_frozen:
   3383		dev_warn(dev->ctrl.device,
   3384			"frozen state error detected, reset controller\n");
   3385		nvme_dev_disable(dev, false);
   3386		return PCI_ERS_RESULT_NEED_RESET;
   3387	case pci_channel_io_perm_failure:
   3388		dev_warn(dev->ctrl.device,
   3389			"failure state error detected, request disconnect\n");
   3390		return PCI_ERS_RESULT_DISCONNECT;
   3391	}
   3392	return PCI_ERS_RESULT_NEED_RESET;
   3393}
   3394
   3395static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
   3396{
   3397	struct nvme_dev *dev = pci_get_drvdata(pdev);
   3398
   3399	dev_info(dev->ctrl.device, "restart after slot reset\n");
   3400	pci_restore_state(pdev);
   3401	nvme_reset_ctrl(&dev->ctrl);
   3402	return PCI_ERS_RESULT_RECOVERED;
   3403}
   3404
   3405static void nvme_error_resume(struct pci_dev *pdev)
   3406{
   3407	struct nvme_dev *dev = pci_get_drvdata(pdev);
   3408
   3409	flush_work(&dev->ctrl.reset_work);
   3410}
   3411
   3412static const struct pci_error_handlers nvme_err_handler = {
   3413	.error_detected	= nvme_error_detected,
   3414	.slot_reset	= nvme_slot_reset,
   3415	.resume		= nvme_error_resume,
   3416	.reset_prepare	= nvme_reset_prepare,
   3417	.reset_done	= nvme_reset_done,
   3418};
   3419
   3420static const struct pci_device_id nvme_id_table[] = {
   3421	{ PCI_VDEVICE(INTEL, 0x0953),	/* Intel 750/P3500/P3600/P3700 */
   3422		.driver_data = NVME_QUIRK_STRIPE_SIZE |
   3423				NVME_QUIRK_DEALLOCATE_ZEROES, },
   3424	{ PCI_VDEVICE(INTEL, 0x0a53),	/* Intel P3520 */
   3425		.driver_data = NVME_QUIRK_STRIPE_SIZE |
   3426				NVME_QUIRK_DEALLOCATE_ZEROES, },
   3427	{ PCI_VDEVICE(INTEL, 0x0a54),	/* Intel P4500/P4600 */
   3428		.driver_data = NVME_QUIRK_STRIPE_SIZE |
   3429				NVME_QUIRK_DEALLOCATE_ZEROES |
   3430				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
   3431	{ PCI_VDEVICE(INTEL, 0x0a55),	/* Dell Express Flash P4600 */
   3432		.driver_data = NVME_QUIRK_STRIPE_SIZE |
   3433				NVME_QUIRK_DEALLOCATE_ZEROES, },
   3434	{ PCI_VDEVICE(INTEL, 0xf1a5),	/* Intel 600P/P3100 */
   3435		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
   3436				NVME_QUIRK_MEDIUM_PRIO_SQ |
   3437				NVME_QUIRK_NO_TEMP_THRESH_CHANGE |
   3438				NVME_QUIRK_DISABLE_WRITE_ZEROES, },
   3439	{ PCI_VDEVICE(INTEL, 0xf1a6),	/* Intel 760p/Pro 7600p */
   3440		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
   3441	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
   3442		.driver_data = NVME_QUIRK_IDENTIFY_CNS |
   3443				NVME_QUIRK_DISABLE_WRITE_ZEROES |
   3444				NVME_QUIRK_BOGUS_NID, },
   3445	{ PCI_VDEVICE(REDHAT, 0x0010),	/* Qemu emulated controller */
   3446		.driver_data = NVME_QUIRK_BOGUS_NID, },
   3447	{ PCI_DEVICE(0x126f, 0x2263),	/* Silicon Motion unidentified */
   3448		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
   3449				NVME_QUIRK_BOGUS_NID, },
   3450	{ PCI_DEVICE(0x1bb1, 0x0100),   /* Seagate Nytro Flash Storage */
   3451		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
   3452				NVME_QUIRK_NO_NS_DESC_LIST, },
   3453	{ PCI_DEVICE(0x1c58, 0x0003),	/* HGST adapter */
   3454		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
   3455	{ PCI_DEVICE(0x1c58, 0x0023),	/* WDC SN200 adapter */
   3456		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
   3457	{ PCI_DEVICE(0x1c5f, 0x0540),	/* Memblaze Pblaze4 adapter */
   3458		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
   3459	{ PCI_DEVICE(0x144d, 0xa821),   /* Samsung PM1725 */
   3460		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
   3461	{ PCI_DEVICE(0x144d, 0xa822),   /* Samsung PM1725a */
   3462		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
   3463				NVME_QUIRK_DISABLE_WRITE_ZEROES |
   3464				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
   3465	{ PCI_DEVICE(0x1987, 0x5012),	/* Phison E12 */
   3466		.driver_data = NVME_QUIRK_BOGUS_NID, },
   3467	{ PCI_DEVICE(0x1987, 0x5016),	/* Phison E16 */
   3468		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN |
   3469				NVME_QUIRK_BOGUS_NID, },
   3470	{ PCI_DEVICE(0x1b4b, 0x1092),	/* Lexar 256 GB SSD */
   3471		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
   3472				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
   3473	{ PCI_DEVICE(0x1cc1, 0x33f8),   /* ADATA IM2P33F8ABR1 1 TB */
   3474		.driver_data = NVME_QUIRK_BOGUS_NID, },
   3475	{ PCI_DEVICE(0x10ec, 0x5762),   /* ADATA SX6000LNP */
   3476		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN |
   3477				NVME_QUIRK_BOGUS_NID, },
   3478	{ PCI_DEVICE(0x1cc1, 0x8201),   /* ADATA SX8200PNP 512GB */
   3479		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
   3480				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
   3481	{ PCI_DEVICE(0x1344, 0x5407),   /* Micron Technology Inc NVMe SSD */
   3482		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN },
   3483	{ PCI_DEVICE(0x1c5c, 0x1504),   /* SK Hynix PC400 */
   3484		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
   3485	{ PCI_DEVICE(0x1c5c, 0x174a),   /* SK Hynix P31 SSD */
   3486		.driver_data = NVME_QUIRK_BOGUS_NID, },
   3487	{ PCI_DEVICE(0x15b7, 0x2001),   /* Sandisk Skyhawk */
   3488		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
   3489	{ PCI_DEVICE(0x1d97, 0x2263),   /* SPCC */
   3490		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
   3491	{ PCI_DEVICE(0x144d, 0xa80b),   /* Samsung PM9B1 256G and 512G */
   3492		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
   3493	{ PCI_DEVICE(0x144d, 0xa809),   /* Samsung MZALQ256HBJD 256G */
   3494		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
   3495	{ PCI_DEVICE(0x1cc4, 0x6303),   /* UMIS RPJTJ512MGE1QDY 512G */
   3496		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
   3497	{ PCI_DEVICE(0x1cc4, 0x6302),   /* UMIS RPJTJ256MGE1QDY 256G */
   3498		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
   3499	{ PCI_DEVICE(0x2646, 0x2262),   /* KINGSTON SKC2000 NVMe SSD */
   3500		.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
   3501	{ PCI_DEVICE(0x2646, 0x2263),   /* KINGSTON A2000 NVMe SSD */
   3502		.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
   3503	{ PCI_DEVICE(0x1e4B, 0x1001),   /* MAXIO MAP1001 */
   3504		.driver_data = NVME_QUIRK_BOGUS_NID, },
   3505	{ PCI_DEVICE(0x1e4B, 0x1002),   /* MAXIO MAP1002 */
   3506		.driver_data = NVME_QUIRK_BOGUS_NID, },
   3507	{ PCI_DEVICE(0x1e4B, 0x1202),   /* MAXIO MAP1202 */
   3508		.driver_data = NVME_QUIRK_BOGUS_NID, },
   3509	{ PCI_DEVICE(0x1cc1, 0x5350),   /* ADATA XPG GAMMIX S50 */
   3510		.driver_data = NVME_QUIRK_BOGUS_NID, },
   3511	{ PCI_DEVICE(0x1e49, 0x0041),   /* ZHITAI TiPro7000 NVMe SSD */
   3512		.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
   3513	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061),
   3514		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
   3515	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065),
   3516		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
   3517	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x8061),
   3518		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
   3519	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd00),
   3520		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
   3521	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd01),
   3522		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
   3523	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02),
   3524		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
   3525	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
   3526		.driver_data = NVME_QUIRK_SINGLE_VECTOR },
   3527	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
   3528	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
   3529		.driver_data = NVME_QUIRK_SINGLE_VECTOR |
   3530				NVME_QUIRK_128_BYTES_SQES |
   3531				NVME_QUIRK_SHARED_TAGS |
   3532				NVME_QUIRK_SKIP_CID_GEN },
   3533	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
   3534	{ 0, }
   3535};
   3536MODULE_DEVICE_TABLE(pci, nvme_id_table);
   3537
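       /*
        * PM callbacks are only wired up when CONFIG_PM_SLEEP is enabled;
        * SR-IOV configuration is handled by the generic
        * pci_sriov_configure_simple() helper.
        */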
   3538static struct pci_driver nvme_driver = {
   3539	.name		= "nvme",
   3540	.id_table	= nvme_id_table,
   3541	.probe		= nvme_probe,
   3542	.remove		= nvme_remove,
   3543	.shutdown	= nvme_shutdown,
   3544#ifdef CONFIG_PM_SLEEP
   3545	.driver		= {
   3546		.pm	= &nvme_dev_pm_ops,
   3547	},
   3548#endif
   3549	.sriov_configure = pci_sriov_configure_simple,
   3550	.err_handler	= &nvme_err_handler,
   3551};
   3552
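       /*
        * Compile-time sanity checks: the queue create/delete commands must
        * match the fixed 64-byte NVMe submission queue entry layout, and the
        * IRQ core must support at least the two interrupt affinity sets
        * (default and read queues) used by this driver.
        */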
   3553static int __init nvme_init(void)
   3554{
   3555	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
   3556	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
   3557	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
   3558	BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
   3559
   3560	return pci_register_driver(&nvme_driver);
   3561}
   3562
   3563static void __exit nvme_exit(void)
   3564{
   3565	pci_unregister_driver(&nvme_driver);
   3566	flush_workqueue(nvme_wq);
   3567}
   3568
   3569MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
   3570MODULE_LICENSE("GPL");
   3571MODULE_VERSION("1.0");
   3572module_init(nvme_init);
   3573module_exit(nvme_exit);