cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

qp.c (82758B)


      1// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
      2/*
      3 * Copyright(c) 2016 - 2020 Intel Corporation.
      4 */
      5
      6#include <linux/hash.h>
      7#include <linux/bitops.h>
      8#include <linux/lockdep.h>
      9#include <linux/vmalloc.h>
     10#include <linux/slab.h>
     11#include <rdma/ib_verbs.h>
     12#include <rdma/ib_hdrs.h>
     13#include <rdma/opa_addr.h>
     14#include <rdma/uverbs_ioctl.h>
     15#include "qp.h"
     16#include "vt.h"
     17#include "trace.h"
     18
     19#define RVT_RWQ_COUNT_THRESHOLD 16
     20
     21static void rvt_rc_timeout(struct timer_list *t);
     22static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
     23			 enum ib_qp_type type);
     24
     25/*
     26 * Convert the AETH RNR timeout code into the number of microseconds.
     27 */
     28static const u32 ib_rvt_rnr_table[32] = {
     29	655360, /* 00: 655.36 */
     30	10,     /* 01:    .01 */
      31	20,     /* 02:    .02 */
     32	30,     /* 03:    .03 */
     33	40,     /* 04:    .04 */
     34	60,     /* 05:    .06 */
     35	80,     /* 06:    .08 */
     36	120,    /* 07:    .12 */
     37	160,    /* 08:    .16 */
     38	240,    /* 09:    .24 */
     39	320,    /* 0A:    .32 */
     40	480,    /* 0B:    .48 */
     41	640,    /* 0C:    .64 */
     42	960,    /* 0D:    .96 */
     43	1280,   /* 0E:   1.28 */
     44	1920,   /* 0F:   1.92 */
     45	2560,   /* 10:   2.56 */
     46	3840,   /* 11:   3.84 */
     47	5120,   /* 12:   5.12 */
     48	7680,   /* 13:   7.68 */
     49	10240,  /* 14:  10.24 */
     50	15360,  /* 15:  15.36 */
     51	20480,  /* 16:  20.48 */
     52	30720,  /* 17:  30.72 */
     53	40960,  /* 18:  40.96 */
     54	61440,  /* 19:  61.44 */
     55	81920,  /* 1A:  81.92 */
     56	122880, /* 1B: 122.88 */
     57	163840, /* 1C: 163.84 */
     58	245760, /* 1D: 245.76 */
     59	327680, /* 1E: 327.68 */
     60	491520  /* 1F: 491.52 */
     61};
     62
     63/*
     64 * Note that it is OK to post send work requests in the SQE and ERR
     65 * states; rvt_do_send() will process them and generate error
     66 * completions as per IB 1.2 C10-96.
     67 */
     68const int ib_rvt_state_ops[IB_QPS_ERR + 1] = {
     69	[IB_QPS_RESET] = 0,
     70	[IB_QPS_INIT] = RVT_POST_RECV_OK,
     71	[IB_QPS_RTR] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK,
     72	[IB_QPS_RTS] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
     73	    RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK |
     74	    RVT_PROCESS_NEXT_SEND_OK,
     75	[IB_QPS_SQD] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
     76	    RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK,
     77	[IB_QPS_SQE] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
     78	    RVT_POST_SEND_OK | RVT_FLUSH_SEND,
     79	[IB_QPS_ERR] = RVT_POST_RECV_OK | RVT_FLUSH_RECV |
     80	    RVT_POST_SEND_OK | RVT_FLUSH_SEND,
     81};
     82EXPORT_SYMBOL(ib_rvt_state_ops);
     83
     84/* platform specific: return the last level cache (llc) size, in KiB */
     85static int rvt_wss_llc_size(void)
     86{
     87	/* assume that the boot CPU value is universal for all CPUs */
     88	return boot_cpu_data.x86_cache_size;
     89}
     90
     91/* platform specific: cacheless copy */
     92static void cacheless_memcpy(void *dst, void *src, size_t n)
     93{
     94	/*
     95	 * Use the only available X64 cacheless copy.  Add a __user cast
      96	 * to quiet sparse.  The src argument is already in the kernel so
     97	 * there are no security issues.  The extra fault recovery machinery
     98	 * is not invoked.
     99	 */
    100	__copy_user_nocache(dst, (void __user *)src, n, 0);
    101}
    102
    103void rvt_wss_exit(struct rvt_dev_info *rdi)
    104{
    105	struct rvt_wss *wss = rdi->wss;
    106
    107	if (!wss)
    108		return;
    109
    110	/* coded to handle partially initialized and repeat callers */
    111	kfree(wss->entries);
    112	wss->entries = NULL;
    113	kfree(rdi->wss);
    114	rdi->wss = NULL;
    115}
    116
    117/*
    118 * rvt_wss_init - Init wss data structures
    119 *
    120 * Return: 0 on success
    121 */
    122int rvt_wss_init(struct rvt_dev_info *rdi)
    123{
    124	unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode;
    125	unsigned int wss_threshold = rdi->dparms.wss_threshold;
    126	unsigned int wss_clean_period = rdi->dparms.wss_clean_period;
    127	long llc_size;
    128	long llc_bits;
    129	long table_size;
    130	long table_bits;
    131	struct rvt_wss *wss;
    132	int node = rdi->dparms.node;
    133
    134	if (sge_copy_mode != RVT_SGE_COPY_ADAPTIVE) {
    135		rdi->wss = NULL;
    136		return 0;
    137	}
    138
    139	rdi->wss = kzalloc_node(sizeof(*rdi->wss), GFP_KERNEL, node);
    140	if (!rdi->wss)
    141		return -ENOMEM;
    142	wss = rdi->wss;
    143
    144	/* check for a valid percent range - default to 80 if none or invalid */
    145	if (wss_threshold < 1 || wss_threshold > 100)
    146		wss_threshold = 80;
    147
    148	/* reject a wildly large period */
    149	if (wss_clean_period > 1000000)
    150		wss_clean_period = 256;
    151
    152	/* reject a zero period */
    153	if (wss_clean_period == 0)
    154		wss_clean_period = 1;
    155
    156	/*
    157	 * Calculate the table size - the next power of 2 larger than the
    158	 * LLC size.  LLC size is in KiB.
    159	 */
    160	llc_size = rvt_wss_llc_size() * 1024;
    161	table_size = roundup_pow_of_two(llc_size);
    162
    163	/* one bit per page in rounded up table */
    164	llc_bits = llc_size / PAGE_SIZE;
    165	table_bits = table_size / PAGE_SIZE;
    166	wss->pages_mask = table_bits - 1;
    167	wss->num_entries = table_bits / BITS_PER_LONG;
    168
    169	wss->threshold = (llc_bits * wss_threshold) / 100;
    170	if (wss->threshold == 0)
    171		wss->threshold = 1;
    172
    173	wss->clean_period = wss_clean_period;
    174	atomic_set(&wss->clean_counter, wss_clean_period);
    175
    176	wss->entries = kcalloc_node(wss->num_entries, sizeof(*wss->entries),
    177				    GFP_KERNEL, node);
    178	if (!wss->entries) {
    179		rvt_wss_exit(rdi);
    180		return -ENOMEM;
    181	}
    182
    183	return 0;
    184}
    185
    186/*
    187 * Advance the clean counter.  When the clean period has expired,
    188 * clean an entry.
    189 *
    190 * This is implemented in atomics to avoid locking.  Because multiple
     191	 * variables are involved, it can be racy, which can lead to slightly
    192 * inaccurate information.  Since this is only a heuristic, this is
     193	 * OK.  Any inaccuracies will clean themselves out as the counter
    194 * advances.  That said, it is unlikely the entry clean operation will
    195 * race - the next possible racer will not start until the next clean
    196 * period.
    197 *
    198 * The clean counter is implemented as a decrement to zero.  When zero
    199 * is reached an entry is cleaned.
    200 */
    201static void wss_advance_clean_counter(struct rvt_wss *wss)
    202{
    203	int entry;
    204	int weight;
    205	unsigned long bits;
    206
    207	/* become the cleaner if we decrement the counter to zero */
    208	if (atomic_dec_and_test(&wss->clean_counter)) {
    209		/*
    210		 * Set, not add, the clean period.  This avoids an issue
    211		 * where the counter could decrement below the clean period.
    212		 * Doing a set can result in lost decrements, slowing the
     213	 * clean advance.  Since this is a heuristic, this possible
    214		 * slowdown is OK.
    215		 *
    216		 * An alternative is to loop, advancing the counter by a
    217		 * clean period until the result is > 0. However, this could
    218		 * lead to several threads keeping another in the clean loop.
    219		 * This could be mitigated by limiting the number of times
    220		 * we stay in the loop.
    221		 */
    222		atomic_set(&wss->clean_counter, wss->clean_period);
    223
    224		/*
    225		 * Uniquely grab the entry to clean and move to next.
    226		 * The current entry is always the lower bits of
    227		 * wss.clean_entry.  The table size, wss.num_entries,
    228		 * is always a power-of-2.
    229		 */
    230		entry = (atomic_inc_return(&wss->clean_entry) - 1)
    231			& (wss->num_entries - 1);
    232
    233		/* clear the entry and count the bits */
    234		bits = xchg(&wss->entries[entry], 0);
    235		weight = hweight64((u64)bits);
    236		/* only adjust the contended total count if needed */
    237		if (weight)
    238			atomic_sub(weight, &wss->total_count);
    239	}
    240}
    241
    242/*
    243 * Insert the given address into the working set array.
    244 */
    245static void wss_insert(struct rvt_wss *wss, void *address)
    246{
    247	u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss->pages_mask;
    248	u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
    249	u32 nr = page & (BITS_PER_LONG - 1);
    250
    251	if (!test_and_set_bit(nr, &wss->entries[entry]))
    252		atomic_inc(&wss->total_count);
    253
    254	wss_advance_clean_counter(wss);
    255}
    256
    257/*
    258 * Is the working set larger than the threshold?
    259 */
    260static inline bool wss_exceeds_threshold(struct rvt_wss *wss)
    261{
    262	return atomic_read(&wss->total_count) >= wss->threshold;
    263}
    264
    265static void get_map_page(struct rvt_qpn_table *qpt,
    266			 struct rvt_qpn_map *map)
    267{
    268	unsigned long page = get_zeroed_page(GFP_KERNEL);
    269
    270	/*
    271	 * Free the page if someone raced with us installing it.
    272	 */
    273
    274	spin_lock(&qpt->lock);
    275	if (map->page)
    276		free_page(page);
    277	else
    278		map->page = (void *)page;
    279	spin_unlock(&qpt->lock);
    280}
    281
    282/**
    283 * init_qpn_table - initialize the QP number table for a device
    284 * @rdi: rvt dev struct
    285 * @qpt: the QPN table
    286 */
    287static int init_qpn_table(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt)
    288{
    289	u32 offset, i;
    290	struct rvt_qpn_map *map;
    291	int ret = 0;
    292
    293	if (!(rdi->dparms.qpn_res_end >= rdi->dparms.qpn_res_start))
    294		return -EINVAL;
    295
    296	spin_lock_init(&qpt->lock);
    297
    298	qpt->last = rdi->dparms.qpn_start;
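        	/*
        	 * The increment is scaled by the QoS shift so that allocated
        	 * QPNs keep their low-order QoS bits clear for the driver
        	 * (see the WARN_ON() in alloc_qpn()).
        	 */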
    299	qpt->incr = rdi->dparms.qpn_inc << rdi->dparms.qos_shift;
    300
    301	/*
     302	 * Drivers may want some QPs beyond what we need for verbs; let them use
     303	 * our qpn table. No need for two. Let's go ahead and mark the bitmaps
    304	 * for those. The reserved range must be *after* the range which verbs
    305	 * will pick from.
    306	 */
    307
    308	/* Figure out number of bit maps needed before reserved range */
    309	qpt->nmaps = rdi->dparms.qpn_res_start / RVT_BITS_PER_PAGE;
    310
    311	/* This should always be zero */
    312	offset = rdi->dparms.qpn_res_start & RVT_BITS_PER_PAGE_MASK;
    313
    314	/* Starting with the first reserved bit map */
    315	map = &qpt->map[qpt->nmaps];
    316
    317	rvt_pr_info(rdi, "Reserving QPNs from 0x%x to 0x%x for non-verbs use\n",
    318		    rdi->dparms.qpn_res_start, rdi->dparms.qpn_res_end);
    319	for (i = rdi->dparms.qpn_res_start; i <= rdi->dparms.qpn_res_end; i++) {
    320		if (!map->page) {
    321			get_map_page(qpt, map);
    322			if (!map->page) {
    323				ret = -ENOMEM;
    324				break;
    325			}
    326		}
    327		set_bit(offset, map->page);
    328		offset++;
    329		if (offset == RVT_BITS_PER_PAGE) {
    330			/* next page */
    331			qpt->nmaps++;
    332			map++;
    333			offset = 0;
    334		}
    335	}
    336	return ret;
    337}
    338
    339/**
    340 * free_qpn_table - free the QP number table for a device
    341 * @qpt: the QPN table
    342 */
    343static void free_qpn_table(struct rvt_qpn_table *qpt)
    344{
    345	int i;
    346
    347	for (i = 0; i < ARRAY_SIZE(qpt->map); i++)
    348		free_page((unsigned long)qpt->map[i].page);
    349}
    350
    351/**
    352 * rvt_driver_qp_init - Init driver qp resources
     353 * @rdi: rvt dev structure
    354 *
    355 * Return: 0 on success
    356 */
    357int rvt_driver_qp_init(struct rvt_dev_info *rdi)
    358{
    359	int i;
    360	int ret = -ENOMEM;
    361
    362	if (!rdi->dparms.qp_table_size)
    363		return -EINVAL;
    364
    365	/*
    366	 * If driver is not doing any QP allocation then make sure it is
    367	 * providing the necessary QP functions.
    368	 */
    369	if (!rdi->driver_f.free_all_qps ||
    370	    !rdi->driver_f.qp_priv_alloc ||
    371	    !rdi->driver_f.qp_priv_free ||
    372	    !rdi->driver_f.notify_qp_reset ||
    373	    !rdi->driver_f.notify_restart_rc)
    374		return -EINVAL;
    375
    376	/* allocate parent object */
    377	rdi->qp_dev = kzalloc_node(sizeof(*rdi->qp_dev), GFP_KERNEL,
    378				   rdi->dparms.node);
    379	if (!rdi->qp_dev)
    380		return -ENOMEM;
    381
    382	/* allocate hash table */
    383	rdi->qp_dev->qp_table_size = rdi->dparms.qp_table_size;
    384	rdi->qp_dev->qp_table_bits = ilog2(rdi->dparms.qp_table_size);
    385	rdi->qp_dev->qp_table =
    386		kmalloc_array_node(rdi->qp_dev->qp_table_size,
    387			     sizeof(*rdi->qp_dev->qp_table),
    388			     GFP_KERNEL, rdi->dparms.node);
    389	if (!rdi->qp_dev->qp_table)
    390		goto no_qp_table;
    391
    392	for (i = 0; i < rdi->qp_dev->qp_table_size; i++)
    393		RCU_INIT_POINTER(rdi->qp_dev->qp_table[i], NULL);
    394
    395	spin_lock_init(&rdi->qp_dev->qpt_lock);
    396
    397	/* initialize qpn map */
    398	if (init_qpn_table(rdi, &rdi->qp_dev->qpn_table))
    399		goto fail_table;
    400
    401	spin_lock_init(&rdi->n_qps_lock);
    402
    403	return 0;
    404
    405fail_table:
    406	kfree(rdi->qp_dev->qp_table);
    407	free_qpn_table(&rdi->qp_dev->qpn_table);
    408
    409no_qp_table:
    410	kfree(rdi->qp_dev);
    411
    412	return ret;
    413}
    414
    415/**
    416 * rvt_free_qp_cb - callback function to reset a qp
    417 * @qp: the qp to reset
    418 * @v: a 64-bit value
    419 *
    420 * This function resets the qp and removes it from the
    421 * qp hash table.
    422 */
    423static void rvt_free_qp_cb(struct rvt_qp *qp, u64 v)
    424{
    425	unsigned int *qp_inuse = (unsigned int *)v;
    426	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
    427
    428	/* Reset the qp and remove it from the qp hash list */
    429	rvt_reset_qp(rdi, qp, qp->ibqp.qp_type);
    430
    431	/* Increment the qp_inuse count */
    432	(*qp_inuse)++;
    433}
    434
    435/**
    436 * rvt_free_all_qps - check for QPs still in use
    437 * @rdi: rvt device info structure
    438 *
    439 * There should not be any QPs still in use.
    440 * Free memory for table.
    441 * Return the number of QPs still in use.
    442 */
    443static unsigned rvt_free_all_qps(struct rvt_dev_info *rdi)
    444{
    445	unsigned int qp_inuse = 0;
    446
    447	qp_inuse += rvt_mcast_tree_empty(rdi);
    448
    449	rvt_qp_iter(rdi, (u64)&qp_inuse, rvt_free_qp_cb);
    450
    451	return qp_inuse;
    452}
    453
    454/**
    455 * rvt_qp_exit - clean up qps on device exit
    456 * @rdi: rvt dev structure
    457 *
    458 * Check for qp leaks and free resources.
    459 */
    460void rvt_qp_exit(struct rvt_dev_info *rdi)
    461{
    462	u32 qps_inuse = rvt_free_all_qps(rdi);
    463
    464	if (qps_inuse)
    465		rvt_pr_err(rdi, "QP memory leak! %u still in use\n",
    466			   qps_inuse);
    467	if (!rdi->qp_dev)
    468		return;
    469
    470	kfree(rdi->qp_dev->qp_table);
    471	free_qpn_table(&rdi->qp_dev->qpn_table);
    472	kfree(rdi->qp_dev);
    473}
    474
    475static inline unsigned mk_qpn(struct rvt_qpn_table *qpt,
    476			      struct rvt_qpn_map *map, unsigned off)
    477{
    478	return (map - qpt->map) * RVT_BITS_PER_PAGE + off;
    479}
    480
    481/**
    482 * alloc_qpn - Allocate the next available qpn or zero/one for QP type
    483 *	       IB_QPT_SMI/IB_QPT_GSI
    484 * @rdi: rvt device info structure
    485 * @qpt: queue pair number table pointer
    486 * @type: the QP type
    487 * @port_num: IB port number, 1 based, comes from core
    488 * @exclude_prefix: prefix of special queue pair number being allocated
    489 *
    490 * Return: The queue pair number
    491 */
    492static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt,
    493		     enum ib_qp_type type, u8 port_num, u8 exclude_prefix)
    494{
    495	u32 i, offset, max_scan, qpn;
    496	struct rvt_qpn_map *map;
    497	u32 ret;
    498	u32 max_qpn = exclude_prefix == RVT_AIP_QP_PREFIX ?
    499		RVT_AIP_QPN_MAX : RVT_QPN_MAX;
    500
    501	if (rdi->driver_f.alloc_qpn)
    502		return rdi->driver_f.alloc_qpn(rdi, qpt, type, port_num);
    503
    504	if (type == IB_QPT_SMI || type == IB_QPT_GSI) {
    505		unsigned n;
    506
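        		/*
        		 * Special QPs have fixed numbers: QP0 for SMI and QP1 for
        		 * GSI.  Track which of the two is already allocated on each
        		 * port using one flag bit apiece (two bits per port).
        		 */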
    507		ret = type == IB_QPT_GSI;
    508		n = 1 << (ret + 2 * (port_num - 1));
    509		spin_lock(&qpt->lock);
    510		if (qpt->flags & n)
    511			ret = -EINVAL;
    512		else
    513			qpt->flags |= n;
    514		spin_unlock(&qpt->lock);
    515		goto bail;
    516	}
    517
    518	qpn = qpt->last + qpt->incr;
    519	if (qpn >= max_qpn)
    520		qpn = qpt->incr | ((qpt->last & 1) ^ 1);
    521	/* offset carries bit 0 */
    522	offset = qpn & RVT_BITS_PER_PAGE_MASK;
    523	map = &qpt->map[qpn / RVT_BITS_PER_PAGE];
    524	max_scan = qpt->nmaps - !offset;
    525	for (i = 0;;) {
    526		if (unlikely(!map->page)) {
    527			get_map_page(qpt, map);
    528			if (unlikely(!map->page))
    529				break;
    530		}
    531		do {
    532			if (!test_and_set_bit(offset, map->page)) {
    533				qpt->last = qpn;
    534				ret = qpn;
    535				goto bail;
    536			}
    537			offset += qpt->incr;
    538			/*
     539			 * This qpn might be bogus if offset >= RVT_BITS_PER_PAGE.
     540			 * That is OK.  It gets re-assigned below.
    541			 */
    542			qpn = mk_qpn(qpt, map, offset);
    543		} while (offset < RVT_BITS_PER_PAGE && qpn < RVT_QPN_MAX);
    544		/*
    545		 * In order to keep the number of pages allocated to a
     546	 * minimum, we scan all the existing pages before increasing
    547		 * the size of the bitmap table.
    548		 */
    549		if (++i > max_scan) {
    550			if (qpt->nmaps == RVT_QPNMAP_ENTRIES)
    551				break;
    552			map = &qpt->map[qpt->nmaps++];
    553			/* start at incr with current bit 0 */
    554			offset = qpt->incr | (offset & 1);
    555		} else if (map < &qpt->map[qpt->nmaps]) {
    556			++map;
    557			/* start at incr with current bit 0 */
    558			offset = qpt->incr | (offset & 1);
    559		} else {
    560			map = &qpt->map[0];
    561			/* wrap to first map page, invert bit 0 */
    562			offset = qpt->incr | ((offset & 1) ^ 1);
    563		}
    564		/* there can be no set bits in low-order QoS bits */
    565		WARN_ON(rdi->dparms.qos_shift > 1 &&
    566			offset & ((BIT(rdi->dparms.qos_shift - 1) - 1) << 1));
    567		qpn = mk_qpn(qpt, map, offset);
    568	}
    569
    570	ret = -ENOMEM;
    571
    572bail:
    573	return ret;
    574}
    575
    576/**
     577 * rvt_clear_mr_refs - Drop held mr refs
    578 * @qp: rvt qp data structure
     579 * @clr_sends: whether to clear the send side or not
    580 */
    581static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
    582{
    583	unsigned n;
    584	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
    585
    586	if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags))
    587		rvt_put_ss(&qp->s_rdma_read_sge);
    588
    589	rvt_put_ss(&qp->r_sge);
    590
    591	if (clr_sends) {
    592		while (qp->s_last != qp->s_head) {
    593			struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_last);
    594
    595			rvt_put_qp_swqe(qp, wqe);
    596			if (++qp->s_last >= qp->s_size)
    597				qp->s_last = 0;
    598			smp_wmb(); /* see qp_set_savail */
    599		}
    600		if (qp->s_rdma_mr) {
    601			rvt_put_mr(qp->s_rdma_mr);
    602			qp->s_rdma_mr = NULL;
    603		}
    604	}
    605
    606	for (n = 0; qp->s_ack_queue && n < rvt_max_atomic(rdi); n++) {
    607		struct rvt_ack_entry *e = &qp->s_ack_queue[n];
    608
    609		if (e->rdma_sge.mr) {
    610			rvt_put_mr(e->rdma_sge.mr);
    611			e->rdma_sge.mr = NULL;
    612		}
    613	}
    614}
    615
    616/**
    617 * rvt_swqe_has_lkey - return true if lkey is used by swqe
    618 * @wqe: the send wqe
    619 * @lkey: the lkey
    620 *
    621 * Test the swqe for using lkey
    622 */
    623static bool rvt_swqe_has_lkey(struct rvt_swqe *wqe, u32 lkey)
    624{
    625	int i;
    626
    627	for (i = 0; i < wqe->wr.num_sge; i++) {
    628		struct rvt_sge *sge = &wqe->sg_list[i];
    629
    630		if (rvt_mr_has_lkey(sge->mr, lkey))
    631			return true;
    632	}
    633	return false;
    634}
    635
    636/**
     637 * rvt_qp_sends_has_lkey - return true if qp sends use lkey
    638 * @qp: the rvt_qp
    639 * @lkey: the lkey
    640 */
    641static bool rvt_qp_sends_has_lkey(struct rvt_qp *qp, u32 lkey)
    642{
    643	u32 s_last = qp->s_last;
    644
    645	while (s_last != qp->s_head) {
    646		struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, s_last);
    647
    648		if (rvt_swqe_has_lkey(wqe, lkey))
    649			return true;
    650
    651		if (++s_last >= qp->s_size)
    652			s_last = 0;
    653	}
    654	if (qp->s_rdma_mr)
    655		if (rvt_mr_has_lkey(qp->s_rdma_mr, lkey))
    656			return true;
    657	return false;
    658}
    659
    660/**
    661 * rvt_qp_acks_has_lkey - return true if acks have lkey
    662 * @qp: the qp
    663 * @lkey: the lkey
    664 */
    665static bool rvt_qp_acks_has_lkey(struct rvt_qp *qp, u32 lkey)
    666{
    667	int i;
    668	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
    669
    670	for (i = 0; qp->s_ack_queue && i < rvt_max_atomic(rdi); i++) {
    671		struct rvt_ack_entry *e = &qp->s_ack_queue[i];
    672
    673		if (rvt_mr_has_lkey(e->rdma_sge.mr, lkey))
    674			return true;
    675	}
    676	return false;
    677}
    678
    679/**
    680 * rvt_qp_mr_clean - clean up remote ops for lkey
    681 * @qp: the qp
    682 * @lkey: the lkey that is being de-registered
    683 *
    684 * This routine checks if the lkey is being used by
    685 * the qp.
    686 *
     687 * If so, the qp is put into an error state to eliminate
    688 * any references from the qp.
    689 */
    690void rvt_qp_mr_clean(struct rvt_qp *qp, u32 lkey)
    691{
    692	bool lastwqe = false;
    693
    694	if (qp->ibqp.qp_type == IB_QPT_SMI ||
    695	    qp->ibqp.qp_type == IB_QPT_GSI)
    696		/* avoid special QPs */
    697		return;
    698	spin_lock_irq(&qp->r_lock);
    699	spin_lock(&qp->s_hlock);
    700	spin_lock(&qp->s_lock);
    701
    702	if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET)
    703		goto check_lwqe;
    704
    705	if (rvt_ss_has_lkey(&qp->r_sge, lkey) ||
    706	    rvt_qp_sends_has_lkey(qp, lkey) ||
    707	    rvt_qp_acks_has_lkey(qp, lkey))
    708		lastwqe = rvt_error_qp(qp, IB_WC_LOC_PROT_ERR);
    709check_lwqe:
    710	spin_unlock(&qp->s_lock);
    711	spin_unlock(&qp->s_hlock);
    712	spin_unlock_irq(&qp->r_lock);
    713	if (lastwqe) {
    714		struct ib_event ev;
    715
    716		ev.device = qp->ibqp.device;
    717		ev.element.qp = &qp->ibqp;
    718		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
    719		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
    720	}
    721}
    722
    723/**
     724 * rvt_remove_qp - remove qp from table
    725 * @rdi: rvt dev struct
    726 * @qp: qp to remove
    727 *
    728 * Remove the QP from the table so it can't be found asynchronously by
    729 * the receive routine.
    730 */
    731static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
    732{
    733	struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
    734	u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits);
    735	unsigned long flags;
    736	int removed = 1;
    737
    738	spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags);
    739
    740	if (rcu_dereference_protected(rvp->qp[0],
    741			lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) {
    742		RCU_INIT_POINTER(rvp->qp[0], NULL);
    743	} else if (rcu_dereference_protected(rvp->qp[1],
    744			lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) {
    745		RCU_INIT_POINTER(rvp->qp[1], NULL);
    746	} else {
    747		struct rvt_qp *q;
    748		struct rvt_qp __rcu **qpp;
    749
    750		removed = 0;
    751		qpp = &rdi->qp_dev->qp_table[n];
    752		for (; (q = rcu_dereference_protected(*qpp,
    753			lockdep_is_held(&rdi->qp_dev->qpt_lock))) != NULL;
    754			qpp = &q->next) {
    755			if (q == qp) {
    756				RCU_INIT_POINTER(*qpp,
    757				     rcu_dereference_protected(qp->next,
    758				     lockdep_is_held(&rdi->qp_dev->qpt_lock)));
    759				removed = 1;
    760				trace_rvt_qpremove(qp, n);
    761				break;
    762			}
    763		}
    764	}
    765
    766	spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags);
    767	if (removed) {
    768		synchronize_rcu();
    769		rvt_put_qp(qp);
    770	}
    771}
    772
    773/**
    774 * rvt_alloc_rq - allocate memory for user or kernel buffer
    775 * @rq: receive queue data structure
     776 * @size: size of the receive queue buffer, in bytes
    777 * @node: The NUMA node
     778 * @udata: true if user data is available, false otherwise
    779 *
     780 * Return: -ENOMEM if memory allocation fails, otherwise 0.
    781 * This function is used by both shared receive
    782 * queues and non-shared receive queues to allocate
    783 * memory.
    784 */
    785int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node,
    786		 struct ib_udata *udata)
    787{
    788	if (udata) {
    789		rq->wq = vmalloc_user(sizeof(struct rvt_rwq) + size);
    790		if (!rq->wq)
    791			goto bail;
    792		/* need kwq with no buffers */
    793		rq->kwq = kzalloc_node(sizeof(*rq->kwq), GFP_KERNEL, node);
    794		if (!rq->kwq)
    795			goto bail;
    796		rq->kwq->curr_wq = rq->wq->wq;
    797	} else {
    798		/* need kwq with buffers */
    799		rq->kwq =
    800			vzalloc_node(sizeof(struct rvt_krwq) + size, node);
    801		if (!rq->kwq)
    802			goto bail;
    803		rq->kwq->curr_wq = rq->kwq->wq;
    804	}
    805
    806	spin_lock_init(&rq->kwq->p_lock);
    807	spin_lock_init(&rq->kwq->c_lock);
    808	return 0;
    809bail:
    810	rvt_free_rq(rq);
    811	return -ENOMEM;
    812}
    813
    814/**
    815 * rvt_init_qp - initialize the QP state to the reset state
    816 * @rdi: rvt dev struct
    817 * @qp: the QP to init or reinit
    818 * @type: the QP type
    819 *
    820 * This function is called from both rvt_create_qp() and
    821 * rvt_reset_qp().   The difference is that the reset
     822 * path takes the necessary locks to protect against concurrent
    823 * access.
    824 */
    825static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
    826			enum ib_qp_type type)
    827{
    828	qp->remote_qpn = 0;
    829	qp->qkey = 0;
    830	qp->qp_access_flags = 0;
    831	qp->s_flags &= RVT_S_SIGNAL_REQ_WR;
    832	qp->s_hdrwords = 0;
    833	qp->s_wqe = NULL;
    834	qp->s_draining = 0;
    835	qp->s_next_psn = 0;
    836	qp->s_last_psn = 0;
    837	qp->s_sending_psn = 0;
    838	qp->s_sending_hpsn = 0;
    839	qp->s_psn = 0;
    840	qp->r_psn = 0;
    841	qp->r_msn = 0;
    842	if (type == IB_QPT_RC) {
    843		qp->s_state = IB_OPCODE_RC_SEND_LAST;
    844		qp->r_state = IB_OPCODE_RC_SEND_LAST;
    845	} else {
    846		qp->s_state = IB_OPCODE_UC_SEND_LAST;
    847		qp->r_state = IB_OPCODE_UC_SEND_LAST;
    848	}
    849	qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
    850	qp->r_nak_state = 0;
    851	qp->r_aflags = 0;
    852	qp->r_flags = 0;
    853	qp->s_head = 0;
    854	qp->s_tail = 0;
    855	qp->s_cur = 0;
    856	qp->s_acked = 0;
    857	qp->s_last = 0;
    858	qp->s_ssn = 1;
    859	qp->s_lsn = 0;
    860	qp->s_mig_state = IB_MIG_MIGRATED;
    861	qp->r_head_ack_queue = 0;
    862	qp->s_tail_ack_queue = 0;
    863	qp->s_acked_ack_queue = 0;
    864	qp->s_num_rd_atomic = 0;
    865	qp->r_sge.num_sge = 0;
    866	atomic_set(&qp->s_reserved_used, 0);
    867}
    868
    869/**
    870 * _rvt_reset_qp - initialize the QP state to the reset state
    871 * @rdi: rvt dev struct
    872 * @qp: the QP to reset
    873 * @type: the QP type
    874 *
    875 * r_lock, s_hlock, and s_lock are required to be held by the caller
    876 */
    877static void _rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
    878			  enum ib_qp_type type)
    879	__must_hold(&qp->s_lock)
    880	__must_hold(&qp->s_hlock)
    881	__must_hold(&qp->r_lock)
    882{
    883	lockdep_assert_held(&qp->r_lock);
    884	lockdep_assert_held(&qp->s_hlock);
    885	lockdep_assert_held(&qp->s_lock);
    886	if (qp->state != IB_QPS_RESET) {
    887		qp->state = IB_QPS_RESET;
    888
    889		/* Let drivers flush their waitlist */
    890		rdi->driver_f.flush_qp_waiters(qp);
    891		rvt_stop_rc_timers(qp);
    892		qp->s_flags &= ~(RVT_S_TIMER | RVT_S_ANY_WAIT);
    893		spin_unlock(&qp->s_lock);
    894		spin_unlock(&qp->s_hlock);
    895		spin_unlock_irq(&qp->r_lock);
    896
    897		/* Stop the send queue and the retry timer */
    898		rdi->driver_f.stop_send_queue(qp);
    899		rvt_del_timers_sync(qp);
    900		/* Wait for things to stop */
    901		rdi->driver_f.quiesce_qp(qp);
    902
     903		/* take qp out of the hash and wait for it to be unused */
    904		rvt_remove_qp(rdi, qp);
    905
     906		/* grab the locks b/c they were held at call time */
    907		spin_lock_irq(&qp->r_lock);
    908		spin_lock(&qp->s_hlock);
    909		spin_lock(&qp->s_lock);
    910
    911		rvt_clear_mr_refs(qp, 1);
    912		/*
    913		 * Let the driver do any tear down or re-init it needs to for
    914		 * a qp that has been reset
    915		 */
    916		rdi->driver_f.notify_qp_reset(qp);
    917	}
    918	rvt_init_qp(rdi, qp, type);
    919	lockdep_assert_held(&qp->r_lock);
    920	lockdep_assert_held(&qp->s_hlock);
    921	lockdep_assert_held(&qp->s_lock);
    922}
    923
    924/**
    925 * rvt_reset_qp - initialize the QP state to the reset state
    926 * @rdi: the device info
    927 * @qp: the QP to reset
    928 * @type: the QP type
    929 *
    930 * This is the wrapper function to acquire the r_lock, s_hlock, and s_lock
    931 * before calling _rvt_reset_qp().
    932 */
    933static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
    934			 enum ib_qp_type type)
    935{
    936	spin_lock_irq(&qp->r_lock);
    937	spin_lock(&qp->s_hlock);
    938	spin_lock(&qp->s_lock);
    939	_rvt_reset_qp(rdi, qp, type);
    940	spin_unlock(&qp->s_lock);
    941	spin_unlock(&qp->s_hlock);
    942	spin_unlock_irq(&qp->r_lock);
    943}
    944
    945/**
    946 * rvt_free_qpn - Free a qpn from the bit map
    947 * @qpt: QP table
    948 * @qpn: queue pair number to free
    949 */
    950static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
    951{
    952	struct rvt_qpn_map *map;
    953
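        	/* AIP (netdev) QPNs carry a prefix; strip it before indexing the bitmap */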
    954	if ((qpn & RVT_AIP_QP_PREFIX_MASK) == RVT_AIP_QP_BASE)
    955		qpn &= RVT_AIP_QP_SUFFIX;
    956
    957	map = qpt->map + (qpn & RVT_QPN_MASK) / RVT_BITS_PER_PAGE;
    958	if (map->page)
    959		clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page);
    960}
    961
    962/**
    963 * get_allowed_ops - Given a QP type return the appropriate allowed OP
    964 * @type: valid, supported, QP type
    965 */
    966static u8 get_allowed_ops(enum ib_qp_type type)
    967{
    968	return type == IB_QPT_RC ? IB_OPCODE_RC : type == IB_QPT_UC ?
    969		IB_OPCODE_UC : IB_OPCODE_UD;
    970}
    971
    972/**
    973 * free_ud_wq_attr - Clean up AH attribute cache for UD QPs
    974 * @qp: Valid QP with allowed_ops set
    975 *
    976 * The rvt_swqe data structure being used is a union, so this is
    977 * only valid for UD QPs.
    978 */
    979static void free_ud_wq_attr(struct rvt_qp *qp)
    980{
    981	struct rvt_swqe *wqe;
    982	int i;
    983
    984	for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
    985		wqe = rvt_get_swqe_ptr(qp, i);
    986		kfree(wqe->ud_wr.attr);
    987		wqe->ud_wr.attr = NULL;
    988	}
    989}
    990
    991/**
    992 * alloc_ud_wq_attr - AH attribute cache for UD QPs
    993 * @qp: Valid QP with allowed_ops set
    994 * @node: Numa node for allocation
    995 *
    996 * The rvt_swqe data structure being used is a union, so this is
    997 * only valid for UD QPs.
    998 */
    999static int alloc_ud_wq_attr(struct rvt_qp *qp, int node)
   1000{
   1001	struct rvt_swqe *wqe;
   1002	int i;
   1003
   1004	for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
   1005		wqe = rvt_get_swqe_ptr(qp, i);
   1006		wqe->ud_wr.attr = kzalloc_node(sizeof(*wqe->ud_wr.attr),
   1007					       GFP_KERNEL, node);
   1008		if (!wqe->ud_wr.attr) {
   1009			free_ud_wq_attr(qp);
   1010			return -ENOMEM;
   1011		}
   1012	}
   1013
   1014	return 0;
   1015}
   1016
   1017/**
   1018 * rvt_create_qp - create a queue pair for a device
   1019 * @ibqp: the queue pair
   1020 * @init_attr: the attributes of the queue pair
   1021 * @udata: user data for libibverbs.so
   1022 *
   1023 * Queue pair creation is mostly an rvt issue. However, drivers have their own
   1024 * unique idea of what queue pair numbers mean. For instance there is a reserved
   1025 * range for PSM.
   1026 *
   1027 * Return: 0 on success, otherwise returns an errno.
   1028 *
   1029 * Called by the ib_create_qp() core verbs function.
   1030 */
   1031int rvt_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr,
   1032		  struct ib_udata *udata)
   1033{
   1034	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
   1035	int ret = -ENOMEM;
   1036	struct rvt_swqe *swq = NULL;
   1037	size_t sz;
   1038	size_t sg_list_sz = 0;
   1039	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
   1040	void *priv = NULL;
   1041	size_t sqsize;
   1042	u8 exclude_prefix = 0;
   1043
   1044	if (!rdi)
   1045		return -EINVAL;
   1046
   1047	if (init_attr->create_flags & ~IB_QP_CREATE_NETDEV_USE)
   1048		return -EOPNOTSUPP;
   1049
   1050	if (init_attr->cap.max_send_sge > rdi->dparms.props.max_send_sge ||
   1051	    init_attr->cap.max_send_wr > rdi->dparms.props.max_qp_wr)
   1052		return -EINVAL;
   1053
   1054	/* Check receive queue parameters if no SRQ is specified. */
   1055	if (!init_attr->srq) {
   1056		if (init_attr->cap.max_recv_sge >
   1057		    rdi->dparms.props.max_recv_sge ||
   1058		    init_attr->cap.max_recv_wr > rdi->dparms.props.max_qp_wr)
   1059			return -EINVAL;
   1060
   1061		if (init_attr->cap.max_send_sge +
   1062		    init_attr->cap.max_send_wr +
   1063		    init_attr->cap.max_recv_sge +
   1064		    init_attr->cap.max_recv_wr == 0)
   1065			return -EINVAL;
   1066	}
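        	/*
        	 * Size the send queue for the requested WRs plus one unused slot
        	 * (so a full ring can be distinguished from an empty one) and any
        	 * operations the driver reserves for internal use.
        	 */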
   1067	sqsize =
   1068		init_attr->cap.max_send_wr + 1 +
   1069		rdi->dparms.reserved_operations;
   1070	switch (init_attr->qp_type) {
   1071	case IB_QPT_SMI:
   1072	case IB_QPT_GSI:
   1073		if (init_attr->port_num == 0 ||
   1074		    init_attr->port_num > ibqp->device->phys_port_cnt)
   1075			return -EINVAL;
   1076		fallthrough;
   1077	case IB_QPT_UC:
   1078	case IB_QPT_RC:
   1079	case IB_QPT_UD:
   1080		sz = struct_size(swq, sg_list, init_attr->cap.max_send_sge);
   1081		swq = vzalloc_node(array_size(sz, sqsize), rdi->dparms.node);
   1082		if (!swq)
   1083			return -ENOMEM;
   1084
   1085		if (init_attr->srq) {
   1086			struct rvt_srq *srq = ibsrq_to_rvtsrq(init_attr->srq);
   1087
   1088			if (srq->rq.max_sge > 1)
   1089				sg_list_sz = sizeof(*qp->r_sg_list) *
   1090					(srq->rq.max_sge - 1);
   1091		} else if (init_attr->cap.max_recv_sge > 1)
   1092			sg_list_sz = sizeof(*qp->r_sg_list) *
   1093				(init_attr->cap.max_recv_sge - 1);
   1094		qp->r_sg_list =
   1095			kzalloc_node(sg_list_sz, GFP_KERNEL, rdi->dparms.node);
   1096		if (!qp->r_sg_list)
   1097			goto bail_qp;
   1098		qp->allowed_ops = get_allowed_ops(init_attr->qp_type);
   1099
   1100		RCU_INIT_POINTER(qp->next, NULL);
   1101		if (init_attr->qp_type == IB_QPT_RC) {
   1102			qp->s_ack_queue =
   1103				kcalloc_node(rvt_max_atomic(rdi),
   1104					     sizeof(*qp->s_ack_queue),
   1105					     GFP_KERNEL,
   1106					     rdi->dparms.node);
   1107			if (!qp->s_ack_queue)
   1108				goto bail_qp;
   1109		}
   1110		/* initialize timers needed for rc qp */
   1111		timer_setup(&qp->s_timer, rvt_rc_timeout, 0);
   1112		hrtimer_init(&qp->s_rnr_timer, CLOCK_MONOTONIC,
   1113			     HRTIMER_MODE_REL);
   1114		qp->s_rnr_timer.function = rvt_rc_rnr_retry;
   1115
   1116		/*
    1117		 * Driver needs to set up its private QP structure and do any
   1118		 * initialization that is needed.
   1119		 */
   1120		priv = rdi->driver_f.qp_priv_alloc(rdi, qp);
   1121		if (IS_ERR(priv)) {
   1122			ret = PTR_ERR(priv);
   1123			goto bail_qp;
   1124		}
   1125		qp->priv = priv;
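        		/* IBTA local ACK timeout is 4.096 usec * 2^timeout, converted to jiffies */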
   1126		qp->timeout_jiffies =
   1127			usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
   1128				1000UL);
   1129		if (init_attr->srq) {
   1130			sz = 0;
   1131		} else {
   1132			qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
   1133			qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
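        			/*
        			 * Each receive WQE is a struct rvt_rwqe followed by
        			 * max_sge scatter/gather entries; sz is the size of
        			 * one such entry.
        			 */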
   1134			sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
   1135				sizeof(struct rvt_rwqe);
   1136			ret = rvt_alloc_rq(&qp->r_rq, qp->r_rq.size * sz,
   1137					   rdi->dparms.node, udata);
   1138			if (ret)
   1139				goto bail_driver_priv;
   1140		}
   1141
   1142		/*
   1143		 * ib_create_qp() will initialize qp->ibqp
   1144		 * except for qp->ibqp.qp_num.
   1145		 */
   1146		spin_lock_init(&qp->r_lock);
   1147		spin_lock_init(&qp->s_hlock);
   1148		spin_lock_init(&qp->s_lock);
   1149		atomic_set(&qp->refcount, 0);
   1150		atomic_set(&qp->local_ops_pending, 0);
   1151		init_waitqueue_head(&qp->wait);
   1152		INIT_LIST_HEAD(&qp->rspwait);
   1153		qp->state = IB_QPS_RESET;
   1154		qp->s_wq = swq;
   1155		qp->s_size = sqsize;
   1156		qp->s_avail = init_attr->cap.max_send_wr;
   1157		qp->s_max_sge = init_attr->cap.max_send_sge;
   1158		if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
   1159			qp->s_flags = RVT_S_SIGNAL_REQ_WR;
   1160		ret = alloc_ud_wq_attr(qp, rdi->dparms.node);
   1161		if (ret)
   1162			goto bail_rq_rvt;
   1163
   1164		if (init_attr->create_flags & IB_QP_CREATE_NETDEV_USE)
   1165			exclude_prefix = RVT_AIP_QP_PREFIX;
   1166
   1167		ret = alloc_qpn(rdi, &rdi->qp_dev->qpn_table,
   1168				init_attr->qp_type,
   1169				init_attr->port_num,
   1170				exclude_prefix);
   1171		if (ret < 0)
   1172			goto bail_rq_wq;
   1173
   1174		qp->ibqp.qp_num = ret;
   1175		if (init_attr->create_flags & IB_QP_CREATE_NETDEV_USE)
   1176			qp->ibqp.qp_num |= RVT_AIP_QP_BASE;
   1177		qp->port_num = init_attr->port_num;
   1178		rvt_init_qp(rdi, qp, init_attr->qp_type);
   1179		if (rdi->driver_f.qp_priv_init) {
   1180			ret = rdi->driver_f.qp_priv_init(rdi, qp, init_attr);
   1181			if (ret)
   1182				goto bail_rq_wq;
   1183		}
   1184		break;
   1185
   1186	default:
   1187		/* Don't support raw QPs */
   1188		return -EOPNOTSUPP;
   1189	}
   1190
   1191	init_attr->cap.max_inline_data = 0;
   1192
   1193	/*
   1194	 * Return the address of the RWQ as the offset to mmap.
   1195	 * See rvt_mmap() for details.
   1196	 */
   1197	if (udata && udata->outlen >= sizeof(__u64)) {
   1198		if (!qp->r_rq.wq) {
   1199			__u64 offset = 0;
   1200
   1201			ret = ib_copy_to_udata(udata, &offset,
   1202					       sizeof(offset));
   1203			if (ret)
   1204				goto bail_qpn;
   1205		} else {
   1206			u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz;
   1207
   1208			qp->ip = rvt_create_mmap_info(rdi, s, udata,
   1209						      qp->r_rq.wq);
   1210			if (IS_ERR(qp->ip)) {
   1211				ret = PTR_ERR(qp->ip);
   1212				goto bail_qpn;
   1213			}
   1214
   1215			ret = ib_copy_to_udata(udata, &qp->ip->offset,
   1216					       sizeof(qp->ip->offset));
   1217			if (ret)
   1218				goto bail_ip;
   1219		}
   1220		qp->pid = current->pid;
   1221	}
   1222
   1223	spin_lock(&rdi->n_qps_lock);
   1224	if (rdi->n_qps_allocated == rdi->dparms.props.max_qp) {
   1225		spin_unlock(&rdi->n_qps_lock);
   1226		ret = -ENOMEM;
   1227		goto bail_ip;
   1228	}
   1229
   1230	rdi->n_qps_allocated++;
   1231	/*
   1232	 * Maintain a busy_jiffies variable that will be added to the timeout
   1233	 * period in mod_retry_timer and add_retry_timer. This busy jiffies
   1234	 * is scaled by the number of rc qps created for the device to reduce
   1235	 * the number of timeouts occurring when there is a large number of
   1236	 * qps. busy_jiffies is incremented every rc qp scaling interval.
   1237	 * The scaling interval is selected based on extensive performance
   1238	 * evaluation of targeted workloads.
   1239	 */
   1240	if (init_attr->qp_type == IB_QPT_RC) {
   1241		rdi->n_rc_qps++;
   1242		rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL;
   1243	}
   1244	spin_unlock(&rdi->n_qps_lock);
   1245
   1246	if (qp->ip) {
   1247		spin_lock_irq(&rdi->pending_lock);
   1248		list_add(&qp->ip->pending_mmaps, &rdi->pending_mmaps);
   1249		spin_unlock_irq(&rdi->pending_lock);
   1250	}
   1251
   1252	return 0;
   1253
   1254bail_ip:
   1255	if (qp->ip)
   1256		kref_put(&qp->ip->ref, rvt_release_mmap_info);
   1257
   1258bail_qpn:
   1259	rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
   1260
   1261bail_rq_wq:
   1262	free_ud_wq_attr(qp);
   1263
   1264bail_rq_rvt:
   1265	rvt_free_rq(&qp->r_rq);
   1266
   1267bail_driver_priv:
   1268	rdi->driver_f.qp_priv_free(rdi, qp);
   1269
   1270bail_qp:
   1271	kfree(qp->s_ack_queue);
   1272	kfree(qp->r_sg_list);
   1273	vfree(swq);
   1274	return ret;
   1275}
   1276
   1277/**
   1278 * rvt_error_qp - put a QP into the error state
   1279 * @qp: the QP to put into the error state
   1280 * @err: the receive completion error to signal if a RWQE is active
   1281 *
   1282 * Flushes both send and receive work queues.
   1283 *
   1284 * Return: true if last WQE event should be generated.
   1285 * The QP r_lock and s_lock should be held and interrupts disabled.
   1286 * If we are already in error state, just return.
   1287 */
   1288int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
   1289{
   1290	struct ib_wc wc;
   1291	int ret = 0;
   1292	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
   1293
   1294	lockdep_assert_held(&qp->r_lock);
   1295	lockdep_assert_held(&qp->s_lock);
   1296	if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET)
   1297		goto bail;
   1298
   1299	qp->state = IB_QPS_ERR;
   1300
   1301	if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
   1302		qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
   1303		del_timer(&qp->s_timer);
   1304	}
   1305
   1306	if (qp->s_flags & RVT_S_ANY_WAIT_SEND)
   1307		qp->s_flags &= ~RVT_S_ANY_WAIT_SEND;
   1308
   1309	rdi->driver_f.notify_error_qp(qp);
   1310
   1311	/* Schedule the sending tasklet to drain the send work queue. */
   1312	if (READ_ONCE(qp->s_last) != qp->s_head)
   1313		rdi->driver_f.schedule_send(qp);
   1314
   1315	rvt_clear_mr_refs(qp, 0);
   1316
   1317	memset(&wc, 0, sizeof(wc));
   1318	wc.qp = &qp->ibqp;
   1319	wc.opcode = IB_WC_RECV;
   1320
   1321	if (test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) {
   1322		wc.wr_id = qp->r_wr_id;
   1323		wc.status = err;
   1324		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
   1325	}
   1326	wc.status = IB_WC_WR_FLUSH_ERR;
   1327
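        	/* Flush any receive WQEs still queued, completing each one in error */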
   1328	if (qp->r_rq.kwq) {
   1329		u32 head;
   1330		u32 tail;
   1331		struct rvt_rwq *wq = NULL;
   1332		struct rvt_krwq *kwq = NULL;
   1333
   1334		spin_lock(&qp->r_rq.kwq->c_lock);
    1335		/* qp->ip is used to check whether a user buffer is mmapped */
   1336		if (qp->ip) {
   1337			wq = qp->r_rq.wq;
   1338			head = RDMA_READ_UAPI_ATOMIC(wq->head);
   1339			tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
   1340		} else {
   1341			kwq = qp->r_rq.kwq;
   1342			head = kwq->head;
   1343			tail = kwq->tail;
   1344		}
   1345		/* sanity check pointers before trusting them */
   1346		if (head >= qp->r_rq.size)
   1347			head = 0;
   1348		if (tail >= qp->r_rq.size)
   1349			tail = 0;
   1350		while (tail != head) {
   1351			wc.wr_id = rvt_get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
   1352			if (++tail >= qp->r_rq.size)
   1353				tail = 0;
   1354			rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
   1355		}
   1356		if (qp->ip)
   1357			RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
   1358		else
   1359			kwq->tail = tail;
   1360		spin_unlock(&qp->r_rq.kwq->c_lock);
   1361	} else if (qp->ibqp.event_handler) {
   1362		ret = 1;
   1363	}
   1364
   1365bail:
   1366	return ret;
   1367}
   1368EXPORT_SYMBOL(rvt_error_qp);
   1369
   1370/*
   1371 * Put the QP into the hash table.
   1372 * The hash table holds a reference to the QP.
   1373 */
   1374static void rvt_insert_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
   1375{
   1376	struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
   1377	unsigned long flags;
   1378
   1379	rvt_get_qp(qp);
   1380	spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags);
   1381
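        	/* QP0 (SMI) and QP1 (GSI) live in per-port pointers, not the hash table */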
   1382	if (qp->ibqp.qp_num <= 1) {
   1383		rcu_assign_pointer(rvp->qp[qp->ibqp.qp_num], qp);
   1384	} else {
   1385		u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits);
   1386
   1387		qp->next = rdi->qp_dev->qp_table[n];
   1388		rcu_assign_pointer(rdi->qp_dev->qp_table[n], qp);
   1389		trace_rvt_qpinsert(qp, n);
   1390	}
   1391
   1392	spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags);
   1393}
   1394
   1395/**
   1396 * rvt_modify_qp - modify the attributes of a queue pair
    1397 * @ibqp: the queue pair whose attributes we're modifying
   1398 * @attr: the new attributes
   1399 * @attr_mask: the mask of attributes to modify
   1400 * @udata: user data for libibverbs.so
   1401 *
   1402 * Return: 0 on success, otherwise returns an errno.
   1403 */
   1404int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
   1405		  int attr_mask, struct ib_udata *udata)
   1406{
   1407	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
   1408	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
   1409	enum ib_qp_state cur_state, new_state;
   1410	struct ib_event ev;
   1411	int lastwqe = 0;
   1412	int mig = 0;
   1413	int pmtu = 0; /* for gcc warning only */
   1414	int opa_ah;
   1415
   1416	if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS)
   1417		return -EOPNOTSUPP;
   1418
   1419	spin_lock_irq(&qp->r_lock);
   1420	spin_lock(&qp->s_hlock);
   1421	spin_lock(&qp->s_lock);
   1422
   1423	cur_state = attr_mask & IB_QP_CUR_STATE ?
   1424		attr->cur_qp_state : qp->state;
   1425	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
   1426	opa_ah = rdma_cap_opa_ah(ibqp->device, qp->port_num);
   1427
   1428	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
   1429				attr_mask))
   1430		goto inval;
   1431
   1432	if (rdi->driver_f.check_modify_qp &&
   1433	    rdi->driver_f.check_modify_qp(qp, attr, attr_mask, udata))
   1434		goto inval;
   1435
   1436	if (attr_mask & IB_QP_AV) {
   1437		if (opa_ah) {
   1438			if (rdma_ah_get_dlid(&attr->ah_attr) >=
   1439				opa_get_mcast_base(OPA_MCAST_NR))
   1440				goto inval;
   1441		} else {
   1442			if (rdma_ah_get_dlid(&attr->ah_attr) >=
   1443				be16_to_cpu(IB_MULTICAST_LID_BASE))
   1444				goto inval;
   1445		}
   1446
   1447		if (rvt_check_ah(qp->ibqp.device, &attr->ah_attr))
   1448			goto inval;
   1449	}
   1450
   1451	if (attr_mask & IB_QP_ALT_PATH) {
   1452		if (opa_ah) {
   1453			if (rdma_ah_get_dlid(&attr->alt_ah_attr) >=
   1454				opa_get_mcast_base(OPA_MCAST_NR))
   1455				goto inval;
   1456		} else {
   1457			if (rdma_ah_get_dlid(&attr->alt_ah_attr) >=
   1458				be16_to_cpu(IB_MULTICAST_LID_BASE))
   1459				goto inval;
   1460		}
   1461
   1462		if (rvt_check_ah(qp->ibqp.device, &attr->alt_ah_attr))
   1463			goto inval;
   1464		if (attr->alt_pkey_index >= rvt_get_npkeys(rdi))
   1465			goto inval;
   1466	}
   1467
   1468	if (attr_mask & IB_QP_PKEY_INDEX)
   1469		if (attr->pkey_index >= rvt_get_npkeys(rdi))
   1470			goto inval;
   1471
   1472	if (attr_mask & IB_QP_MIN_RNR_TIMER)
   1473		if (attr->min_rnr_timer > 31)
   1474			goto inval;
   1475
   1476	if (attr_mask & IB_QP_PORT)
   1477		if (qp->ibqp.qp_type == IB_QPT_SMI ||
   1478		    qp->ibqp.qp_type == IB_QPT_GSI ||
   1479		    attr->port_num == 0 ||
   1480		    attr->port_num > ibqp->device->phys_port_cnt)
   1481			goto inval;
   1482
   1483	if (attr_mask & IB_QP_DEST_QPN)
   1484		if (attr->dest_qp_num > RVT_QPN_MASK)
   1485			goto inval;
   1486
   1487	if (attr_mask & IB_QP_RETRY_CNT)
   1488		if (attr->retry_cnt > 7)
   1489			goto inval;
   1490
   1491	if (attr_mask & IB_QP_RNR_RETRY)
   1492		if (attr->rnr_retry > 7)
   1493			goto inval;
   1494
   1495	/*
   1496	 * Don't allow invalid path_mtu values.  OK to set greater
   1497	 * than the active mtu (or even the max_cap, if we have tuned
    1498	 * that to a small mtu).  We'll set qp->path_mtu
   1499	 * to the lesser of requested attribute mtu and active,
   1500	 * for packetizing messages.
   1501	 * Note that the QP port has to be set in INIT and MTU in RTR.
   1502	 */
   1503	if (attr_mask & IB_QP_PATH_MTU) {
   1504		pmtu = rdi->driver_f.get_pmtu_from_attr(rdi, qp, attr);
   1505		if (pmtu < 0)
   1506			goto inval;
   1507	}
   1508
   1509	if (attr_mask & IB_QP_PATH_MIG_STATE) {
   1510		if (attr->path_mig_state == IB_MIG_REARM) {
   1511			if (qp->s_mig_state == IB_MIG_ARMED)
   1512				goto inval;
   1513			if (new_state != IB_QPS_RTS)
   1514				goto inval;
   1515		} else if (attr->path_mig_state == IB_MIG_MIGRATED) {
   1516			if (qp->s_mig_state == IB_MIG_REARM)
   1517				goto inval;
   1518			if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD)
   1519				goto inval;
   1520			if (qp->s_mig_state == IB_MIG_ARMED)
   1521				mig = 1;
   1522		} else {
   1523			goto inval;
   1524		}
   1525	}
   1526
   1527	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
   1528		if (attr->max_dest_rd_atomic > rdi->dparms.max_rdma_atomic)
   1529			goto inval;
   1530
   1531	switch (new_state) {
   1532	case IB_QPS_RESET:
   1533		if (qp->state != IB_QPS_RESET)
   1534			_rvt_reset_qp(rdi, qp, ibqp->qp_type);
   1535		break;
   1536
   1537	case IB_QPS_RTR:
   1538		/* Allow event to re-trigger if QP set to RTR more than once */
   1539		qp->r_flags &= ~RVT_R_COMM_EST;
   1540		qp->state = new_state;
   1541		break;
   1542
   1543	case IB_QPS_SQD:
   1544		qp->s_draining = qp->s_last != qp->s_cur;
   1545		qp->state = new_state;
   1546		break;
   1547
   1548	case IB_QPS_SQE:
   1549		if (qp->ibqp.qp_type == IB_QPT_RC)
   1550			goto inval;
   1551		qp->state = new_state;
   1552		break;
   1553
   1554	case IB_QPS_ERR:
   1555		lastwqe = rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
   1556		break;
   1557
   1558	default:
   1559		qp->state = new_state;
   1560		break;
   1561	}
   1562
   1563	if (attr_mask & IB_QP_PKEY_INDEX)
   1564		qp->s_pkey_index = attr->pkey_index;
   1565
   1566	if (attr_mask & IB_QP_PORT)
   1567		qp->port_num = attr->port_num;
   1568
   1569	if (attr_mask & IB_QP_DEST_QPN)
   1570		qp->remote_qpn = attr->dest_qp_num;
   1571
   1572	if (attr_mask & IB_QP_SQ_PSN) {
   1573		qp->s_next_psn = attr->sq_psn & rdi->dparms.psn_modify_mask;
   1574		qp->s_psn = qp->s_next_psn;
   1575		qp->s_sending_psn = qp->s_next_psn;
   1576		qp->s_last_psn = qp->s_next_psn - 1;
   1577		qp->s_sending_hpsn = qp->s_last_psn;
   1578	}
   1579
   1580	if (attr_mask & IB_QP_RQ_PSN)
   1581		qp->r_psn = attr->rq_psn & rdi->dparms.psn_modify_mask;
   1582
   1583	if (attr_mask & IB_QP_ACCESS_FLAGS)
   1584		qp->qp_access_flags = attr->qp_access_flags;
   1585
   1586	if (attr_mask & IB_QP_AV) {
   1587		rdma_replace_ah_attr(&qp->remote_ah_attr, &attr->ah_attr);
   1588		qp->s_srate = rdma_ah_get_static_rate(&attr->ah_attr);
   1589		qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
   1590	}
   1591
   1592	if (attr_mask & IB_QP_ALT_PATH) {
   1593		rdma_replace_ah_attr(&qp->alt_ah_attr, &attr->alt_ah_attr);
   1594		qp->s_alt_pkey_index = attr->alt_pkey_index;
   1595	}
   1596
   1597	if (attr_mask & IB_QP_PATH_MIG_STATE) {
   1598		qp->s_mig_state = attr->path_mig_state;
   1599		if (mig) {
   1600			qp->remote_ah_attr = qp->alt_ah_attr;
   1601			qp->port_num = rdma_ah_get_port_num(&qp->alt_ah_attr);
   1602			qp->s_pkey_index = qp->s_alt_pkey_index;
   1603		}
   1604	}
   1605
   1606	if (attr_mask & IB_QP_PATH_MTU) {
   1607		qp->pmtu = rdi->driver_f.mtu_from_qp(rdi, qp, pmtu);
   1608		qp->log_pmtu = ilog2(qp->pmtu);
   1609	}
   1610
   1611	if (attr_mask & IB_QP_RETRY_CNT) {
   1612		qp->s_retry_cnt = attr->retry_cnt;
   1613		qp->s_retry = attr->retry_cnt;
   1614	}
   1615
   1616	if (attr_mask & IB_QP_RNR_RETRY) {
   1617		qp->s_rnr_retry_cnt = attr->rnr_retry;
   1618		qp->s_rnr_retry = attr->rnr_retry;
   1619	}
   1620
   1621	if (attr_mask & IB_QP_MIN_RNR_TIMER)
   1622		qp->r_min_rnr_timer = attr->min_rnr_timer;
   1623
   1624	if (attr_mask & IB_QP_TIMEOUT) {
   1625		qp->timeout = attr->timeout;
   1626		qp->timeout_jiffies = rvt_timeout_to_jiffies(qp->timeout);
   1627	}
   1628
   1629	if (attr_mask & IB_QP_QKEY)
   1630		qp->qkey = attr->qkey;
   1631
   1632	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
   1633		qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
   1634
   1635	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
   1636		qp->s_max_rd_atomic = attr->max_rd_atomic;
   1637
   1638	if (rdi->driver_f.modify_qp)
   1639		rdi->driver_f.modify_qp(qp, attr, attr_mask, udata);
   1640
   1641	spin_unlock(&qp->s_lock);
   1642	spin_unlock(&qp->s_hlock);
   1643	spin_unlock_irq(&qp->r_lock);
   1644
   1645	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
   1646		rvt_insert_qp(rdi, qp);
   1647
   1648	if (lastwqe) {
   1649		ev.device = qp->ibqp.device;
   1650		ev.element.qp = &qp->ibqp;
   1651		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
   1652		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
   1653	}
   1654	if (mig) {
   1655		ev.device = qp->ibqp.device;
   1656		ev.element.qp = &qp->ibqp;
   1657		ev.event = IB_EVENT_PATH_MIG;
   1658		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
   1659	}
   1660	return 0;
   1661
   1662inval:
   1663	spin_unlock(&qp->s_lock);
   1664	spin_unlock(&qp->s_hlock);
   1665	spin_unlock_irq(&qp->r_lock);
   1666	return -EINVAL;
   1667}
   1668
   1669/**
   1670 * rvt_destroy_qp - destroy a queue pair
   1671 * @ibqp: the queue pair to destroy
   1672 * @udata: unused by the driver
   1673 *
   1674 * Note that this can be called while the QP is actively sending or
   1675 * receiving!
   1676 *
   1677 * Return: 0 on success.
   1678 */
   1679int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
   1680{
   1681	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
   1682	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
   1683
   1684	rvt_reset_qp(rdi, qp, ibqp->qp_type);
   1685
   1686	wait_event(qp->wait, !atomic_read(&qp->refcount));
   1687	/* qpn is now available for use again */
   1688	rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
   1689
   1690	spin_lock(&rdi->n_qps_lock);
   1691	rdi->n_qps_allocated--;
   1692	if (qp->ibqp.qp_type == IB_QPT_RC) {
   1693		rdi->n_rc_qps--;
   1694		rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL;
   1695	}
   1696	spin_unlock(&rdi->n_qps_lock);
   1697
   1698	if (qp->ip)
   1699		kref_put(&qp->ip->ref, rvt_release_mmap_info);
   1700	kvfree(qp->r_rq.kwq);
   1701	rdi->driver_f.qp_priv_free(rdi, qp);
   1702	kfree(qp->s_ack_queue);
   1703	kfree(qp->r_sg_list);
   1704	rdma_destroy_ah_attr(&qp->remote_ah_attr);
   1705	rdma_destroy_ah_attr(&qp->alt_ah_attr);
   1706	free_ud_wq_attr(qp);
   1707	vfree(qp->s_wq);
   1708	return 0;
   1709}
   1710
   1711/**
    1712 * rvt_query_qp - query an ibqp
   1713 * @ibqp: IB qp to query
   1714 * @attr: attr struct to fill in
   1715 * @attr_mask: attr mask ignored
   1716 * @init_attr: struct to fill in
   1717 *
   1718 * Return: always 0
   1719 */
   1720int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
   1721		 int attr_mask, struct ib_qp_init_attr *init_attr)
   1722{
   1723	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
   1724	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
   1725
   1726	attr->qp_state = qp->state;
   1727	attr->cur_qp_state = attr->qp_state;
   1728	attr->path_mtu = rdi->driver_f.mtu_to_path_mtu(qp->pmtu);
   1729	attr->path_mig_state = qp->s_mig_state;
   1730	attr->qkey = qp->qkey;
   1731	attr->rq_psn = qp->r_psn & rdi->dparms.psn_mask;
   1732	attr->sq_psn = qp->s_next_psn & rdi->dparms.psn_mask;
   1733	attr->dest_qp_num = qp->remote_qpn;
   1734	attr->qp_access_flags = qp->qp_access_flags;
   1735	attr->cap.max_send_wr = qp->s_size - 1 -
   1736		rdi->dparms.reserved_operations;
   1737	attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
   1738	attr->cap.max_send_sge = qp->s_max_sge;
   1739	attr->cap.max_recv_sge = qp->r_rq.max_sge;
   1740	attr->cap.max_inline_data = 0;
   1741	attr->ah_attr = qp->remote_ah_attr;
   1742	attr->alt_ah_attr = qp->alt_ah_attr;
   1743	attr->pkey_index = qp->s_pkey_index;
   1744	attr->alt_pkey_index = qp->s_alt_pkey_index;
   1745	attr->en_sqd_async_notify = 0;
   1746	attr->sq_draining = qp->s_draining;
   1747	attr->max_rd_atomic = qp->s_max_rd_atomic;
   1748	attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
   1749	attr->min_rnr_timer = qp->r_min_rnr_timer;
   1750	attr->port_num = qp->port_num;
   1751	attr->timeout = qp->timeout;
   1752	attr->retry_cnt = qp->s_retry_cnt;
   1753	attr->rnr_retry = qp->s_rnr_retry_cnt;
   1754	attr->alt_port_num =
   1755		rdma_ah_get_port_num(&qp->alt_ah_attr);
   1756	attr->alt_timeout = qp->alt_timeout;
   1757
   1758	init_attr->event_handler = qp->ibqp.event_handler;
   1759	init_attr->qp_context = qp->ibqp.qp_context;
   1760	init_attr->send_cq = qp->ibqp.send_cq;
   1761	init_attr->recv_cq = qp->ibqp.recv_cq;
   1762	init_attr->srq = qp->ibqp.srq;
   1763	init_attr->cap = attr->cap;
   1764	if (qp->s_flags & RVT_S_SIGNAL_REQ_WR)
   1765		init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
   1766	else
   1767		init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
   1768	init_attr->qp_type = qp->ibqp.qp_type;
   1769	init_attr->port_num = qp->port_num;
   1770	return 0;
   1771}
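
/*
 * Illustrative sketch (not part of rdmavt): how a kernel consumer reaches
 * the routine above through the core ib_query_qp() verb.  The wrapper name
 * below is hypothetical.
 */
#if 0	/* illustrative only, not compiled */
static int example_read_qp_state(struct ib_qp *qp, enum ib_qp_state *state)
{
	struct ib_qp_attr attr;
	struct ib_qp_init_attr init_attr;
	int ret;

	/* rdmavt-based providers dispatch this to rvt_query_qp() */
	ret = ib_query_qp(qp, &attr, IB_QP_STATE, &init_attr);
	if (ret)
		return ret;
	*state = attr.qp_state;
	return 0;
}
#endif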
   1772
   1773/**
   1774 * rvt_post_recv - post a receive on a QP
   1775 * @ibqp: the QP to post the receive on
   1776 * @wr: the WR to post
   1777 * @bad_wr: the first bad WR is put here
   1778 *
   1779 * This may be called from interrupt context.
   1780 *
   1781 * Return: 0 on success otherwise errno
   1782 */
   1783int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
   1784		  const struct ib_recv_wr **bad_wr)
   1785{
   1786	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
   1787	struct rvt_krwq *wq = qp->r_rq.kwq;
   1788	unsigned long flags;
   1789	int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) &&
   1790				!qp->ibqp.srq;
   1791
   1792	/* Check that state is OK to post receive. */
   1793	if (!(ib_rvt_state_ops[qp->state] & RVT_POST_RECV_OK) || !wq) {
   1794		*bad_wr = wr;
   1795		return -EINVAL;
   1796	}
   1797
   1798	for (; wr; wr = wr->next) {
   1799		struct rvt_rwqe *wqe;
   1800		u32 next;
   1801		int i;
   1802
   1803		if ((unsigned)wr->num_sge > qp->r_rq.max_sge) {
   1804			*bad_wr = wr;
   1805			return -EINVAL;
   1806		}
   1807
   1808		spin_lock_irqsave(&qp->r_rq.kwq->p_lock, flags);
   1809		next = wq->head + 1;
   1810		if (next >= qp->r_rq.size)
   1811			next = 0;
   1812		if (next == READ_ONCE(wq->tail)) {
   1813			spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
   1814			*bad_wr = wr;
   1815			return -ENOMEM;
   1816		}
   1817		if (unlikely(qp_err_flush)) {
   1818			struct ib_wc wc;
   1819
   1820			memset(&wc, 0, sizeof(wc));
   1821			wc.qp = &qp->ibqp;
   1822			wc.opcode = IB_WC_RECV;
   1823			wc.wr_id = wr->wr_id;
   1824			wc.status = IB_WC_WR_FLUSH_ERR;
   1825			rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
   1826		} else {
   1827			wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head);
   1828			wqe->wr_id = wr->wr_id;
   1829			wqe->num_sge = wr->num_sge;
   1830			for (i = 0; i < wr->num_sge; i++) {
   1831				wqe->sg_list[i].addr = wr->sg_list[i].addr;
   1832				wqe->sg_list[i].length = wr->sg_list[i].length;
   1833				wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
   1834			}
   1835			/*
   1836			 * Make sure queue entry is written
   1837			 * before the head index.
   1838			 */
   1839			smp_store_release(&wq->head, next);
   1840		}
   1841		spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
   1842	}
   1843	return 0;
   1844}
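
/*
 * Illustrative sketch (not part of rdmavt): a consumer posting a single-SGE
 * receive through the core ib_post_recv() verb, which rdmavt-based providers
 * route to rvt_post_recv() above.  The wrapper name and buffer parameters
 * are hypothetical.
 */
#if 0	/* illustrative only, not compiled */
static int example_post_one_recv(struct ib_qp *qp, u64 dma_addr, u32 len,
				 u32 lkey, u64 wr_id)
{
	struct ib_sge sge = {
		.addr = dma_addr,	/* DMA address of the receive buffer */
		.length = len,
		.lkey = lkey,
	};
	struct ib_recv_wr wr = {
		.wr_id = wr_id,		/* echoed back in the completion */
		.sg_list = &sge,
		.num_sge = 1,
	};
	const struct ib_recv_wr *bad_wr;

	return ib_post_recv(qp, &wr, &bad_wr);
}
#endif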
   1845
   1846/**
   1847 * rvt_qp_valid_operation - validate post send wr request
   1848 * @qp: the qp
   1849 * @post_parms: the post send table for the driver
   1850 * @wr: the work request
   1851 *
   1852 * The routine validates the operation based on the
   1853 * validation table and returns the length of the operation,
   1854 * which can extend beyond the ib_send_wr.  Operation
   1855 * dependent flags gate atomic operation validation.
   1856 *
   1857 * There is an exception for UD qps that validates the pd and
   1858 * overrides the length to include the additional UD specific
   1859 * length.
   1860 *
   1861 * Returns a negative error or the length of the work request
   1862 * for building the swqe.
   1863 */
   1864static inline int rvt_qp_valid_operation(
   1865	struct rvt_qp *qp,
   1866	const struct rvt_operation_params *post_parms,
   1867	const struct ib_send_wr *wr)
   1868{
   1869	int len;
   1870
   1871	if (wr->opcode >= RVT_OPERATION_MAX || !post_parms[wr->opcode].length)
   1872		return -EINVAL;
   1873	if (!(post_parms[wr->opcode].qpt_support & BIT(qp->ibqp.qp_type)))
   1874		return -EINVAL;
   1875	if ((post_parms[wr->opcode].flags & RVT_OPERATION_PRIV) &&
   1876	    ibpd_to_rvtpd(qp->ibqp.pd)->user)
   1877		return -EINVAL;
   1878	if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC_SGE &&
   1879	    (wr->num_sge == 0 ||
   1880	     wr->sg_list[0].length < sizeof(u64) ||
   1881	     wr->sg_list[0].addr & (sizeof(u64) - 1)))
   1882		return -EINVAL;
   1883	if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC &&
   1884	    !qp->s_max_rd_atomic)
   1885		return -EINVAL;
   1886	len = post_parms[wr->opcode].length;
   1887	/* UD specific */
   1888	if (qp->ibqp.qp_type != IB_QPT_UC &&
   1889	    qp->ibqp.qp_type != IB_QPT_RC) {
   1890		if (qp->ibqp.pd != ud_wr(wr)->ah->pd)
   1891			return -EINVAL;
   1892		len = sizeof(struct ib_ud_wr);
   1893	}
   1894	return len;
   1895}
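
/*
 * Illustrative sketch (not the hfi1/qib table): the shape of the
 * driver-supplied post_parms[] entries consulted above.  The array name and
 * the chosen values are hypothetical; the fields are the real
 * struct rvt_operation_params members.
 */
#if 0	/* illustrative only, not compiled */
static const struct rvt_operation_params example_post_parms[RVT_OPERATION_MAX] = {
	[IB_WR_SEND] = {
		.length = sizeof(struct ib_send_wr),
		.qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
	},
	[IB_WR_RDMA_WRITE] = {
		.length = sizeof(struct ib_rdma_wr),
		.qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
	},
	[IB_WR_ATOMIC_CMP_AND_SWP] = {
		.length = sizeof(struct ib_atomic_wr),
		.qpt_support = BIT(IB_QPT_RC),
		.flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE,
	},
};
#endif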
   1896
   1897/**
   1898 * rvt_qp_is_avail - determine queue capacity
   1899 * @qp: the qp
   1900 * @rdi: the rdmavt device
   1901 * @reserved_op: true if this is a reserved operation
   1902 *
   1903 * This assumes the s_hlock is held but the s_last
   1904 * qp variable is uncontrolled.
   1905 *
   1906 * For non-reserved operations, the qp->s_avail
   1907 * may be changed.
   1908 *
   1909 * The return value is zero or -ENOMEM.
   1910 */
   1911static inline int rvt_qp_is_avail(
   1912	struct rvt_qp *qp,
   1913	struct rvt_dev_info *rdi,
   1914	bool reserved_op)
   1915{
   1916	u32 slast;
   1917	u32 avail;
   1918	u32 reserved_used;
   1919
   1920	/* see rvt_qp_wqe_unreserve() */
   1921	smp_mb__before_atomic();
   1922	if (unlikely(reserved_op)) {
   1923		/* see rvt_qp_wqe_unreserve() */
   1924		reserved_used = atomic_read(&qp->s_reserved_used);
   1925		if (reserved_used >= rdi->dparms.reserved_operations)
   1926			return -ENOMEM;
   1927		return 0;
   1928	}
   1929	/* non-reserved operations */
   1930	if (likely(qp->s_avail))
   1931		return 0;
   1932	/* See rvt_qp_complete_swqe() */
   1933	slast = smp_load_acquire(&qp->s_last);
   1934	if (qp->s_head >= slast)
   1935		avail = qp->s_size - (qp->s_head - slast);
   1936	else
   1937		avail = slast - qp->s_head;
   1938
   1939	reserved_used = atomic_read(&qp->s_reserved_used);
   1940	avail = avail - 1 -
   1941		(rdi->dparms.reserved_operations - reserved_used);
   1942	/* ensure we don't assign a negative s_avail */
   1943	if ((s32)avail <= 0)
   1944		return -ENOMEM;
   1945	qp->s_avail = avail;
   1946	if (WARN_ON(qp->s_avail >
   1947		    (qp->s_size - 1 - rdi->dparms.reserved_operations)))
   1948		rvt_pr_err(rdi,
   1949			   "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u",
   1950			   qp->ibqp.qp_num, qp->s_size, qp->s_avail,
   1951			   qp->s_head, qp->s_tail, qp->s_cur,
   1952			   qp->s_acked, qp->s_last);
   1953	return 0;
   1954}
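
/*
 * Worked example for the arithmetic above (illustrative values): with
 * s_size = 32, reserved_operations = 2, s_head = 10, s_last = 5 and one
 * reserved slot in use, the ring has 32 - (10 - 5) = 27 free entries;
 * subtracting the single entry kept to distinguish full from empty and the
 * 2 - 1 = 1 still-unclaimed reserved slots leaves qp->s_avail = 25 entries
 * available for normal posts.
 */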
   1955
   1956/**
   1957 * rvt_post_one_wr - post one RC, UC, or UD send work request
   1958 * @qp: the QP to post on
   1959 * @wr: the work request to send
   1960 * @call_send: kick the send engine into gear
   1961 */
   1962static int rvt_post_one_wr(struct rvt_qp *qp,
   1963			   const struct ib_send_wr *wr,
   1964			   bool *call_send)
   1965{
   1966	struct rvt_swqe *wqe;
   1967	u32 next;
   1968	int i;
   1969	int j;
   1970	int acc;
   1971	struct rvt_lkey_table *rkt;
   1972	struct rvt_pd *pd;
   1973	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
   1974	u8 log_pmtu;
   1975	int ret;
   1976	size_t cplen;
   1977	bool reserved_op;
   1978	int local_ops_delayed = 0;
   1979
   1980	BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE));
   1981
   1982	/* IB spec says that num_sge == 0 is OK. */
   1983	if (unlikely(wr->num_sge > qp->s_max_sge))
   1984		return -EINVAL;
   1985
   1986	ret = rvt_qp_valid_operation(qp, rdi->post_parms, wr);
   1987	if (ret < 0)
   1988		return ret;
   1989	cplen = ret;
   1990
   1991	/*
   1992	 * Local operations include fast register and local invalidate.
   1993	 * Fast register needs to be processed immediately because the
   1994	 * registered lkey may be used by following work requests and the
   1995	 * lkey needs to be valid at the time those requests are posted.
   1996	 * Local invalidate can be processed immediately if fencing is
   1997	 * not required and no previous local invalidate ops are pending.
   1998	 * Signaled local operations that have been processed immediately
   1999	 * need to have requests with "completion only" flags set posted
   2000	 * to the send queue in order to generate completions.
   2001	 */
   2002	if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) {
   2003		switch (wr->opcode) {
   2004		case IB_WR_REG_MR:
   2005			ret = rvt_fast_reg_mr(qp,
   2006					      reg_wr(wr)->mr,
   2007					      reg_wr(wr)->key,
   2008					      reg_wr(wr)->access);
   2009			if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
   2010				return ret;
   2011			break;
   2012		case IB_WR_LOCAL_INV:
   2013			if ((wr->send_flags & IB_SEND_FENCE) ||
   2014			    atomic_read(&qp->local_ops_pending)) {
   2015				local_ops_delayed = 1;
   2016			} else {
   2017				ret = rvt_invalidate_rkey(
   2018					qp, wr->ex.invalidate_rkey);
   2019				if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
   2020					return ret;
   2021			}
   2022			break;
   2023		default:
   2024			return -EINVAL;
   2025		}
   2026	}
   2027
   2028	reserved_op = rdi->post_parms[wr->opcode].flags &
   2029			RVT_OPERATION_USE_RESERVE;
   2030	/* check for avail */
   2031	ret = rvt_qp_is_avail(qp, rdi, reserved_op);
   2032	if (ret)
   2033		return ret;
   2034	next = qp->s_head + 1;
   2035	if (next >= qp->s_size)
   2036		next = 0;
   2037
   2038	rkt = &rdi->lkey_table;
   2039	pd = ibpd_to_rvtpd(qp->ibqp.pd);
   2040	wqe = rvt_get_swqe_ptr(qp, qp->s_head);
   2041
   2042	/* cplen has length from above */
   2043	memcpy(&wqe->wr, wr, cplen);
   2044
   2045	wqe->length = 0;
   2046	j = 0;
   2047	if (wr->num_sge) {
   2048		struct rvt_sge *last_sge = NULL;
   2049
   2050		acc = wr->opcode >= IB_WR_RDMA_READ ?
   2051			IB_ACCESS_LOCAL_WRITE : 0;
   2052		for (i = 0; i < wr->num_sge; i++) {
   2053			u32 length = wr->sg_list[i].length;
   2054
   2055			if (length == 0)
   2056				continue;
   2057			ret = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j], last_sge,
   2058					  &wr->sg_list[i], acc);
   2059			if (unlikely(ret < 0))
   2060				goto bail_inval_free;
   2061			wqe->length += length;
   2062			if (ret)
   2063				last_sge = &wqe->sg_list[j];
   2064			j += ret;
   2065		}
   2066		wqe->wr.num_sge = j;
   2067	}
   2068
   2069	/*
   2070	 * Calculate and set SWQE PSN values prior to handing it off
   2071	 * to the driver's check routine. This gives the driver the
   2072	 * opportunity to adjust PSN values based on internal checks.
   2073	 */
   2074	log_pmtu = qp->log_pmtu;
   2075	if (qp->allowed_ops == IB_OPCODE_UD) {
   2076		struct rvt_ah *ah = rvt_get_swqe_ah(wqe);
   2077
   2078		log_pmtu = ah->log_pmtu;
   2079		rdma_copy_ah_attr(wqe->ud_wr.attr, &ah->attr);
   2080	}
   2081
   2082	if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) {
   2083		if (local_ops_delayed)
   2084			atomic_inc(&qp->local_ops_pending);
   2085		else
   2086			wqe->wr.send_flags |= RVT_SEND_COMPLETION_ONLY;
   2087		wqe->ssn = 0;
   2088		wqe->psn = 0;
   2089		wqe->lpsn = 0;
   2090	} else {
   2091		wqe->ssn = qp->s_ssn++;
   2092		wqe->psn = qp->s_next_psn;
   2093		wqe->lpsn = wqe->psn +
   2094				(wqe->length ?
   2095					((wqe->length - 1) >> log_pmtu) :
   2096					0);
   2097	}
   2098
   2099	/* general part of wqe valid - allow for driver checks */
   2100	if (rdi->driver_f.setup_wqe) {
   2101		ret = rdi->driver_f.setup_wqe(qp, wqe, call_send);
   2102		if (ret < 0)
   2103			goto bail_inval_free_ref;
   2104	}
   2105
   2106	if (!(rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL))
   2107		qp->s_next_psn = wqe->lpsn + 1;
   2108
   2109	if (unlikely(reserved_op)) {
   2110		wqe->wr.send_flags |= RVT_SEND_RESERVE_USED;
   2111		rvt_qp_wqe_reserve(qp, wqe);
   2112	} else {
   2113		wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED;
   2114		qp->s_avail--;
   2115	}
   2116	trace_rvt_post_one_wr(qp, wqe, wr->num_sge);
   2117	smp_wmb(); /* see request builders */
   2118	qp->s_head = next;
   2119
   2120	return 0;
   2121
   2122bail_inval_free_ref:
   2123	if (qp->allowed_ops == IB_OPCODE_UD)
   2124		rdma_destroy_ah_attr(wqe->ud_wr.attr);
   2125bail_inval_free:
   2126	/* release mr holds */
   2127	while (j) {
   2128		struct rvt_sge *sge = &wqe->sg_list[--j];
   2129
   2130		rvt_put_mr(sge->mr);
   2131	}
   2132	return ret;
   2133}
   2134
   2135/**
   2136 * rvt_post_send - post a send on a QP
   2137 * @ibqp: the QP to post the send on
   2138 * @wr: the list of work requests to post
   2139 * @bad_wr: the first bad WR is put here
   2140 *
   2141 * This may be called from interrupt context.
   2142 *
   2143 * Return: 0 on success else errno
   2144 */
   2145int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
   2146		  const struct ib_send_wr **bad_wr)
   2147{
   2148	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
   2149	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
   2150	unsigned long flags = 0;
   2151	bool call_send;
   2152	unsigned nreq = 0;
   2153	int err = 0;
   2154
   2155	spin_lock_irqsave(&qp->s_hlock, flags);
   2156
   2157	/*
   2158	 * Ensure QP state is such that we can send. If not bail out early,
   2159	 * there is no need to do this every time we post a send.
   2160	 */
   2161	if (unlikely(!(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK))) {
   2162		spin_unlock_irqrestore(&qp->s_hlock, flags);
   2163		return -EINVAL;
   2164	}
   2165
   2166	/*
   2167	 * If the send queue is empty, and we only have a single WR then just go
   2168	 * ahead and kick the send engine into gear. Otherwise we will always
   2169	 * just schedule the send to happen later.
   2170	 */
   2171	call_send = qp->s_head == READ_ONCE(qp->s_last) && !wr->next;
   2172
   2173	for (; wr; wr = wr->next) {
   2174		err = rvt_post_one_wr(qp, wr, &call_send);
   2175		if (unlikely(err)) {
   2176			*bad_wr = wr;
   2177			goto bail;
   2178		}
   2179		nreq++;
   2180	}
   2181bail:
   2182	spin_unlock_irqrestore(&qp->s_hlock, flags);
   2183	if (nreq) {
   2184		/*
   2185		 * Only call do_send if there is exactly one packet, and the
   2186		 * driver said it was ok.
   2187		 */
   2188		if (nreq == 1 && call_send)
   2189			rdi->driver_f.do_send(qp);
   2190		else
   2191			rdi->driver_f.schedule_send_no_lock(qp);
   2192	}
   2193	return err;
   2194}
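
/*
 * Illustrative sketch (not part of rdmavt): a consumer posting one signaled
 * SEND through the core ib_post_send() verb, which rdmavt-based providers
 * route to rvt_post_send() above.  The wrapper name and buffer parameters
 * are hypothetical.
 */
#if 0	/* illustrative only, not compiled */
static int example_post_one_send(struct ib_qp *qp, u64 dma_addr, u32 len,
				 u32 lkey, u64 wr_id)
{
	struct ib_sge sge = {
		.addr = dma_addr,
		.length = len,
		.lkey = lkey,
	};
	struct ib_send_wr wr = {
		.wr_id = wr_id,
		.sg_list = &sge,
		.num_sge = 1,
		.opcode = IB_WR_SEND,
		.send_flags = IB_SEND_SIGNALED,	/* request a completion */
	};
	const struct ib_send_wr *bad_wr;

	return ib_post_send(qp, &wr, &bad_wr);
}
#endif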
   2195
   2196/**
   2197 * rvt_post_srq_recv - post a receive on a shared receive queue
   2198 * @ibsrq: the SRQ to post the receive on
   2199 * @wr: the list of work requests to post
   2200 * @bad_wr: A pointer to the first WR to cause a problem is put here
   2201 *
   2202 * This may be called from interrupt context.
   2203 *
   2204 * Return: 0 on success else errno
   2205 */
   2206int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
   2207		      const struct ib_recv_wr **bad_wr)
   2208{
   2209	struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
   2210	struct rvt_krwq *wq;
   2211	unsigned long flags;
   2212
   2213	for (; wr; wr = wr->next) {
   2214		struct rvt_rwqe *wqe;
   2215		u32 next;
   2216		int i;
   2217
   2218		if ((unsigned)wr->num_sge > srq->rq.max_sge) {
   2219			*bad_wr = wr;
   2220			return -EINVAL;
   2221		}
   2222
   2223		spin_lock_irqsave(&srq->rq.kwq->p_lock, flags);
   2224		wq = srq->rq.kwq;
   2225		next = wq->head + 1;
   2226		if (next >= srq->rq.size)
   2227			next = 0;
   2228		if (next == READ_ONCE(wq->tail)) {
   2229			spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
   2230			*bad_wr = wr;
   2231			return -ENOMEM;
   2232		}
   2233
   2234		wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head);
   2235		wqe->wr_id = wr->wr_id;
   2236		wqe->num_sge = wr->num_sge;
   2237		for (i = 0; i < wr->num_sge; i++) {
   2238			wqe->sg_list[i].addr = wr->sg_list[i].addr;
   2239			wqe->sg_list[i].length = wr->sg_list[i].length;
   2240			wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
   2241		}
   2242		/* Make sure queue entry is written before the head index. */
   2243		smp_store_release(&wq->head, next);
   2244		spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
   2245	}
   2246	return 0;
   2247}
   2248
   2249/*
   2250 * rvt used the internal kernel struct as part of its ABI; for now, make sure
   2251 * the kernel struct does not change layout. FIXME: rvt should never cast the
   2252 * user struct to a kernel struct.
   2253 */
   2254static struct ib_sge *rvt_cast_sge(struct rvt_wqe_sge *sge)
   2255{
   2256	BUILD_BUG_ON(offsetof(struct ib_sge, addr) !=
   2257		     offsetof(struct rvt_wqe_sge, addr));
   2258	BUILD_BUG_ON(offsetof(struct ib_sge, length) !=
   2259		     offsetof(struct rvt_wqe_sge, length));
   2260	BUILD_BUG_ON(offsetof(struct ib_sge, lkey) !=
   2261		     offsetof(struct rvt_wqe_sge, lkey));
   2262	return (struct ib_sge *)sge;
   2263}
   2264
   2265/*
   2266 * Validate a RWQE and fill in the SGE state.
   2267 * Return 1 if OK.
   2268 */
   2269static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe)
   2270{
   2271	int i, j, ret;
   2272	struct ib_wc wc;
   2273	struct rvt_lkey_table *rkt;
   2274	struct rvt_pd *pd;
   2275	struct rvt_sge_state *ss;
   2276	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
   2277
   2278	rkt = &rdi->lkey_table;
   2279	pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd);
   2280	ss = &qp->r_sge;
   2281	ss->sg_list = qp->r_sg_list;
   2282	qp->r_len = 0;
   2283	for (i = j = 0; i < wqe->num_sge; i++) {
   2284		if (wqe->sg_list[i].length == 0)
   2285			continue;
   2286		/* Check LKEY */
   2287		ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
   2288				  NULL, rvt_cast_sge(&wqe->sg_list[i]),
   2289				  IB_ACCESS_LOCAL_WRITE);
   2290		if (unlikely(ret <= 0))
   2291			goto bad_lkey;
   2292		qp->r_len += wqe->sg_list[i].length;
   2293		j++;
   2294	}
   2295	ss->num_sge = j;
   2296	ss->total_len = qp->r_len;
   2297	return 1;
   2298
   2299bad_lkey:
   2300	while (j) {
   2301		struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge;
   2302
   2303		rvt_put_mr(sge->mr);
   2304	}
   2305	ss->num_sge = 0;
   2306	memset(&wc, 0, sizeof(wc));
   2307	wc.wr_id = wqe->wr_id;
   2308	wc.status = IB_WC_LOC_PROT_ERR;
   2309	wc.opcode = IB_WC_RECV;
   2310	wc.qp = &qp->ibqp;
   2311	/* Signal solicited completion event. */
   2312	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
   2313	return 0;
   2314}
   2315
   2316/**
   2317 * get_rvt_head - get the head index of the circular buffer
   2318 * @rq: data structure for request queue entry
   2319 * @ip: the mmap info pointer (non-NULL for a user-mapped queue)
   2320 *
   2321 * Return - head index value
   2322 */
   2323static inline u32 get_rvt_head(struct rvt_rq *rq, void *ip)
   2324{
   2325	u32 head;
   2326
   2327	if (ip)
   2328		head = RDMA_READ_UAPI_ATOMIC(rq->wq->head);
   2329	else
   2330		head = rq->kwq->head;
   2331
   2332	return head;
   2333}
   2334
   2335/**
   2336 * rvt_get_rwqe - copy the next RWQE into the QP's RWQE
   2337 * @qp: the QP
   2338 * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
   2339 *
   2340 * Return -1 if there is a local error, 0 if no RWQE is available,
   2341 * otherwise return 1.
   2342 *
   2343 * Can be called from interrupt level.
   2344 */
   2345int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only)
   2346{
   2347	unsigned long flags;
   2348	struct rvt_rq *rq;
   2349	struct rvt_krwq *kwq = NULL;
   2350	struct rvt_rwq *wq;
   2351	struct rvt_srq *srq;
   2352	struct rvt_rwqe *wqe;
   2353	void (*handler)(struct ib_event *, void *);
   2354	u32 tail;
   2355	u32 head;
   2356	int ret;
   2357	void *ip = NULL;
   2358
   2359	if (qp->ibqp.srq) {
   2360		srq = ibsrq_to_rvtsrq(qp->ibqp.srq);
   2361		handler = srq->ibsrq.event_handler;
   2362		rq = &srq->rq;
   2363		ip = srq->ip;
   2364	} else {
   2365		srq = NULL;
   2366		handler = NULL;
   2367		rq = &qp->r_rq;
   2368		ip = qp->ip;
   2369	}
   2370
   2371	spin_lock_irqsave(&rq->kwq->c_lock, flags);
   2372	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
   2373		ret = 0;
   2374		goto unlock;
   2375	}
   2376	kwq = rq->kwq;
   2377	if (ip) {
   2378		wq = rq->wq;
   2379		tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
   2380	} else {
   2381		tail = kwq->tail;
   2382	}
   2383
   2384	/* Validate tail before using it since it is user writable. */
   2385	if (tail >= rq->size)
   2386		tail = 0;
   2387
   2388	if (kwq->count < RVT_RWQ_COUNT_THRESHOLD) {
   2389		head = get_rvt_head(rq, ip);
   2390		kwq->count = rvt_get_rq_count(rq, head, tail);
   2391	}
   2392	if (unlikely(kwq->count == 0)) {
   2393		ret = 0;
   2394		goto unlock;
   2395	}
   2396	/* Make sure entry is read after the count is read. */
   2397	smp_rmb();
   2398	wqe = rvt_get_rwqe_ptr(rq, tail);
   2399	/*
   2400	 * Even though we update the tail index in memory, the verbs
   2401	 * consumer is not supposed to post more entries until a
   2402	 * completion is generated.
   2403	 */
   2404	if (++tail >= rq->size)
   2405		tail = 0;
   2406	if (ip)
   2407		RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
   2408	else
   2409		kwq->tail = tail;
   2410	if (!wr_id_only && !init_sge(qp, wqe)) {
   2411		ret = -1;
   2412		goto unlock;
   2413	}
   2414	qp->r_wr_id = wqe->wr_id;
   2415
   2416	kwq->count--;
   2417	ret = 1;
   2418	set_bit(RVT_R_WRID_VALID, &qp->r_aflags);
   2419	if (handler) {
   2420		/*
   2421		 * Validate head pointer value and compute
   2422		 * the number of remaining WQEs.
   2423		 */
   2424		if (kwq->count < srq->limit) {
   2425			kwq->count =
   2426				rvt_get_rq_count(rq,
   2427						 get_rvt_head(rq, ip), tail);
   2428			if (kwq->count < srq->limit) {
   2429				struct ib_event ev;
   2430
   2431				srq->limit = 0;
   2432				spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
   2433				ev.device = qp->ibqp.device;
   2434				ev.element.srq = qp->ibqp.srq;
   2435				ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
   2436				handler(&ev, srq->ibsrq.srq_context);
   2437				goto bail;
   2438			}
   2439		}
   2440	}
   2441unlock:
   2442	spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
   2443bail:
   2444	return ret;
   2445}
   2446EXPORT_SYMBOL(rvt_get_rwqe);
   2447
   2448/**
   2449 * rvt_comm_est - handle trap with QP established
   2450 * @qp: the QP
   2451 */
   2452void rvt_comm_est(struct rvt_qp *qp)
   2453{
   2454	qp->r_flags |= RVT_R_COMM_EST;
   2455	if (qp->ibqp.event_handler) {
   2456		struct ib_event ev;
   2457
   2458		ev.device = qp->ibqp.device;
   2459		ev.element.qp = &qp->ibqp;
   2460		ev.event = IB_EVENT_COMM_EST;
   2461		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
   2462	}
   2463}
   2464EXPORT_SYMBOL(rvt_comm_est);
   2465
   2466void rvt_rc_error(struct rvt_qp *qp, enum ib_wc_status err)
   2467{
   2468	unsigned long flags;
   2469	int lastwqe;
   2470
   2471	spin_lock_irqsave(&qp->s_lock, flags);
   2472	lastwqe = rvt_error_qp(qp, err);
   2473	spin_unlock_irqrestore(&qp->s_lock, flags);
   2474
   2475	if (lastwqe) {
   2476		struct ib_event ev;
   2477
   2478		ev.device = qp->ibqp.device;
   2479		ev.element.qp = &qp->ibqp;
   2480		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
   2481		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
   2482	}
   2483}
   2484EXPORT_SYMBOL(rvt_rc_error);
   2485
   2486/*
   2487 *  rvt_rnr_tbl_to_usec - convert an index into ib_rvt_rnr_table to usec
   2488 *  @index - the index
   2489 *  return usec from an index into ib_rvt_rnr_table
   2490 */
   2491unsigned long rvt_rnr_tbl_to_usec(u32 index)
   2492{
   2493	return ib_rvt_rnr_table[(index & IB_AETH_CREDIT_MASK)];
   2494}
   2495EXPORT_SYMBOL(rvt_rnr_tbl_to_usec);
   2496
   2497static inline unsigned long rvt_aeth_to_usec(u32 aeth)
   2498{
   2499	return ib_rvt_rnr_table[(aeth >> IB_AETH_CREDIT_SHIFT) &
   2500				  IB_AETH_CREDIT_MASK];
   2501}
   2502
   2503/*
   2504 *  rvt_add_retry_timer_ext - add/start a retry timer
   2505 *  @qp - the QP
   2506 *  @shift - timeout shift to wait for multiple packets
   2507 *  add a retry timer on the QP
   2508 */
   2509void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift)
   2510{
   2511	struct ib_qp *ibqp = &qp->ibqp;
   2512	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
   2513
   2514	lockdep_assert_held(&qp->s_lock);
   2515	qp->s_flags |= RVT_S_TIMER;
   2516	/* 4.096 usec. * (1 << qp->timeout) */
   2517	qp->s_timer.expires = jiffies + rdi->busy_jiffies +
   2518			      (qp->timeout_jiffies << shift);
   2519	add_timer(&qp->s_timer);
   2520}
   2521EXPORT_SYMBOL(rvt_add_retry_timer_ext);
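
/*
 * Worked example (illustrative values), assuming the 4.096 usec * 2^timeout
 * encoding noted in the comment above: a QP with qp->timeout = 14 arms the
 * timer for roughly 4.096 us * 16384 ~= 67 ms (plus rdi->busy_jiffies), and
 * a shift of 1 doubles that wait to cover two packets' worth of responses.
 */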
   2522
   2523/**
   2524 * rvt_add_rnr_timer - add/start an rnr timer on the QP
   2525 * @qp: the QP
   2526 * @aeth: aeth of RNR timeout, simulated aeth for loopback
   2527 */
   2528void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth)
   2529{
   2530	u32 to;
   2531
   2532	lockdep_assert_held(&qp->s_lock);
   2533	qp->s_flags |= RVT_S_WAIT_RNR;
   2534	to = rvt_aeth_to_usec(aeth);
   2535	trace_rvt_rnrnak_add(qp, to);
   2536	hrtimer_start(&qp->s_rnr_timer,
   2537		      ns_to_ktime(1000 * to), HRTIMER_MODE_REL_PINNED);
   2538}
   2539EXPORT_SYMBOL(rvt_add_rnr_timer);
   2540
   2541/**
   2542 * rvt_stop_rc_timers - stop all timers
   2543 * @qp: the QP
   2544 * stop any pending timers
   2545 */
   2546void rvt_stop_rc_timers(struct rvt_qp *qp)
   2547{
   2548	lockdep_assert_held(&qp->s_lock);
   2549	/* Remove QP from all timers */
   2550	if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
   2551		qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
   2552		del_timer(&qp->s_timer);
   2553		hrtimer_try_to_cancel(&qp->s_rnr_timer);
   2554	}
   2555}
   2556EXPORT_SYMBOL(rvt_stop_rc_timers);
   2557
   2558/**
   2559 * rvt_stop_rnr_timer - stop an rnr timer
   2560 * @qp: the QP
   2561 *
   2562 * stop an rnr timer if one had been
   2563 * pending.
   2564 */
   2565static void rvt_stop_rnr_timer(struct rvt_qp *qp)
   2566{
   2567	lockdep_assert_held(&qp->s_lock);
   2568	/* Remove QP from rnr timer */
   2569	if (qp->s_flags & RVT_S_WAIT_RNR) {
   2570		qp->s_flags &= ~RVT_S_WAIT_RNR;
   2571		trace_rvt_rnrnak_stop(qp, 0);
   2572	}
   2573}
   2574
   2575/**
   2576 * rvt_del_timers_sync - wait for any timeout routines to exit
   2577 * @qp: the QP
   2578 */
   2579void rvt_del_timers_sync(struct rvt_qp *qp)
   2580{
   2581	del_timer_sync(&qp->s_timer);
   2582	hrtimer_cancel(&qp->s_rnr_timer);
   2583}
   2584EXPORT_SYMBOL(rvt_del_timers_sync);
   2585
   2586/*
   2587 * This is called from s_timer for missing responses.
   2588 */
   2589static void rvt_rc_timeout(struct timer_list *t)
   2590{
   2591	struct rvt_qp *qp = from_timer(qp, t, s_timer);
   2592	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
   2593	unsigned long flags;
   2594
   2595	spin_lock_irqsave(&qp->r_lock, flags);
   2596	spin_lock(&qp->s_lock);
   2597	if (qp->s_flags & RVT_S_TIMER) {
   2598		struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
   2599
   2600		qp->s_flags &= ~RVT_S_TIMER;
   2601		rvp->n_rc_timeouts++;
   2602		del_timer(&qp->s_timer);
   2603		trace_rvt_rc_timeout(qp, qp->s_last_psn + 1);
   2604		if (rdi->driver_f.notify_restart_rc)
   2605			rdi->driver_f.notify_restart_rc(qp,
   2606							qp->s_last_psn + 1,
   2607							1);
   2608		rdi->driver_f.schedule_send(qp);
   2609	}
   2610	spin_unlock(&qp->s_lock);
   2611	spin_unlock_irqrestore(&qp->r_lock, flags);
   2612}
   2613
   2614/*
   2615 * This is called from s_timer for RNR timeouts.
   2616 */
   2617enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t)
   2618{
   2619	struct rvt_qp *qp = container_of(t, struct rvt_qp, s_rnr_timer);
   2620	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
   2621	unsigned long flags;
   2622
   2623	spin_lock_irqsave(&qp->s_lock, flags);
   2624	rvt_stop_rnr_timer(qp);
   2625	trace_rvt_rnrnak_timeout(qp, 0);
   2626	rdi->driver_f.schedule_send(qp);
   2627	spin_unlock_irqrestore(&qp->s_lock, flags);
   2628	return HRTIMER_NORESTART;
   2629}
   2630EXPORT_SYMBOL(rvt_rc_rnr_retry);
   2631
   2632/**
   2633 * rvt_qp_iter_init - initialize QP iteration
   2634 * @rdi: rvt devinfo
   2635 * @v: u64 value
   2636 * @cb: user-defined callback
   2637 *
   2638 * This returns an iterator suitable for iterating QPs
   2639 * in the system.
   2640 *
   2641 * The @cb is a user-defined callback and @v is a 64-bit
   2642 * value passed to and relevant for processing in the
   2643 * @cb.  An example use case would be to alter QP processing
   2644 * based on criteria not part of the rvt_qp.
   2645 *
   2646 * Use cases that require memory allocation to succeed
   2647 * must preallocate appropriately.
   2648 *
   2649 * Return: a pointer to an rvt_qp_iter or NULL
   2650 */
   2651struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi,
   2652				     u64 v,
   2653				     void (*cb)(struct rvt_qp *qp, u64 v))
   2654{
   2655	struct rvt_qp_iter *i;
   2656
   2657	i = kzalloc(sizeof(*i), GFP_KERNEL);
   2658	if (!i)
   2659		return NULL;
   2660
   2661	i->rdi = rdi;
   2662	/* number of special QPs (SMI/GSI) for device */
   2663	i->specials = rdi->ibdev.phys_port_cnt * 2;
   2664	i->v = v;
   2665	i->cb = cb;
   2666
   2667	return i;
   2668}
   2669EXPORT_SYMBOL(rvt_qp_iter_init);
   2670
   2671/**
   2672 * rvt_qp_iter_next - return the next QP in iter
   2673 * @iter: the iterator
   2674 *
   2675 * Fine grained QP iterator suitable for use
   2676 * with debugfs seq_file mechanisms.
   2677 *
   2678 * Updates iter->qp with the current QP when the return
   2679 * value is 0.
   2680 *
   2681 * Return: 0 - iter->qp is valid 1 - no more QPs
   2682 */
   2683int rvt_qp_iter_next(struct rvt_qp_iter *iter)
   2684	__must_hold(RCU)
   2685{
   2686	int n = iter->n;
   2687	int ret = 1;
   2688	struct rvt_qp *pqp = iter->qp;
   2689	struct rvt_qp *qp;
   2690	struct rvt_dev_info *rdi = iter->rdi;
   2691
   2692	/*
   2693	 * The approach is to consider the special qps
   2694	 * as additional table entries before the
   2695	 * real hash table.  Since the qp code sets
   2696	 * the qp->next hash link to NULL, this works just fine.
   2697	 *
   2698	 * iter->specials is 2 * # ports
   2699	 *
   2700	 * n = 0..iter->specials is the special qp indices
   2701	 *
   2702	 * n = iter->specials..rdi->qp_dev->qp_table_size+iter->specials are
   2703	 * the potential hash bucket entries
   2704	 *
   2705	 */
   2706	for (; n <  rdi->qp_dev->qp_table_size + iter->specials; n++) {
   2707		if (pqp) {
   2708			qp = rcu_dereference(pqp->next);
   2709		} else {
   2710			if (n < iter->specials) {
   2711				struct rvt_ibport *rvp;
   2712				int pidx;
   2713
   2714				pidx = n % rdi->ibdev.phys_port_cnt;
   2715				rvp = rdi->ports[pidx];
   2716				qp = rcu_dereference(rvp->qp[n & 1]);
   2717			} else {
   2718				qp = rcu_dereference(
   2719					rdi->qp_dev->qp_table[
   2720						(n - iter->specials)]);
   2721			}
   2722		}
   2723		pqp = qp;
   2724		if (qp) {
   2725			iter->qp = qp;
   2726			iter->n = n;
   2727			return 0;
   2728		}
   2729	}
   2730	return ret;
   2731}
   2732EXPORT_SYMBOL(rvt_qp_iter_next);
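
/*
 * Illustrative sketch (not taken from any driver): walking QPs with the
 * fine-grained iterator outside of seq_file.  It assumes a NULL callback is
 * acceptable when the caller consumes iter->qp directly; the function name
 * is hypothetical.
 */
#if 0	/* illustrative only, not compiled */
static void example_dump_qps(struct rvt_dev_info *rdi)
{
	struct rvt_qp_iter *iter = rvt_qp_iter_init(rdi, 0, NULL);

	if (!iter)
		return;
	rcu_read_lock();
	while (!rvt_qp_iter_next(iter))
		pr_info("qpn 0x%x state %u\n",
			iter->qp->ibqp.qp_num, iter->qp->state);
	rcu_read_unlock();
	kfree(iter);
}
#endif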
   2733
   2734/**
   2735 * rvt_qp_iter - iterate all QPs
   2736 * @rdi: rvt devinfo
   2737 * @v: a 64-bit value
   2738 * @cb: a callback
   2739 *
   2740 * This provides a way for iterating all QPs.
   2741 *
   2742 * The @cb is a user-defined callback and @v is a 64-bit
   2743 * value passed to and relevant for processing in the
   2744 * cb.  An example use case would be to alter QP processing
   2745 * based on criteria not part of the rvt_qp.
   2746 *
   2747 * The code has an internal iterator to simplify
   2748 * non seq_file use cases.
   2749 */
   2750void rvt_qp_iter(struct rvt_dev_info *rdi,
   2751		 u64 v,
   2752		 void (*cb)(struct rvt_qp *qp, u64 v))
   2753{
   2754	int ret;
   2755	struct rvt_qp_iter i = {
   2756		.rdi = rdi,
   2757		.specials = rdi->ibdev.phys_port_cnt * 2,
   2758		.v = v,
   2759		.cb = cb
   2760	};
   2761
   2762	rcu_read_lock();
   2763	do {
   2764		ret = rvt_qp_iter_next(&i);
   2765		if (!ret) {
   2766			rvt_get_qp(i.qp);
   2767			rcu_read_unlock();
   2768			i.cb(i.qp, i.v);
   2769			rcu_read_lock();
   2770			rvt_put_qp(i.qp);
   2771		}
   2772	} while (!ret);
   2773	rcu_read_unlock();
   2774}
   2775EXPORT_SYMBOL(rvt_qp_iter);
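
/*
 * Illustrative sketch (not taken from any driver): the callback form, using
 * @v to carry a counter pointer through the u64 argument.  The function
 * names are hypothetical.
 */
#if 0	/* illustrative only, not compiled */
static void example_count_rc(struct rvt_qp *qp, u64 v)
{
	atomic64_t *count = (atomic64_t *)(uintptr_t)v;

	if (qp->ibqp.qp_type == IB_QPT_RC)
		atomic64_inc(count);
}

static u64 example_count_rc_qps(struct rvt_dev_info *rdi)
{
	atomic64_t count = ATOMIC64_INIT(0);

	rvt_qp_iter(rdi, (uintptr_t)&count, example_count_rc);
	return atomic64_read(&count);
}
#endif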
   2776
   2777/*
   2778 * This should be called with s_lock and r_lock held.
   2779 */
   2780void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
   2781		       enum ib_wc_status status)
   2782{
   2783	u32 old_last, last;
   2784	struct rvt_dev_info *rdi;
   2785
   2786	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
   2787		return;
   2788	rdi = ib_to_rvt(qp->ibqp.device);
   2789
   2790	old_last = qp->s_last;
   2791	trace_rvt_qp_send_completion(qp, wqe, old_last);
   2792	last = rvt_qp_complete_swqe(qp, wqe, rdi->wc_opcode[wqe->wr.opcode],
   2793				    status);
   2794	if (qp->s_acked == old_last)
   2795		qp->s_acked = last;
   2796	if (qp->s_cur == old_last)
   2797		qp->s_cur = last;
   2798	if (qp->s_tail == old_last)
   2799		qp->s_tail = last;
   2800	if (qp->state == IB_QPS_SQD && last == qp->s_cur)
   2801		qp->s_draining = 0;
   2802}
   2803EXPORT_SYMBOL(rvt_send_complete);
   2804
   2805/**
   2806 * rvt_copy_sge - copy data to SGE memory
   2807 * @qp: associated QP
   2808 * @ss: the SGE state
   2809 * @data: the data to copy
   2810 * @length: the length of the data
   2811 * @release: boolean to release MR
   2812 * @copy_last: do a separate copy of the last 8 bytes
   2813 */
   2814void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss,
   2815		  void *data, u32 length,
   2816		  bool release, bool copy_last)
   2817{
   2818	struct rvt_sge *sge = &ss->sge;
   2819	int i;
   2820	bool in_last = false;
   2821	bool cacheless_copy = false;
   2822	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
   2823	struct rvt_wss *wss = rdi->wss;
   2824	unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode;
   2825
   2826	if (sge_copy_mode == RVT_SGE_COPY_CACHELESS) {
   2827		cacheless_copy = length >= PAGE_SIZE;
   2828	} else if (sge_copy_mode == RVT_SGE_COPY_ADAPTIVE) {
   2829		if (length >= PAGE_SIZE) {
   2830			/*
   2831			 * NOTE: this *assumes*:
   2832			 * o The first vaddr is the dest.
   2833			 * o If multiple pages, then vaddr is sequential.
   2834			 */
   2835			wss_insert(wss, sge->vaddr);
   2836			if (length >= (2 * PAGE_SIZE))
   2837				wss_insert(wss, (sge->vaddr + PAGE_SIZE));
   2838
   2839			cacheless_copy = wss_exceeds_threshold(wss);
   2840		} else {
   2841			wss_advance_clean_counter(wss);
   2842		}
   2843	}
   2844
   2845	if (copy_last) {
   2846		if (length > 8) {
   2847			length -= 8;
   2848		} else {
   2849			copy_last = false;
   2850			in_last = true;
   2851		}
   2852	}
   2853
   2854again:
   2855	while (length) {
   2856		u32 len = rvt_get_sge_length(sge, length);
   2857
   2858		WARN_ON_ONCE(len == 0);
   2859		if (unlikely(in_last)) {
   2860			/* enforce byte transfer ordering */
   2861			for (i = 0; i < len; i++)
   2862				((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
   2863		} else if (cacheless_copy) {
   2864			cacheless_memcpy(sge->vaddr, data, len);
   2865		} else {
   2866			memcpy(sge->vaddr, data, len);
   2867		}
   2868		rvt_update_sge(ss, len, release);
   2869		data += len;
   2870		length -= len;
   2871	}
   2872
   2873	if (copy_last) {
   2874		copy_last = false;
   2875		in_last = true;
   2876		length = 8;
   2877		goto again;
   2878	}
   2879}
   2880EXPORT_SYMBOL(rvt_copy_sge);
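
/*
 * Worked example for the copy_last handling above (illustrative length): a
 * 16-byte copy with copy_last set moves the first 8 bytes through the normal
 * (or cacheless) memcpy path and then copies the final 8 bytes byte-by-byte
 * on the second pass, so the tail of the payload is only written after
 * everything before it (see the "enforce byte transfer ordering" loop).
 */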
   2881
   2882static enum ib_wc_status loopback_qp_drop(struct rvt_ibport *rvp,
   2883					  struct rvt_qp *sqp)
   2884{
   2885	rvp->n_pkt_drops++;
   2886	/*
   2887	 * For RC, the requester would time out and retry, so
   2888	 * shortcut the timeouts and just signal too many retries.
   2889	 */
   2890	return sqp->ibqp.qp_type == IB_QPT_RC ?
   2891		IB_WC_RETRY_EXC_ERR : IB_WC_SUCCESS;
   2892}
   2893
   2894/**
   2895 * rvt_ruc_loopback - handle UC and RC loopback requests
   2896 * @sqp: the sending QP
   2897 *
   2898 * This is called from rvt_do_send() to forward a WQE addressed to the same HFI.
   2899 * Note that although we are single threaded due to the send engine, we still
   2900 * have to protect against post_send().  We don't have to worry about
   2901 * receive interrupts since this is a connected protocol and all packets
   2902 * will pass through here.
   2903 */
   2904void rvt_ruc_loopback(struct rvt_qp *sqp)
   2905{
   2906	struct rvt_ibport *rvp =  NULL;
   2907	struct rvt_dev_info *rdi = ib_to_rvt(sqp->ibqp.device);
   2908	struct rvt_qp *qp;
   2909	struct rvt_swqe *wqe;
   2910	struct rvt_sge *sge;
   2911	unsigned long flags;
   2912	struct ib_wc wc;
   2913	u64 sdata;
   2914	atomic64_t *maddr;
   2915	enum ib_wc_status send_status;
   2916	bool release;
   2917	int ret;
   2918	bool copy_last = false;
   2919	int local_ops = 0;
   2920
   2921	rcu_read_lock();
   2922	rvp = rdi->ports[sqp->port_num - 1];
   2923
   2924	/*
   2925	 * Note that we check the responder QP state after
   2926	 * checking the requester's state.
   2927	 */
   2928
   2929	qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), rvp,
   2930			    sqp->remote_qpn);
   2931
   2932	spin_lock_irqsave(&sqp->s_lock, flags);
   2933
   2934	/* Return if we are already busy processing a work request. */
   2935	if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) ||
   2936	    !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND))
   2937		goto unlock;
   2938
   2939	sqp->s_flags |= RVT_S_BUSY;
   2940
   2941again:
   2942	if (sqp->s_last == READ_ONCE(sqp->s_head))
   2943		goto clr_busy;
   2944	wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
   2945
   2946	/* Return if it is not OK to start a new work request. */
   2947	if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
   2948		if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND))
   2949			goto clr_busy;
   2950		/* We are in the error state, flush the work request. */
   2951		send_status = IB_WC_WR_FLUSH_ERR;
   2952		goto flush_send;
   2953	}
   2954
   2955	/*
   2956	 * We can rely on the entry not changing without the s_lock
   2957	 * being held until we update s_last.
   2958	 * We increment s_cur to indicate s_last is in progress.
   2959	 */
   2960	if (sqp->s_last == sqp->s_cur) {
   2961		if (++sqp->s_cur >= sqp->s_size)
   2962			sqp->s_cur = 0;
   2963	}
   2964	spin_unlock_irqrestore(&sqp->s_lock, flags);
   2965
   2966	if (!qp) {
   2967		send_status = loopback_qp_drop(rvp, sqp);
   2968		goto serr_no_r_lock;
   2969	}
   2970	spin_lock_irqsave(&qp->r_lock, flags);
   2971	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
   2972	    qp->ibqp.qp_type != sqp->ibqp.qp_type) {
   2973		send_status = loopback_qp_drop(rvp, sqp);
   2974		goto serr;
   2975	}
   2976
   2977	memset(&wc, 0, sizeof(wc));
   2978	send_status = IB_WC_SUCCESS;
   2979
   2980	release = true;
   2981	sqp->s_sge.sge = wqe->sg_list[0];
   2982	sqp->s_sge.sg_list = wqe->sg_list + 1;
   2983	sqp->s_sge.num_sge = wqe->wr.num_sge;
   2984	sqp->s_len = wqe->length;
   2985	switch (wqe->wr.opcode) {
   2986	case IB_WR_REG_MR:
   2987		goto send_comp;
   2988
   2989	case IB_WR_LOCAL_INV:
   2990		if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
   2991			if (rvt_invalidate_rkey(sqp,
   2992						wqe->wr.ex.invalidate_rkey))
   2993				send_status = IB_WC_LOC_PROT_ERR;
   2994			local_ops = 1;
   2995		}
   2996		goto send_comp;
   2997
   2998	case IB_WR_SEND_WITH_INV:
   2999	case IB_WR_SEND_WITH_IMM:
   3000	case IB_WR_SEND:
   3001		ret = rvt_get_rwqe(qp, false);
   3002		if (ret < 0)
   3003			goto op_err;
   3004		if (!ret)
   3005			goto rnr_nak;
   3006		if (wqe->length > qp->r_len)
   3007			goto inv_err;
   3008		switch (wqe->wr.opcode) {
   3009		case IB_WR_SEND_WITH_INV:
   3010			if (!rvt_invalidate_rkey(qp,
   3011						 wqe->wr.ex.invalidate_rkey)) {
   3012				wc.wc_flags = IB_WC_WITH_INVALIDATE;
   3013				wc.ex.invalidate_rkey =
   3014					wqe->wr.ex.invalidate_rkey;
   3015			}
   3016			break;
   3017		case IB_WR_SEND_WITH_IMM:
   3018			wc.wc_flags = IB_WC_WITH_IMM;
   3019			wc.ex.imm_data = wqe->wr.ex.imm_data;
   3020			break;
   3021		default:
   3022			break;
   3023		}
   3024		break;
   3025
   3026	case IB_WR_RDMA_WRITE_WITH_IMM:
   3027		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
   3028			goto inv_err;
   3029		wc.wc_flags = IB_WC_WITH_IMM;
   3030		wc.ex.imm_data = wqe->wr.ex.imm_data;
   3031		ret = rvt_get_rwqe(qp, true);
   3032		if (ret < 0)
   3033			goto op_err;
   3034		if (!ret)
   3035			goto rnr_nak;
   3036		/* skip copy_last set and qp_access_flags recheck */
   3037		goto do_write;
   3038	case IB_WR_RDMA_WRITE:
   3039		copy_last = rvt_is_user_qp(qp);
   3040		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
   3041			goto inv_err;
   3042do_write:
   3043		if (wqe->length == 0)
   3044			break;
   3045		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
   3046					  wqe->rdma_wr.remote_addr,
   3047					  wqe->rdma_wr.rkey,
   3048					  IB_ACCESS_REMOTE_WRITE)))
   3049			goto acc_err;
   3050		qp->r_sge.sg_list = NULL;
   3051		qp->r_sge.num_sge = 1;
   3052		qp->r_sge.total_len = wqe->length;
   3053		break;
   3054
   3055	case IB_WR_RDMA_READ:
   3056		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
   3057			goto inv_err;
   3058		if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
   3059					  wqe->rdma_wr.remote_addr,
   3060					  wqe->rdma_wr.rkey,
   3061					  IB_ACCESS_REMOTE_READ)))
   3062			goto acc_err;
   3063		release = false;
   3064		sqp->s_sge.sg_list = NULL;
   3065		sqp->s_sge.num_sge = 1;
   3066		qp->r_sge.sge = wqe->sg_list[0];
   3067		qp->r_sge.sg_list = wqe->sg_list + 1;
   3068		qp->r_sge.num_sge = wqe->wr.num_sge;
   3069		qp->r_sge.total_len = wqe->length;
   3070		break;
   3071
   3072	case IB_WR_ATOMIC_CMP_AND_SWP:
   3073	case IB_WR_ATOMIC_FETCH_AND_ADD:
   3074		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
   3075			goto inv_err;
   3076		if (unlikely(wqe->atomic_wr.remote_addr & (sizeof(u64) - 1)))
   3077			goto inv_err;
   3078		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
   3079					  wqe->atomic_wr.remote_addr,
   3080					  wqe->atomic_wr.rkey,
   3081					  IB_ACCESS_REMOTE_ATOMIC)))
   3082			goto acc_err;
   3083		/* Perform atomic OP and save result. */
   3084		maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
   3085		sdata = wqe->atomic_wr.compare_add;
   3086		*(u64 *)sqp->s_sge.sge.vaddr =
   3087			(wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
   3088			(u64)atomic64_add_return(sdata, maddr) - sdata :
   3089			(u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
   3090				      sdata, wqe->atomic_wr.swap);
   3091		rvt_put_mr(qp->r_sge.sge.mr);
   3092		qp->r_sge.num_sge = 0;
   3093		goto send_comp;
   3094
   3095	default:
   3096		send_status = IB_WC_LOC_QP_OP_ERR;
   3097		goto serr;
   3098	}
   3099
   3100	sge = &sqp->s_sge.sge;
   3101	while (sqp->s_len) {
   3102		u32 len = rvt_get_sge_length(sge, sqp->s_len);
   3103
   3104		WARN_ON_ONCE(len == 0);
   3105		rvt_copy_sge(qp, &qp->r_sge, sge->vaddr,
   3106			     len, release, copy_last);
   3107		rvt_update_sge(&sqp->s_sge, len, !release);
   3108		sqp->s_len -= len;
   3109	}
   3110	if (release)
   3111		rvt_put_ss(&qp->r_sge);
   3112
   3113	if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
   3114		goto send_comp;
   3115
   3116	if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
   3117		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
   3118	else
   3119		wc.opcode = IB_WC_RECV;
   3120	wc.wr_id = qp->r_wr_id;
   3121	wc.status = IB_WC_SUCCESS;
   3122	wc.byte_len = wqe->length;
   3123	wc.qp = &qp->ibqp;
   3124	wc.src_qp = qp->remote_qpn;
   3125	wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX;
   3126	wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
   3127	wc.port_num = 1;
   3128	/* Signal completion event if the solicited bit is set. */
   3129	rvt_recv_cq(qp, &wc, wqe->wr.send_flags & IB_SEND_SOLICITED);
   3130
   3131send_comp:
   3132	spin_unlock_irqrestore(&qp->r_lock, flags);
   3133	spin_lock_irqsave(&sqp->s_lock, flags);
   3134	rvp->n_loop_pkts++;
   3135flush_send:
   3136	sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
   3137	spin_lock(&sqp->r_lock);
   3138	rvt_send_complete(sqp, wqe, send_status);
   3139	spin_unlock(&sqp->r_lock);
   3140	if (local_ops) {
   3141		atomic_dec(&sqp->local_ops_pending);
   3142		local_ops = 0;
   3143	}
   3144	goto again;
   3145
   3146rnr_nak:
   3147	/* Handle RNR NAK */
   3148	if (qp->ibqp.qp_type == IB_QPT_UC)
   3149		goto send_comp;
   3150	rvp->n_rnr_naks++;
   3151	/*
   3152	 * Note: we don't need the s_lock held since the BUSY flag
   3153	 * makes this single threaded.
   3154	 */
   3155	if (sqp->s_rnr_retry == 0) {
   3156		send_status = IB_WC_RNR_RETRY_EXC_ERR;
   3157		goto serr;
   3158	}
   3159	if (sqp->s_rnr_retry_cnt < 7)
   3160		sqp->s_rnr_retry--;
   3161	spin_unlock_irqrestore(&qp->r_lock, flags);
   3162	spin_lock_irqsave(&sqp->s_lock, flags);
   3163	if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
   3164		goto clr_busy;
   3165	rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer <<
   3166				IB_AETH_CREDIT_SHIFT);
   3167	goto clr_busy;
   3168
   3169op_err:
   3170	send_status = IB_WC_REM_OP_ERR;
   3171	wc.status = IB_WC_LOC_QP_OP_ERR;
   3172	goto err;
   3173
   3174inv_err:
   3175	send_status =
   3176		sqp->ibqp.qp_type == IB_QPT_RC ?
   3177			IB_WC_REM_INV_REQ_ERR :
   3178			IB_WC_SUCCESS;
   3179	wc.status = IB_WC_LOC_QP_OP_ERR;
   3180	goto err;
   3181
   3182acc_err:
   3183	send_status = IB_WC_REM_ACCESS_ERR;
   3184	wc.status = IB_WC_LOC_PROT_ERR;
   3185err:
   3186	/* responder goes to error state */
   3187	rvt_rc_error(qp, wc.status);
   3188
   3189serr:
   3190	spin_unlock_irqrestore(&qp->r_lock, flags);
   3191serr_no_r_lock:
   3192	spin_lock_irqsave(&sqp->s_lock, flags);
   3193	spin_lock(&sqp->r_lock);
   3194	rvt_send_complete(sqp, wqe, send_status);
   3195	spin_unlock(&sqp->r_lock);
   3196	if (sqp->ibqp.qp_type == IB_QPT_RC) {
   3197		int lastwqe;
   3198
   3199		spin_lock(&sqp->r_lock);
   3200		lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
   3201		spin_unlock(&sqp->r_lock);
   3202
   3203		sqp->s_flags &= ~RVT_S_BUSY;
   3204		spin_unlock_irqrestore(&sqp->s_lock, flags);
   3205		if (lastwqe) {
   3206			struct ib_event ev;
   3207
   3208			ev.device = sqp->ibqp.device;
   3209			ev.element.qp = &sqp->ibqp;
   3210			ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
   3211			sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
   3212		}
   3213		goto done;
   3214	}
   3215clr_busy:
   3216	sqp->s_flags &= ~RVT_S_BUSY;
   3217unlock:
   3218	spin_unlock_irqrestore(&sqp->s_lock, flags);
   3219done:
   3220	rcu_read_unlock();
   3221}
   3222EXPORT_SYMBOL(rvt_ruc_loopback);