cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

mthca_cq.c (26005B)


/*
 * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2005, 2006 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/gfp.h>
#include <linux/hardirq.h>
#include <linux/sched.h>

#include <asm/io.h>

#include <rdma/ib_pack.h>

#include "mthca_dev.h"
#include "mthca_cmd.h"
#include "mthca_memfree.h"

enum {
	MTHCA_MAX_DIRECT_CQ_SIZE = 4 * PAGE_SIZE
};

enum {
	MTHCA_CQ_ENTRY_SIZE = 0x20
};

enum {
	MTHCA_ATOMIC_BYTE_LEN = 8
};

/*
 * Must be packed because start is 64 bits but only aligned to 32 bits.
 */
struct mthca_cq_context {
	__be32 flags;
	__be64 start;
	__be32 logsize_usrpage;
	__be32 error_eqn;	/* Tavor only */
	__be32 comp_eqn;
	__be32 pd;
	__be32 lkey;
	__be32 last_notified_index;
	__be32 solicit_producer_index;
	__be32 consumer_index;
	__be32 producer_index;
	__be32 cqn;
	__be32 ci_db;		/* Arbel only */
	__be32 state_db;	/* Arbel only */
	u32    reserved;
} __packed;

#define MTHCA_CQ_STATUS_OK          ( 0 << 28)
#define MTHCA_CQ_STATUS_OVERFLOW    ( 9 << 28)
#define MTHCA_CQ_STATUS_WRITE_FAIL  (10 << 28)
#define MTHCA_CQ_FLAG_TR            ( 1 << 18)
#define MTHCA_CQ_FLAG_OI            ( 1 << 17)
#define MTHCA_CQ_STATE_DISARMED     ( 0 <<  8)
#define MTHCA_CQ_STATE_ARMED        ( 1 <<  8)
#define MTHCA_CQ_STATE_ARMED_SOL    ( 4 <<  8)
#define MTHCA_EQ_STATE_FIRED        (10 <<  8)

enum {
	MTHCA_ERROR_CQE_OPCODE_MASK = 0xfe
};

enum {
	SYNDROME_LOCAL_LENGTH_ERR        = 0x01,
	SYNDROME_LOCAL_QP_OP_ERR         = 0x02,
	SYNDROME_LOCAL_EEC_OP_ERR        = 0x03,
	SYNDROME_LOCAL_PROT_ERR          = 0x04,
	SYNDROME_WR_FLUSH_ERR            = 0x05,
	SYNDROME_MW_BIND_ERR             = 0x06,
	SYNDROME_BAD_RESP_ERR            = 0x10,
	SYNDROME_LOCAL_ACCESS_ERR        = 0x11,
	SYNDROME_REMOTE_INVAL_REQ_ERR    = 0x12,
	SYNDROME_REMOTE_ACCESS_ERR       = 0x13,
	SYNDROME_REMOTE_OP_ERR           = 0x14,
	SYNDROME_RETRY_EXC_ERR           = 0x15,
	SYNDROME_RNR_RETRY_EXC_ERR       = 0x16,
	SYNDROME_LOCAL_RDD_VIOL_ERR      = 0x20,
	SYNDROME_REMOTE_INVAL_RD_REQ_ERR = 0x21,
	SYNDROME_REMOTE_ABORTED_ERR      = 0x22,
	SYNDROME_INVAL_EECN_ERR          = 0x23,
	SYNDROME_INVAL_EEC_STATE_ERR     = 0x24
};

struct mthca_cqe {
	__be32 my_qpn;
	__be32 my_ee;
	__be32 rqpn;
	u8     sl_ipok;
	u8     g_mlpath;
	__be16 rlid;
	__be32 imm_etype_pkey_eec;
	__be32 byte_cnt;
	__be32 wqe;
	u8     opcode;
	u8     is_send;
	u8     reserved;
	u8     owner;
};

struct mthca_err_cqe {
	__be32 my_qpn;
	u32    reserved1[3];
	u8     syndrome;
	u8     vendor_err;
	__be16 db_cnt;
	u32    reserved2;
	__be32 wqe;
	u8     opcode;
	u8     reserved3[2];
	u8     owner;
};

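/*
 * Both CQE layouts are MTHCA_CQ_ENTRY_SIZE (32) bytes.  The high bit of
 * the final byte (owner) says whether hardware or software currently
 * owns the entry; cqe_sw() and set_cqe_hw() below test and flip it.
 */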
#define MTHCA_CQ_ENTRY_OWNER_SW      (0 << 7)
#define MTHCA_CQ_ENTRY_OWNER_HW      (1 << 7)

#define MTHCA_TAVOR_CQ_DB_INC_CI       (1 << 24)
#define MTHCA_TAVOR_CQ_DB_REQ_NOT      (2 << 24)
#define MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL  (3 << 24)
#define MTHCA_TAVOR_CQ_DB_SET_CI       (4 << 24)
#define MTHCA_TAVOR_CQ_DB_REQ_NOT_MULT (5 << 24)

#define MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL  (1 << 24)
#define MTHCA_ARBEL_CQ_DB_REQ_NOT      (2 << 24)
#define MTHCA_ARBEL_CQ_DB_REQ_NOT_MULT (3 << 24)

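/*
 * A CQ buffer is either one contiguous ("direct") allocation or a list
 * of pages.  Since each entry is MTHCA_CQ_ENTRY_SIZE (32) bytes, entry N
 * of a paged buffer lives in page (N * 32) / PAGE_SIZE at offset
 * (N * 32) % PAGE_SIZE.
 */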
static inline struct mthca_cqe *get_cqe_from_buf(struct mthca_cq_buf *buf,
						 int entry)
{
	if (buf->is_direct)
		return buf->queue.direct.buf + (entry * MTHCA_CQ_ENTRY_SIZE);
	else
		return buf->queue.page_list[entry * MTHCA_CQ_ENTRY_SIZE / PAGE_SIZE].buf
			+ (entry * MTHCA_CQ_ENTRY_SIZE) % PAGE_SIZE;
}

static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)
{
	return get_cqe_from_buf(&cq->buf, entry);
}

static inline struct mthca_cqe *cqe_sw(struct mthca_cqe *cqe)
{
	return MTHCA_CQ_ENTRY_OWNER_HW & cqe->owner ? NULL : cqe;
}

static inline struct mthca_cqe *next_cqe_sw(struct mthca_cq *cq)
{
	return cqe_sw(get_cqe(cq, cq->cons_index & cq->ibcq.cqe));
}

static inline void set_cqe_hw(struct mthca_cqe *cqe)
{
	cqe->owner = MTHCA_CQ_ENTRY_OWNER_HW;
}

static void dump_cqe(struct mthca_dev *dev, void *cqe_ptr)
{
	__be32 *cqe = cqe_ptr;

	(void) cqe;	/* avoid warning if mthca_dbg compiled away... */
	mthca_dbg(dev, "CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n",
		  be32_to_cpu(cqe[0]), be32_to_cpu(cqe[1]), be32_to_cpu(cqe[2]),
		  be32_to_cpu(cqe[3]), be32_to_cpu(cqe[4]), be32_to_cpu(cqe[5]),
		  be32_to_cpu(cqe[6]), be32_to_cpu(cqe[7]));
}

/*
 * incr is ignored in native Arbel (mem-free) mode, so cq->cons_index
 * should be correct before calling update_cons_index().
 */
static inline void update_cons_index(struct mthca_dev *dev, struct mthca_cq *cq,
				     int incr)
{
	if (mthca_is_memfree(dev)) {
		*cq->set_ci_db = cpu_to_be32(cq->cons_index);
		wmb();
	} else {
		mthca_write64(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn, incr - 1,
			      dev->kar + MTHCA_CQ_DOORBELL,
			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
	}
}

void mthca_cq_completion(struct mthca_dev *dev, u32 cqn)
{
	struct mthca_cq *cq;

	cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));

	if (!cq) {
		mthca_warn(dev, "Completion event for bogus CQ %08x\n", cqn);
		return;
	}

	++cq->arm_sn;

	cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
}

void mthca_cq_event(struct mthca_dev *dev, u32 cqn,
		    enum ib_event_type event_type)
{
	struct mthca_cq *cq;
	struct ib_event event;

	spin_lock(&dev->cq_table.lock);

	cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
	if (cq)
		++cq->refcount;

	spin_unlock(&dev->cq_table.lock);

	if (!cq) {
		mthca_warn(dev, "Async event for bogus CQ %08x\n", cqn);
		return;
	}

	event.device      = &dev->ib_dev;
	event.event       = event_type;
	event.element.cq  = &cq->ibcq;
	if (cq->ibcq.event_handler)
		cq->ibcq.event_handler(&event, cq->ibcq.cq_context);

	spin_lock(&dev->cq_table.lock);
	if (!--cq->refcount)
		wake_up(&cq->wait);
	spin_unlock(&dev->cq_table.lock);
}

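/*
 * Error CQEs reuse the opcode field to flag direction: bit 0 set means
 * the completion is for a send work request, clear means receive.  For
 * successful CQEs the high bit of is_send carries the same information.
 */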
static inline int is_recv_cqe(struct mthca_cqe *cqe)
{
	if ((cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
	    MTHCA_ERROR_CQE_OPCODE_MASK)
		return !(cqe->opcode & 0x01);
	else
		return !(cqe->is_send & 0x80);
}

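/*
 * Remove all CQEs belonging to QP "qpn" from the CQ while preserving
 * the order of the survivors.  Surviving entries are shifted up toward
 * the producer index so the freed slots collect at the consumer end.
 * For example, with pending entries [X, Y, Z] where only Y belongs to
 * the QP being cleaned, X is copied over Y's slot, the slot at the old
 * consumer index is handed back to hardware, and the consumer index
 * advances by one, leaving [X, Z] still pollable.
 */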
void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn,
		    struct mthca_srq *srq)
{
	struct mthca_cqe *cqe;
	u32 prod_index;
	int i, nfreed = 0;

	spin_lock_irq(&cq->lock);

	/*
	 * First we need to find the current producer index, so we
	 * know where to start cleaning from.  It doesn't matter if HW
	 * adds new entries after this loop -- the QP we're worried
	 * about is already in RESET, so the new entries won't come
	 * from our QP and therefore don't need to be checked.
	 */
	for (prod_index = cq->cons_index;
	     cqe_sw(get_cqe(cq, prod_index & cq->ibcq.cqe));
	     ++prod_index)
		if (prod_index == cq->cons_index + cq->ibcq.cqe)
			break;

	if (0)
		mthca_dbg(dev, "Cleaning QPN %06x from CQN %06x; ci %d, pi %d\n",
			  qpn, cq->cqn, cq->cons_index, prod_index);

	/*
	 * Now sweep backwards through the CQ, removing CQ entries
	 * that match our QP by copying older entries on top of them.
	 */
	while ((int) --prod_index - (int) cq->cons_index >= 0) {
		cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
		if (cqe->my_qpn == cpu_to_be32(qpn)) {
			if (srq && is_recv_cqe(cqe))
				mthca_free_srq_wqe(srq, be32_to_cpu(cqe->wqe));
			++nfreed;
		} else if (nfreed)
			memcpy(get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe),
			       cqe, MTHCA_CQ_ENTRY_SIZE);
	}

	if (nfreed) {
		for (i = 0; i < nfreed; ++i)
			set_cqe_hw(get_cqe(cq, (cq->cons_index + i) & cq->ibcq.cqe));
		wmb();
		cq->cons_index += nfreed;
		update_cons_index(dev, cq, nfreed);
	}

	spin_unlock_irq(&cq->lock);
}

void mthca_cq_resize_copy_cqes(struct mthca_cq *cq)
{
	int i;

	/*
	 * In Tavor mode, the hardware keeps the consumer and producer
	 * indices mod the CQ size.  Since we might be making the CQ
	 * bigger, we need to deal with the case where the producer
	 * index wrapped around before the CQ was resized.
	 */
	if (!mthca_is_memfree(to_mdev(cq->ibcq.device)) &&
	    cq->ibcq.cqe < cq->resize_buf->cqe) {
		cq->cons_index &= cq->ibcq.cqe;
		if (cqe_sw(get_cqe(cq, cq->ibcq.cqe)))
			cq->cons_index -= cq->ibcq.cqe + 1;
	}

	for (i = cq->cons_index; cqe_sw(get_cqe(cq, i & cq->ibcq.cqe)); ++i)
		memcpy(get_cqe_from_buf(&cq->resize_buf->buf,
					i & cq->resize_buf->cqe),
		       get_cqe(cq, i & cq->ibcq.cqe), MTHCA_CQ_ENTRY_SIZE);
}

int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent)
{
	int ret;
	int i;

	ret = mthca_buf_alloc(dev, nent * MTHCA_CQ_ENTRY_SIZE,
			      MTHCA_MAX_DIRECT_CQ_SIZE,
			      &buf->queue, &buf->is_direct,
			      &dev->driver_pd, 1, &buf->mr);
	if (ret)
		return ret;

	for (i = 0; i < nent; ++i)
		set_cqe_hw(get_cqe_from_buf(buf, i));

	return 0;
}

void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe)
{
	mthca_buf_free(dev, (cqe + 1) * MTHCA_CQ_ENTRY_SIZE, &buf->queue,
		       buf->is_direct, &buf->mr);
}

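/*
 * Translate a completion-with-error CQE into an ib_wc status.  On
 * mem-free (Arbel) HCAs that is all that is needed.  On Tavor one error
 * CQE may stand for several outstanding WQEs, so instead of freeing it
 * right away we may rewrite it as a flush error pointing at the next
 * WQE and leave it in place (*free_cqe = 0) for the next poll.
 */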
static void handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq,
			     struct mthca_qp *qp, int wqe_index, int is_send,
			     struct mthca_err_cqe *cqe,
			     struct ib_wc *entry, int *free_cqe)
{
	int dbd;
	__be32 new_wqe;

	if (cqe->syndrome == SYNDROME_LOCAL_QP_OP_ERR) {
		mthca_dbg(dev, "local QP operation err "
			  "(QPN %06x, WQE @ %08x, CQN %06x, index %d)\n",
			  be32_to_cpu(cqe->my_qpn), be32_to_cpu(cqe->wqe),
			  cq->cqn, cq->cons_index);
		dump_cqe(dev, cqe);
	}

	/*
	 * For completions in error, only work request ID, status, vendor error
	 * (and freed resource count for RD) have to be set.
	 */
	switch (cqe->syndrome) {
	case SYNDROME_LOCAL_LENGTH_ERR:
		entry->status = IB_WC_LOC_LEN_ERR;
		break;
	case SYNDROME_LOCAL_QP_OP_ERR:
		entry->status = IB_WC_LOC_QP_OP_ERR;
		break;
	case SYNDROME_LOCAL_EEC_OP_ERR:
		entry->status = IB_WC_LOC_EEC_OP_ERR;
		break;
	case SYNDROME_LOCAL_PROT_ERR:
		entry->status = IB_WC_LOC_PROT_ERR;
		break;
	case SYNDROME_WR_FLUSH_ERR:
		entry->status = IB_WC_WR_FLUSH_ERR;
		break;
	case SYNDROME_MW_BIND_ERR:
		entry->status = IB_WC_MW_BIND_ERR;
		break;
	case SYNDROME_BAD_RESP_ERR:
		entry->status = IB_WC_BAD_RESP_ERR;
		break;
	case SYNDROME_LOCAL_ACCESS_ERR:
		entry->status = IB_WC_LOC_ACCESS_ERR;
		break;
	case SYNDROME_REMOTE_INVAL_REQ_ERR:
		entry->status = IB_WC_REM_INV_REQ_ERR;
		break;
	case SYNDROME_REMOTE_ACCESS_ERR:
		entry->status = IB_WC_REM_ACCESS_ERR;
		break;
	case SYNDROME_REMOTE_OP_ERR:
		entry->status = IB_WC_REM_OP_ERR;
		break;
	case SYNDROME_RETRY_EXC_ERR:
		entry->status = IB_WC_RETRY_EXC_ERR;
		break;
	case SYNDROME_RNR_RETRY_EXC_ERR:
		entry->status = IB_WC_RNR_RETRY_EXC_ERR;
		break;
	case SYNDROME_LOCAL_RDD_VIOL_ERR:
		entry->status = IB_WC_LOC_RDD_VIOL_ERR;
		break;
	case SYNDROME_REMOTE_INVAL_RD_REQ_ERR:
		entry->status = IB_WC_REM_INV_RD_REQ_ERR;
		break;
	case SYNDROME_REMOTE_ABORTED_ERR:
		entry->status = IB_WC_REM_ABORT_ERR;
		break;
	case SYNDROME_INVAL_EECN_ERR:
		entry->status = IB_WC_INV_EECN_ERR;
		break;
	case SYNDROME_INVAL_EEC_STATE_ERR:
		entry->status = IB_WC_INV_EEC_STATE_ERR;
		break;
	default:
		entry->status = IB_WC_GENERAL_ERR;
		break;
	}

	entry->vendor_err = cqe->vendor_err;

	/*
	 * Mem-free HCAs always generate one CQE per WQE, even in the
	 * error case, so we don't have to check the doorbell count, etc.
	 */
	if (mthca_is_memfree(dev))
		return;

	mthca_free_err_wqe(dev, qp, is_send, wqe_index, &dbd, &new_wqe);

	/*
	 * If we're at the end of the WQE chain, or we've used up our
	 * doorbell count, free the CQE.  Otherwise just update it for
	 * the next poll operation.
	 */
	if (!(new_wqe & cpu_to_be32(0x3f)) || (!cqe->db_cnt && dbd))
		return;

	be16_add_cpu(&cqe->db_cnt, -dbd);
	cqe->wqe      = new_wqe;
	cqe->syndrome = SYNDROME_WR_FLUSH_ERR;

	*free_cqe = 0;
}

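/*
 * Poll a single CQE.  Returns 0 and fills in *entry when a completion
 * was found, -EAGAIN when no software-owned CQE is available, or a
 * negative error code.  *cur_qp caches the QP of the previous
 * completion so the QP table lookup is skipped when consecutive CQEs
 * belong to the same QP; *freed counts CQEs handed back to hardware so
 * the caller can ring the consumer-index doorbell once at the end.
 */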
static inline int mthca_poll_one(struct mthca_dev *dev,
				 struct mthca_cq *cq,
				 struct mthca_qp **cur_qp,
				 int *freed,
				 struct ib_wc *entry)
{
	struct mthca_wq *wq;
	struct mthca_cqe *cqe;
	int wqe_index;
	int is_error;
	int is_send;
	int free_cqe = 1;
	int err = 0;
	u16 checksum;

	cqe = next_cqe_sw(cq);
	if (!cqe)
		return -EAGAIN;

	/*
	 * Make sure we read CQ entry contents after we've checked the
	 * ownership bit.
	 */
	rmb();

	if (0) {
		mthca_dbg(dev, "%x/%d: CQE -> QPN %06x, WQE @ %08x\n",
			  cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn),
			  be32_to_cpu(cqe->wqe));
		dump_cqe(dev, cqe);
	}

	is_error = (cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
		MTHCA_ERROR_CQE_OPCODE_MASK;
	is_send  = is_error ? cqe->opcode & 0x01 : cqe->is_send & 0x80;

	if (!*cur_qp || be32_to_cpu(cqe->my_qpn) != (*cur_qp)->qpn) {
		/*
		 * We do not have to take the QP table lock here,
		 * because CQs will be locked while QPs are removed
		 * from the table.
		 */
		*cur_qp = mthca_array_get(&dev->qp_table.qp,
					  be32_to_cpu(cqe->my_qpn) &
					  (dev->limits.num_qps - 1));
		if (!*cur_qp) {
			mthca_warn(dev, "CQ entry for unknown QP %06x\n",
				   be32_to_cpu(cqe->my_qpn) & 0xffffff);
			err = -EINVAL;
			goto out;
		}
	}

	entry->qp = &(*cur_qp)->ibqp;

	if (is_send) {
		wq = &(*cur_qp)->sq;
		wqe_index = ((be32_to_cpu(cqe->wqe) - (*cur_qp)->send_wqe_offset)
			     >> wq->wqe_shift);
		entry->wr_id = (*cur_qp)->wrid[wqe_index +
					       (*cur_qp)->rq.max];
	} else if ((*cur_qp)->ibqp.srq) {
		struct mthca_srq *srq = to_msrq((*cur_qp)->ibqp.srq);
		u32 wqe = be32_to_cpu(cqe->wqe);
		wq = NULL;
		wqe_index = wqe >> srq->wqe_shift;
		entry->wr_id = srq->wrid[wqe_index];
		mthca_free_srq_wqe(srq, wqe);
	} else {
		s32 wqe;
		wq = &(*cur_qp)->rq;
		wqe = be32_to_cpu(cqe->wqe);
		wqe_index = wqe >> wq->wqe_shift;
		/*
		 * WQE addr == base - 1 might be reported in receive completion
		 * with error instead of (rq size - 1) by Sinai FW 1.0.800 and
		 * Arbel FW 5.1.400.  This bug should be fixed in later FW revs.
		 */
		if (unlikely(wqe_index < 0))
			wqe_index = wq->max - 1;
		entry->wr_id = (*cur_qp)->wrid[wqe_index];
	}

	if (wq) {
		if (wq->last_comp < wqe_index)
			wq->tail += wqe_index - wq->last_comp;
		else
			wq->tail += wqe_index + wq->max - wq->last_comp;

		wq->last_comp = wqe_index;
	}

	if (is_error) {
		handle_error_cqe(dev, cq, *cur_qp, wqe_index, is_send,
				 (struct mthca_err_cqe *) cqe,
				 entry, &free_cqe);
		goto out;
	}

	if (is_send) {
		entry->wc_flags = 0;
		switch (cqe->opcode) {
		case MTHCA_OPCODE_RDMA_WRITE:
			entry->opcode    = IB_WC_RDMA_WRITE;
			break;
		case MTHCA_OPCODE_RDMA_WRITE_IMM:
			entry->opcode    = IB_WC_RDMA_WRITE;
			entry->wc_flags |= IB_WC_WITH_IMM;
			break;
		case MTHCA_OPCODE_SEND:
			entry->opcode    = IB_WC_SEND;
			break;
		case MTHCA_OPCODE_SEND_IMM:
			entry->opcode    = IB_WC_SEND;
			entry->wc_flags |= IB_WC_WITH_IMM;
			break;
		case MTHCA_OPCODE_RDMA_READ:
			entry->opcode    = IB_WC_RDMA_READ;
			entry->byte_len  = be32_to_cpu(cqe->byte_cnt);
			break;
		case MTHCA_OPCODE_ATOMIC_CS:
			entry->opcode    = IB_WC_COMP_SWAP;
			entry->byte_len  = MTHCA_ATOMIC_BYTE_LEN;
			break;
		case MTHCA_OPCODE_ATOMIC_FA:
			entry->opcode    = IB_WC_FETCH_ADD;
			entry->byte_len  = MTHCA_ATOMIC_BYTE_LEN;
			break;
		default:
			entry->opcode = 0xFF;
			break;
		}
	} else {
		entry->byte_len = be32_to_cpu(cqe->byte_cnt);
		switch (cqe->opcode & 0x1f) {
		case IB_OPCODE_SEND_LAST_WITH_IMMEDIATE:
		case IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE:
			entry->wc_flags = IB_WC_WITH_IMM;
			entry->ex.imm_data = cqe->imm_etype_pkey_eec;
			entry->opcode = IB_WC_RECV;
			break;
		case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:
		case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
			entry->wc_flags = IB_WC_WITH_IMM;
			entry->ex.imm_data = cqe->imm_etype_pkey_eec;
			entry->opcode = IB_WC_RECV_RDMA_WITH_IMM;
			break;
		default:
			entry->wc_flags = 0;
			entry->opcode = IB_WC_RECV;
			break;
		}
		entry->slid 	   = be16_to_cpu(cqe->rlid);
		entry->sl   	   = cqe->sl_ipok >> 4;
		entry->src_qp 	   = be32_to_cpu(cqe->rqpn) & 0xffffff;
		entry->dlid_path_bits = cqe->g_mlpath & 0x7f;
		entry->pkey_index  = be32_to_cpu(cqe->imm_etype_pkey_eec) >> 16;
		entry->wc_flags   |= cqe->g_mlpath & 0x80 ? IB_WC_GRH : 0;
		checksum = (be32_to_cpu(cqe->rqpn) >> 24) |
				((be32_to_cpu(cqe->my_ee) >> 16) & 0xff00);
		entry->wc_flags	  |= (cqe->sl_ipok & 1 && checksum == 0xffff) ?
							IB_WC_IP_CSUM_OK : 0;
	}

	entry->status = IB_WC_SUCCESS;

 out:
	if (likely(free_cqe)) {
		set_cqe_hw(cqe);
		++(*freed);
		++cq->cons_index;
	}

	return err;
}

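/*
 * mthca's ib_poll_cq() method: drain up to num_entries completions into
 * the caller's ib_wc array under cq->lock.  A consumer would typically
 * poll in a loop until the CQ is empty, e.g.
 *
 *	while ((n = ib_poll_cq(ibcq, ARRAY_SIZE(wc), wc)) > 0)
 *		handle_completions(wc, n);
 *
 * where handle_completions() stands in for the caller's own processing.
 */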
int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
		  struct ib_wc *entry)
{
	struct mthca_dev *dev = to_mdev(ibcq->device);
	struct mthca_cq *cq = to_mcq(ibcq);
	struct mthca_qp *qp = NULL;
	unsigned long flags;
	int err = 0;
	int freed = 0;
	int npolled;

	spin_lock_irqsave(&cq->lock, flags);

	npolled = 0;
repoll:
	while (npolled < num_entries) {
		err = mthca_poll_one(dev, cq, &qp,
				     &freed, entry + npolled);
		if (err)
			break;
		++npolled;
	}

	if (freed) {
		wmb();
		update_cons_index(dev, cq, freed);
	}

	/*
	 * If a CQ resize is in progress and we discovered that the
	 * old buffer is empty, then peek in the new buffer, and if
	 * it's not empty, switch to the new buffer and continue
	 * polling there.
	 */
	if (unlikely(err == -EAGAIN && cq->resize_buf &&
		     cq->resize_buf->state == CQ_RESIZE_READY)) {
		/*
		 * In Tavor mode, the hardware keeps the producer
		 * index modulo the CQ size.  Since we might be making
		 * the CQ bigger, we need to mask our consumer index
		 * using the size of the old CQ buffer before looking
		 * in the new CQ buffer.
		 */
		if (!mthca_is_memfree(dev))
			cq->cons_index &= cq->ibcq.cqe;

		if (cqe_sw(get_cqe_from_buf(&cq->resize_buf->buf,
					    cq->cons_index & cq->resize_buf->cqe))) {
			struct mthca_cq_buf tbuf;
			int tcqe;

			tbuf         = cq->buf;
			tcqe         = cq->ibcq.cqe;
			cq->buf      = cq->resize_buf->buf;
			cq->ibcq.cqe = cq->resize_buf->cqe;

			cq->resize_buf->buf   = tbuf;
			cq->resize_buf->cqe   = tcqe;
			cq->resize_buf->state = CQ_RESIZE_SWAPPED;

			goto repoll;
		}
	}

	spin_unlock_irqrestore(&cq->lock, flags);

	return err == 0 || err == -EAGAIN ? npolled : err;
}

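/*
 * Arm the CQ to generate a completion event: with IB_CQ_SOLICITED only
 * the next solicited completion raises an event, with IB_CQ_NEXT_COMP
 * any next completion does.  Tavor needs only a doorbell write; Arbel
 * also keeps an arm doorbell record in host memory.
 */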
int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags)
{
	u32 dbhi = ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
		    MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL :
		    MTHCA_TAVOR_CQ_DB_REQ_NOT) |
		to_mcq(cq)->cqn;

	mthca_write64(dbhi, 0xffffffff, to_mdev(cq->device)->kar + MTHCA_CQ_DOORBELL,
		      MTHCA_GET_DOORBELL_LOCK(&to_mdev(cq->device)->doorbell_lock));

	return 0;
}

int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
{
	struct mthca_cq *cq = to_mcq(ibcq);
	__be32 db_rec[2];
	u32 dbhi;
	u32 sn = cq->arm_sn & 3;

	db_rec[0] = cpu_to_be32(cq->cons_index);
	db_rec[1] = cpu_to_be32((cq->cqn << 8) | (2 << 5) | (sn << 3) |
				((flags & IB_CQ_SOLICITED_MASK) ==
				 IB_CQ_SOLICITED ? 1 : 2));

	mthca_write_db_rec(db_rec, cq->arm_db);

	/*
	 * Make sure that the doorbell record in host memory is
	 * written before ringing the doorbell via PCI MMIO.
	 */
	wmb();

	dbhi = (sn << 28) |
		((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
		 MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL :
		 MTHCA_ARBEL_CQ_DB_REQ_NOT) | cq->cqn;

	mthca_write64(dbhi, cq->cons_index,
		      to_mdev(ibcq->device)->kar + MTHCA_CQ_DOORBELL,
		      MTHCA_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->doorbell_lock));

	return 0;
}

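/*
 * Create a CQ with nent entries.  nent must be a power of two: the
 * entry count is written to the CQ context as a log2 (ffs(nent) - 1),
 * and nent - 1 is used as a mask when indexing the CQ buffer.
 */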
int mthca_init_cq(struct mthca_dev *dev, int nent,
		  struct mthca_ucontext *ctx, u32 pdn,
		  struct mthca_cq *cq)
{
	struct mthca_mailbox *mailbox;
	struct mthca_cq_context *cq_context;
	int err = -ENOMEM;

	cq->ibcq.cqe  = nent - 1;
	cq->is_kernel = !ctx;

	cq->cqn = mthca_alloc(&dev->cq_table.alloc);
	if (cq->cqn == -1)
		return -ENOMEM;

	if (mthca_is_memfree(dev)) {
		err = mthca_table_get(dev, dev->cq_table.table, cq->cqn);
		if (err)
			goto err_out;

		if (cq->is_kernel) {
			cq->arm_sn = 1;

			err = -ENOMEM;

			cq->set_ci_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_SET_CI,
							     cq->cqn, &cq->set_ci_db);
			if (cq->set_ci_db_index < 0)
				goto err_out_icm;

			cq->arm_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_ARM,
							  cq->cqn, &cq->arm_db);
			if (cq->arm_db_index < 0)
				goto err_out_ci;
		}
	}

	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
	if (IS_ERR(mailbox)) {
		err = PTR_ERR(mailbox);
		goto err_out_arm;
	}

	cq_context = mailbox->buf;

	if (cq->is_kernel) {
		err = mthca_alloc_cq_buf(dev, &cq->buf, nent);
		if (err)
			goto err_out_mailbox;
	}

	spin_lock_init(&cq->lock);
	cq->refcount = 1;
	init_waitqueue_head(&cq->wait);
	mutex_init(&cq->mutex);

	memset(cq_context, 0, sizeof *cq_context);
	cq_context->flags           = cpu_to_be32(MTHCA_CQ_STATUS_OK      |
						  MTHCA_CQ_STATE_DISARMED |
						  MTHCA_CQ_FLAG_TR);
	cq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24);
	if (ctx)
		cq_context->logsize_usrpage |= cpu_to_be32(ctx->uar.index);
	else
		cq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index);
	cq_context->error_eqn       = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn);
	cq_context->comp_eqn        = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_COMP].eqn);
	cq_context->pd              = cpu_to_be32(pdn);
	cq_context->lkey            = cpu_to_be32(cq->buf.mr.ibmr.lkey);
	cq_context->cqn             = cpu_to_be32(cq->cqn);

	if (mthca_is_memfree(dev)) {
		cq_context->ci_db    = cpu_to_be32(cq->set_ci_db_index);
		cq_context->state_db = cpu_to_be32(cq->arm_db_index);
	}

	err = mthca_SW2HW_CQ(dev, mailbox, cq->cqn);
	if (err) {
		mthca_warn(dev, "SW2HW_CQ failed (%d)\n", err);
		goto err_out_free_mr;
	}

	spin_lock_irq(&dev->cq_table.lock);
	err = mthca_array_set(&dev->cq_table.cq,
			      cq->cqn & (dev->limits.num_cqs - 1), cq);
	if (err) {
		spin_unlock_irq(&dev->cq_table.lock);
		goto err_out_free_mr;
	}
	spin_unlock_irq(&dev->cq_table.lock);

	cq->cons_index = 0;

	mthca_free_mailbox(dev, mailbox);

	return 0;

err_out_free_mr:
	if (cq->is_kernel)
		mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);

err_out_mailbox:
	mthca_free_mailbox(dev, mailbox);

err_out_arm:
	if (cq->is_kernel && mthca_is_memfree(dev))
		mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index);

err_out_ci:
	if (cq->is_kernel && mthca_is_memfree(dev))
		mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);

err_out_icm:
	mthca_table_put(dev, dev->cq_table.table, cq->cqn);

err_out:
	mthca_free(&dev->cq_table.alloc, cq->cqn);

	return err;
}

static inline int get_cq_refcount(struct mthca_dev *dev, struct mthca_cq *cq)
{
	int c;

	spin_lock_irq(&dev->cq_table.lock);
	c = cq->refcount;
	spin_unlock_irq(&dev->cq_table.lock);

	return c;
}

void mthca_free_cq(struct mthca_dev *dev,
		   struct mthca_cq *cq)
{
	struct mthca_mailbox *mailbox;
	int err;

	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
	if (IS_ERR(mailbox)) {
		mthca_warn(dev, "No memory for mailbox to free CQ.\n");
		return;
	}

	err = mthca_HW2SW_CQ(dev, mailbox, cq->cqn);
	if (err)
		mthca_warn(dev, "HW2SW_CQ failed (%d)\n", err);

	if (0) {
		__be32 *ctx = mailbox->buf;
		int j;

		printk(KERN_ERR "context for CQN %x (cons index %x, next sw %d)\n",
		       cq->cqn, cq->cons_index,
		       cq->is_kernel ? !!next_cqe_sw(cq) : 0);
		for (j = 0; j < 16; ++j)
			printk(KERN_ERR "[%2x] %08x\n", j * 4, be32_to_cpu(ctx[j]));
	}

	spin_lock_irq(&dev->cq_table.lock);
	mthca_array_clear(&dev->cq_table.cq,
			  cq->cqn & (dev->limits.num_cqs - 1));
	--cq->refcount;
	spin_unlock_irq(&dev->cq_table.lock);

	if (dev->mthca_flags & MTHCA_FLAG_MSI_X)
		synchronize_irq(dev->eq_table.eq[MTHCA_EQ_COMP].msi_x_vector);
	else
		synchronize_irq(dev->pdev->irq);

	wait_event(cq->wait, !get_cq_refcount(dev, cq));

	if (cq->is_kernel) {
		mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
		if (mthca_is_memfree(dev)) {
			mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM,    cq->arm_db_index);
			mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
		}
	}

	mthca_table_put(dev, dev->cq_table.table, cq->cqn);
	mthca_free(&dev->cq_table.alloc, cq->cqn);
	mthca_free_mailbox(dev, mailbox);
}

int mthca_init_cq_table(struct mthca_dev *dev)
{
	int err;

	spin_lock_init(&dev->cq_table.lock);

	err = mthca_alloc_init(&dev->cq_table.alloc,
			       dev->limits.num_cqs,
			       (1 << 24) - 1,
			       dev->limits.reserved_cqs);
	if (err)
		return err;

	err = mthca_array_init(&dev->cq_table.cq,
			       dev->limits.num_cqs);
	if (err)
		mthca_alloc_cleanup(&dev->cq_table.alloc);

	return err;
}

void mthca_cleanup_cq_table(struct mthca_dev *dev)
{
	mthca_array_cleanup(&dev->cq_table.cq, dev->limits.num_cqs);
	mthca_alloc_cleanup(&dev->cq_table.alloc);
}