rc.c (91600B)
// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
/*
 * Copyright(c) 2015 - 2018 Intel Corporation.
 */

#include <linux/io.h>
#include <rdma/rdma_vt.h>
#include <rdma/rdmavt_qp.h>

#include "hfi.h"
#include "qp.h"
#include "rc.h"
#include "verbs_txreq.h"
#include "trace.h"

struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
				      u8 *prev_ack, bool *scheduled)
	__must_hold(&qp->s_lock)
{
	struct rvt_ack_entry *e = NULL;
	u8 i, p;
	bool s = true;

	for (i = qp->r_head_ack_queue; ; i = p) {
		if (i == qp->s_tail_ack_queue)
			s = false;
		if (i)
			p = i - 1;
		else
			p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
		if (p == qp->r_head_ack_queue) {
			e = NULL;
			break;
		}
		e = &qp->s_ack_queue[p];
		if (!e->opcode) {
			e = NULL;
			break;
		}
		if (cmp_psn(psn, e->psn) >= 0) {
			if (p == qp->s_tail_ack_queue &&
			    cmp_psn(psn, e->lpsn) <= 0)
				s = false;
			break;
		}
	}
	if (prev)
		*prev = p;
	if (prev_ack)
		*prev_ack = i;
	if (scheduled)
		*scheduled = s;
	return e;
}

/**
 * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
 * @dev: the device for this QP
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @ps: the xmit packet state
 *
 * Return 1 if constructed; otherwise, return 0.
 * Note that we are in the responder's side of the QP context.
 * Note the QP s_lock must be held.
 */
static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
		       struct ib_other_headers *ohdr,
		       struct hfi1_pkt_state *ps)
{
	struct rvt_ack_entry *e;
	u32 hwords, hdrlen;
	u32 len = 0;
	u32 bth0 = 0, bth2 = 0;
	u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
	int middle = 0;
	u32 pmtu = qp->pmtu;
	struct hfi1_qp_priv *qpriv = qp->priv;
	bool last_pkt;
	u32 delta;
	u8 next = qp->s_tail_ack_queue;
	struct tid_rdma_request *req;

	trace_hfi1_rsp_make_rc_ack(qp, 0);
	lockdep_assert_held(&qp->s_lock);
	/* Don't send an ACK if we aren't supposed to. */
	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
		goto bail;

	if (qpriv->hdr_type == HFI1_PKT_TYPE_9B)
		/* header size in 32-bit words LRH+BTH = (8+12)/4. */
		hwords = 5;
	else
		/* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */
		hwords = 7;

	switch (qp->s_ack_state) {
	case OP(RDMA_READ_RESPONSE_LAST):
	case OP(RDMA_READ_RESPONSE_ONLY):
		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		release_rdma_sge_mr(e);
		fallthrough;
	case OP(ATOMIC_ACKNOWLEDGE):
		/*
		 * We can increment the tail pointer now that the last
		 * response has been sent instead of only being
		 * constructed.
		 */
		if (++next > rvt_size_atomic(&dev->rdi))
			next = 0;
		/*
		 * Only advance the s_acked_ack_queue pointer if there
		 * have been no TID RDMA requests.
		 */
		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->opcode != TID_OP(WRITE_REQ) &&
		    qp->s_acked_ack_queue == qp->s_tail_ack_queue)
			qp->s_acked_ack_queue = next;
		qp->s_tail_ack_queue = next;
		trace_hfi1_rsp_make_rc_ack(qp, e->psn);
		fallthrough;
	case OP(SEND_ONLY):
	case OP(ACKNOWLEDGE):
		/* Check for no next entry in the queue. */
		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
			if (qp->s_flags & RVT_S_ACK_PENDING)
				goto normal;
			goto bail;
		}

		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		/* Check for tid write fence */
		if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) ||
		    hfi1_tid_rdma_ack_interlock(qp, e)) {
			iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB);
			goto bail;
		}
		if (e->opcode == OP(RDMA_READ_REQUEST)) {
			/*
			 * If a RDMA read response is being resent and
			 * we haven't seen the duplicate request yet,
			 * then stop sending the remaining responses the
			 * responder has seen until the requester re-sends it.
			 */
			len = e->rdma_sge.sge_length;
			if (len && !e->rdma_sge.mr) {
				if (qp->s_acked_ack_queue ==
				    qp->s_tail_ack_queue)
					qp->s_acked_ack_queue =
						qp->r_head_ack_queue;
				qp->s_tail_ack_queue = qp->r_head_ack_queue;
				goto bail;
			}
			/* Copy SGE state in case we need to resend */
			ps->s_txreq->mr = e->rdma_sge.mr;
			if (ps->s_txreq->mr)
				rvt_get_mr(ps->s_txreq->mr);
			qp->s_ack_rdma_sge.sge = e->rdma_sge;
			qp->s_ack_rdma_sge.num_sge = 1;
			ps->s_txreq->ss = &qp->s_ack_rdma_sge;
			if (len > pmtu) {
				len = pmtu;
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
			} else {
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
				e->sent = 1;
			}
			ohdr->u.aeth = rvt_compute_aeth(qp);
			hwords++;
			qp->s_ack_rdma_psn = e->psn;
			bth2 = mask_psn(qp->s_ack_rdma_psn++);
		} else if (e->opcode == TID_OP(WRITE_REQ)) {
			/*
			 * If a TID RDMA WRITE RESP is being resent, we have to
			 * wait for the actual request. All requests that are to
			 * be resent will have their state set to
			 * TID_REQUEST_RESEND. When the new request arrives, the
			 * state will be changed to TID_REQUEST_RESEND_ACTIVE.
			 */
			req = ack_to_tid_req(e);
			if (req->state == TID_REQUEST_RESEND ||
			    req->state == TID_REQUEST_INIT_RESEND)
				goto bail;
			qp->s_ack_state = TID_OP(WRITE_RESP);
			qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg);
			goto write_resp;
		} else if (e->opcode == TID_OP(READ_REQ)) {
			/*
			 * If a TID RDMA read response is being resent and
			 * we haven't seen the duplicate request yet,
			 * then stop sending the remaining responses the
			 * responder has seen until the requester re-sends it.
193 */ 194 len = e->rdma_sge.sge_length; 195 if (len && !e->rdma_sge.mr) { 196 if (qp->s_acked_ack_queue == 197 qp->s_tail_ack_queue) 198 qp->s_acked_ack_queue = 199 qp->r_head_ack_queue; 200 qp->s_tail_ack_queue = qp->r_head_ack_queue; 201 goto bail; 202 } 203 /* Copy SGE state in case we need to resend */ 204 ps->s_txreq->mr = e->rdma_sge.mr; 205 if (ps->s_txreq->mr) 206 rvt_get_mr(ps->s_txreq->mr); 207 qp->s_ack_rdma_sge.sge = e->rdma_sge; 208 qp->s_ack_rdma_sge.num_sge = 1; 209 qp->s_ack_state = TID_OP(READ_RESP); 210 goto read_resp; 211 } else { 212 /* COMPARE_SWAP or FETCH_ADD */ 213 ps->s_txreq->ss = NULL; 214 len = 0; 215 qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); 216 ohdr->u.at.aeth = rvt_compute_aeth(qp); 217 ib_u64_put(e->atomic_data, &ohdr->u.at.atomic_ack_eth); 218 hwords += sizeof(ohdr->u.at) / sizeof(u32); 219 bth2 = mask_psn(e->psn); 220 e->sent = 1; 221 } 222 trace_hfi1_tid_write_rsp_make_rc_ack(qp); 223 bth0 = qp->s_ack_state << 24; 224 break; 225 226 case OP(RDMA_READ_RESPONSE_FIRST): 227 qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); 228 fallthrough; 229 case OP(RDMA_READ_RESPONSE_MIDDLE): 230 ps->s_txreq->ss = &qp->s_ack_rdma_sge; 231 ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr; 232 if (ps->s_txreq->mr) 233 rvt_get_mr(ps->s_txreq->mr); 234 len = qp->s_ack_rdma_sge.sge.sge_length; 235 if (len > pmtu) { 236 len = pmtu; 237 middle = HFI1_CAP_IS_KSET(SDMA_AHG); 238 } else { 239 ohdr->u.aeth = rvt_compute_aeth(qp); 240 hwords++; 241 qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); 242 e = &qp->s_ack_queue[qp->s_tail_ack_queue]; 243 e->sent = 1; 244 } 245 bth0 = qp->s_ack_state << 24; 246 bth2 = mask_psn(qp->s_ack_rdma_psn++); 247 break; 248 249 case TID_OP(WRITE_RESP): 250write_resp: 251 /* 252 * 1. Check if RVT_S_ACK_PENDING is set. If yes, 253 * goto normal. 254 * 2. Attempt to allocate TID resources. 255 * 3. Remove RVT_S_RESP_PENDING flags from s_flags 256 * 4. If resources not available: 257 * 4.1 Set RVT_S_WAIT_TID_SPACE 258 * 4.2 Queue QP on RCD TID queue 259 * 4.3 Put QP on iowait list. 260 * 4.4 Build IB RNR NAK with appropriate timeout value 261 * 4.5 Return indication progress made. 262 * 5. If resources are available: 263 * 5.1 Program HW flow CSRs 264 * 5.2 Build TID RDMA WRITE RESP packet 265 * 5.3 If more resources needed, do 2.1 - 2.3. 266 * 5.4 Wake up next QP on RCD TID queue. 267 * 5.5 Return indication progress made. 268 */ 269 270 e = &qp->s_ack_queue[qp->s_tail_ack_queue]; 271 req = ack_to_tid_req(e); 272 273 /* 274 * Send scheduled RNR NAK's. RNR NAK's need to be sent at 275 * segment boundaries, not at request boundaries. 
Don't change 276 * s_ack_state because we are still in the middle of a request 277 */ 278 if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND && 279 qp->s_tail_ack_queue == qpriv->r_tid_alloc && 280 req->cur_seg == req->alloc_seg) { 281 qpriv->rnr_nak_state = TID_RNR_NAK_SENT; 282 goto normal_no_state; 283 } 284 285 bth2 = mask_psn(qp->s_ack_rdma_psn); 286 hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1, 287 bth2, &len, 288 &ps->s_txreq->ss); 289 if (!hdrlen) 290 return 0; 291 292 hwords += hdrlen; 293 bth0 = qp->s_ack_state << 24; 294 qp->s_ack_rdma_psn++; 295 trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn, 296 e->lpsn, req); 297 if (req->cur_seg != req->total_segs) 298 break; 299 300 e->sent = 1; 301 /* Do not free e->rdma_sge until all data are received */ 302 qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); 303 break; 304 305 case TID_OP(READ_RESP): 306read_resp: 307 e = &qp->s_ack_queue[qp->s_tail_ack_queue]; 308 ps->s_txreq->ss = &qp->s_ack_rdma_sge; 309 delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0, 310 &bth1, &bth2, &len, 311 &last_pkt); 312 if (delta == 0) 313 goto error_qp; 314 hwords += delta; 315 if (last_pkt) { 316 e->sent = 1; 317 /* 318 * Increment qp->s_tail_ack_queue through s_ack_state 319 * transition. 320 */ 321 qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); 322 } 323 break; 324 case TID_OP(READ_REQ): 325 goto bail; 326 327 default: 328normal: 329 /* 330 * Send a regular ACK. 331 * Set the s_ack_state so we wait until after sending 332 * the ACK before setting s_ack_state to ACKNOWLEDGE 333 * (see above). 334 */ 335 qp->s_ack_state = OP(SEND_ONLY); 336normal_no_state: 337 if (qp->s_nak_state) 338 ohdr->u.aeth = 339 cpu_to_be32((qp->r_msn & IB_MSN_MASK) | 340 (qp->s_nak_state << 341 IB_AETH_CREDIT_SHIFT)); 342 else 343 ohdr->u.aeth = rvt_compute_aeth(qp); 344 hwords++; 345 len = 0; 346 bth0 = OP(ACKNOWLEDGE) << 24; 347 bth2 = mask_psn(qp->s_ack_psn); 348 qp->s_flags &= ~RVT_S_ACK_PENDING; 349 ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP; 350 ps->s_txreq->ss = NULL; 351 } 352 qp->s_rdma_ack_cnt++; 353 ps->s_txreq->sde = qpriv->s_sde; 354 ps->s_txreq->s_cur_size = len; 355 ps->s_txreq->hdr_dwords = hwords; 356 hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps); 357 return 1; 358error_qp: 359 spin_unlock_irqrestore(&qp->s_lock, ps->flags); 360 spin_lock_irqsave(&qp->r_lock, ps->flags); 361 spin_lock(&qp->s_lock); 362 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); 363 spin_unlock(&qp->s_lock); 364 spin_unlock_irqrestore(&qp->r_lock, ps->flags); 365 spin_lock_irqsave(&qp->s_lock, ps->flags); 366bail: 367 qp->s_ack_state = OP(ACKNOWLEDGE); 368 /* 369 * Ensure s_rdma_ack_cnt changes are committed prior to resetting 370 * RVT_S_RESP_PENDING 371 */ 372 smp_wmb(); 373 qp->s_flags &= ~(RVT_S_RESP_PENDING 374 | RVT_S_ACK_PENDING 375 | HFI1_S_AHG_VALID); 376 return 0; 377} 378 379/** 380 * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) 381 * @qp: a pointer to the QP 382 * @ps: the current packet state 383 * 384 * Assumes s_lock is held. 385 * 386 * Return 1 if constructed; otherwise, return 0. 387 */ 388int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) 389{ 390 struct hfi1_qp_priv *priv = qp->priv; 391 struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); 392 struct ib_other_headers *ohdr; 393 struct rvt_sge_state *ss = NULL; 394 struct rvt_swqe *wqe; 395 struct hfi1_swqe_priv *wpriv; 396 struct tid_rdma_request *req = NULL; 397 /* header size in 32-bit words LRH+BTH = (8+12)/4. 
*/ 398 u32 hwords = 5; 399 u32 len = 0; 400 u32 bth0 = 0, bth2 = 0; 401 u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); 402 u32 pmtu = qp->pmtu; 403 char newreq; 404 int middle = 0; 405 int delta; 406 struct tid_rdma_flow *flow = NULL; 407 struct tid_rdma_params *remote; 408 409 trace_hfi1_sender_make_rc_req(qp); 410 lockdep_assert_held(&qp->s_lock); 411 ps->s_txreq = get_txreq(ps->dev, qp); 412 if (!ps->s_txreq) 413 goto bail_no_tx; 414 415 if (priv->hdr_type == HFI1_PKT_TYPE_9B) { 416 /* header size in 32-bit words LRH+BTH = (8+12)/4. */ 417 hwords = 5; 418 if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) 419 ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth; 420 else 421 ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; 422 } else { 423 /* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */ 424 hwords = 7; 425 if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) && 426 (hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr)))) 427 ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth; 428 else 429 ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth; 430 } 431 432 /* Sending responses has higher priority over sending requests. */ 433 if ((qp->s_flags & RVT_S_RESP_PENDING) && 434 make_rc_ack(dev, qp, ohdr, ps)) 435 return 1; 436 437 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { 438 if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) 439 goto bail; 440 /* We are in the error state, flush the work request. */ 441 if (qp->s_last == READ_ONCE(qp->s_head)) 442 goto bail; 443 /* If DMAs are in progress, we can't flush immediately. */ 444 if (iowait_sdma_pending(&priv->s_iowait)) { 445 qp->s_flags |= RVT_S_WAIT_DMA; 446 goto bail; 447 } 448 clear_ahg(qp); 449 wqe = rvt_get_swqe_ptr(qp, qp->s_last); 450 hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ? 451 IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); 452 /* will get called again */ 453 goto done_free_tx; 454 } 455 456 if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT)) 457 goto bail; 458 459 if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) { 460 if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) { 461 qp->s_flags |= RVT_S_WAIT_PSN; 462 goto bail; 463 } 464 qp->s_sending_psn = qp->s_psn; 465 qp->s_sending_hpsn = qp->s_psn - 1; 466 } 467 468 /* Send a request. */ 469 wqe = rvt_get_swqe_ptr(qp, qp->s_cur); 470check_s_state: 471 switch (qp->s_state) { 472 default: 473 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) 474 goto bail; 475 /* 476 * Resend an old request or start a new one. 477 * 478 * We keep track of the current SWQE so that 479 * we don't reset the "furthest progress" state 480 * if we need to back up. 481 */ 482 newreq = 0; 483 if (qp->s_cur == qp->s_tail) { 484 /* Check if send work queue is empty. */ 485 if (qp->s_tail == READ_ONCE(qp->s_head)) { 486 clear_ahg(qp); 487 goto bail; 488 } 489 /* 490 * If a fence is requested, wait for previous 491 * RDMA read and atomic operations to finish. 492 * However, there is no need to guard against 493 * TID RDMA READ after TID RDMA READ. 
494 */ 495 if ((wqe->wr.send_flags & IB_SEND_FENCE) && 496 qp->s_num_rd_atomic && 497 (wqe->wr.opcode != IB_WR_TID_RDMA_READ || 498 priv->pending_tid_r_segs < qp->s_num_rd_atomic)) { 499 qp->s_flags |= RVT_S_WAIT_FENCE; 500 goto bail; 501 } 502 /* 503 * Local operations are processed immediately 504 * after all prior requests have completed 505 */ 506 if (wqe->wr.opcode == IB_WR_REG_MR || 507 wqe->wr.opcode == IB_WR_LOCAL_INV) { 508 int local_ops = 0; 509 int err = 0; 510 511 if (qp->s_last != qp->s_cur) 512 goto bail; 513 if (++qp->s_cur == qp->s_size) 514 qp->s_cur = 0; 515 if (++qp->s_tail == qp->s_size) 516 qp->s_tail = 0; 517 if (!(wqe->wr.send_flags & 518 RVT_SEND_COMPLETION_ONLY)) { 519 err = rvt_invalidate_rkey( 520 qp, 521 wqe->wr.ex.invalidate_rkey); 522 local_ops = 1; 523 } 524 rvt_send_complete(qp, wqe, 525 err ? IB_WC_LOC_PROT_ERR 526 : IB_WC_SUCCESS); 527 if (local_ops) 528 atomic_dec(&qp->local_ops_pending); 529 goto done_free_tx; 530 } 531 532 newreq = 1; 533 qp->s_psn = wqe->psn; 534 } 535 /* 536 * Note that we have to be careful not to modify the 537 * original work request since we may need to resend 538 * it. 539 */ 540 len = wqe->length; 541 ss = &qp->s_sge; 542 bth2 = mask_psn(qp->s_psn); 543 544 /* 545 * Interlock between various IB requests and TID RDMA 546 * if necessary. 547 */ 548 if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) || 549 hfi1_tid_rdma_wqe_interlock(qp, wqe)) 550 goto bail; 551 552 switch (wqe->wr.opcode) { 553 case IB_WR_SEND: 554 case IB_WR_SEND_WITH_IMM: 555 case IB_WR_SEND_WITH_INV: 556 /* If no credit, return. */ 557 if (!rvt_rc_credit_avail(qp, wqe)) 558 goto bail; 559 if (len > pmtu) { 560 qp->s_state = OP(SEND_FIRST); 561 len = pmtu; 562 break; 563 } 564 if (wqe->wr.opcode == IB_WR_SEND) { 565 qp->s_state = OP(SEND_ONLY); 566 } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { 567 qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); 568 /* Immediate data comes after the BTH */ 569 ohdr->u.imm_data = wqe->wr.ex.imm_data; 570 hwords += 1; 571 } else { 572 qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE); 573 /* Invalidate rkey comes after the BTH */ 574 ohdr->u.ieth = cpu_to_be32( 575 wqe->wr.ex.invalidate_rkey); 576 hwords += 1; 577 } 578 if (wqe->wr.send_flags & IB_SEND_SOLICITED) 579 bth0 |= IB_BTH_SOLICITED; 580 bth2 |= IB_BTH_REQ_ACK; 581 if (++qp->s_cur == qp->s_size) 582 qp->s_cur = 0; 583 break; 584 585 case IB_WR_RDMA_WRITE: 586 if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) 587 qp->s_lsn++; 588 goto no_flow_control; 589 case IB_WR_RDMA_WRITE_WITH_IMM: 590 /* If no credit, return. */ 591 if (!rvt_rc_credit_avail(qp, wqe)) 592 goto bail; 593no_flow_control: 594 put_ib_reth_vaddr( 595 wqe->rdma_wr.remote_addr, 596 &ohdr->u.rc.reth); 597 ohdr->u.rc.reth.rkey = 598 cpu_to_be32(wqe->rdma_wr.rkey); 599 ohdr->u.rc.reth.length = cpu_to_be32(len); 600 hwords += sizeof(struct ib_reth) / sizeof(u32); 601 if (len > pmtu) { 602 qp->s_state = OP(RDMA_WRITE_FIRST); 603 len = pmtu; 604 break; 605 } 606 if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { 607 qp->s_state = OP(RDMA_WRITE_ONLY); 608 } else { 609 qp->s_state = 610 OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); 611 /* Immediate data comes after RETH */ 612 ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; 613 hwords += 1; 614 if (wqe->wr.send_flags & IB_SEND_SOLICITED) 615 bth0 |= IB_BTH_SOLICITED; 616 } 617 bth2 |= IB_BTH_REQ_ACK; 618 if (++qp->s_cur == qp->s_size) 619 qp->s_cur = 0; 620 break; 621 622 case IB_WR_TID_RDMA_WRITE: 623 if (newreq) { 624 /* 625 * Limit the number of TID RDMA WRITE requests. 
626 */ 627 if (atomic_read(&priv->n_tid_requests) >= 628 HFI1_TID_RDMA_WRITE_CNT) 629 goto bail; 630 631 if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) 632 qp->s_lsn++; 633 } 634 635 hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, 636 &bth1, &bth2, 637 &len); 638 ss = NULL; 639 if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) { 640 priv->s_tid_cur = qp->s_cur; 641 if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) { 642 priv->s_tid_tail = qp->s_cur; 643 priv->s_state = TID_OP(WRITE_RESP); 644 } 645 } else if (priv->s_tid_cur == priv->s_tid_head) { 646 struct rvt_swqe *__w; 647 struct tid_rdma_request *__r; 648 649 __w = rvt_get_swqe_ptr(qp, priv->s_tid_cur); 650 __r = wqe_to_tid_req(__w); 651 652 /* 653 * The s_tid_cur pointer is advanced to s_cur if 654 * any of the following conditions about the WQE 655 * to which s_ti_cur currently points to are 656 * satisfied: 657 * 1. The request is not a TID RDMA WRITE 658 * request, 659 * 2. The request is in the INACTIVE or 660 * COMPLETE states (TID RDMA READ requests 661 * stay at INACTIVE and TID RDMA WRITE 662 * transition to COMPLETE when done), 663 * 3. The request is in the ACTIVE or SYNC 664 * state and the number of completed 665 * segments is equal to the total segment 666 * count. 667 * (If ACTIVE, the request is waiting for 668 * ACKs. If SYNC, the request has not 669 * received any responses because it's 670 * waiting on a sync point.) 671 */ 672 if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE || 673 __r->state == TID_REQUEST_INACTIVE || 674 __r->state == TID_REQUEST_COMPLETE || 675 ((__r->state == TID_REQUEST_ACTIVE || 676 __r->state == TID_REQUEST_SYNC) && 677 __r->comp_seg == __r->total_segs)) { 678 if (priv->s_tid_tail == 679 priv->s_tid_cur && 680 priv->s_state == 681 TID_OP(WRITE_DATA_LAST)) { 682 priv->s_tid_tail = qp->s_cur; 683 priv->s_state = 684 TID_OP(WRITE_RESP); 685 } 686 priv->s_tid_cur = qp->s_cur; 687 } 688 /* 689 * A corner case: when the last TID RDMA WRITE 690 * request was completed, s_tid_head, 691 * s_tid_cur, and s_tid_tail all point to the 692 * same location. Other requests are posted and 693 * s_cur wraps around to the same location, 694 * where a new TID RDMA WRITE is posted. In 695 * this case, none of the indices need to be 696 * updated. However, the priv->s_state should. 697 */ 698 if (priv->s_tid_tail == qp->s_cur && 699 priv->s_state == TID_OP(WRITE_DATA_LAST)) 700 priv->s_state = TID_OP(WRITE_RESP); 701 } 702 req = wqe_to_tid_req(wqe); 703 if (newreq) { 704 priv->s_tid_head = qp->s_cur; 705 priv->pending_tid_w_resp += req->total_segs; 706 atomic_inc(&priv->n_tid_requests); 707 atomic_dec(&priv->n_requests); 708 } else { 709 req->state = TID_REQUEST_RESEND; 710 req->comp_seg = delta_psn(bth2, wqe->psn); 711 /* 712 * Pull back any segments since we are going 713 * to re-receive them. 714 */ 715 req->setup_head = req->clear_tail; 716 priv->pending_tid_w_resp += 717 delta_psn(wqe->lpsn, bth2) + 1; 718 } 719 720 trace_hfi1_tid_write_sender_make_req(qp, newreq); 721 trace_hfi1_tid_req_make_req_write(qp, newreq, 722 wqe->wr.opcode, 723 wqe->psn, wqe->lpsn, 724 req); 725 if (++qp->s_cur == qp->s_size) 726 qp->s_cur = 0; 727 break; 728 729 case IB_WR_RDMA_READ: 730 /* 731 * Don't allow more operations to be started 732 * than the QP limits allow. 
733 */ 734 if (qp->s_num_rd_atomic >= 735 qp->s_max_rd_atomic) { 736 qp->s_flags |= RVT_S_WAIT_RDMAR; 737 goto bail; 738 } 739 qp->s_num_rd_atomic++; 740 if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) 741 qp->s_lsn++; 742 put_ib_reth_vaddr( 743 wqe->rdma_wr.remote_addr, 744 &ohdr->u.rc.reth); 745 ohdr->u.rc.reth.rkey = 746 cpu_to_be32(wqe->rdma_wr.rkey); 747 ohdr->u.rc.reth.length = cpu_to_be32(len); 748 qp->s_state = OP(RDMA_READ_REQUEST); 749 hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); 750 ss = NULL; 751 len = 0; 752 bth2 |= IB_BTH_REQ_ACK; 753 if (++qp->s_cur == qp->s_size) 754 qp->s_cur = 0; 755 break; 756 757 case IB_WR_TID_RDMA_READ: 758 trace_hfi1_tid_read_sender_make_req(qp, newreq); 759 wpriv = wqe->priv; 760 req = wqe_to_tid_req(wqe); 761 trace_hfi1_tid_req_make_req_read(qp, newreq, 762 wqe->wr.opcode, 763 wqe->psn, wqe->lpsn, 764 req); 765 delta = cmp_psn(qp->s_psn, wqe->psn); 766 767 /* 768 * Don't allow more operations to be started 769 * than the QP limits allow. We could get here under 770 * three conditions; (1) It's a new request; (2) We are 771 * sending the second or later segment of a request, 772 * but the qp->s_state is set to OP(RDMA_READ_REQUEST) 773 * when the last segment of a previous request is 774 * received just before this; (3) We are re-sending a 775 * request. 776 */ 777 if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { 778 qp->s_flags |= RVT_S_WAIT_RDMAR; 779 goto bail; 780 } 781 if (newreq) { 782 struct tid_rdma_flow *flow = 783 &req->flows[req->setup_head]; 784 785 /* 786 * Set up s_sge as it is needed for TID 787 * allocation. However, if the pages have been 788 * walked and mapped, skip it. An earlier try 789 * has failed to allocate the TID entries. 790 */ 791 if (!flow->npagesets) { 792 qp->s_sge.sge = wqe->sg_list[0]; 793 qp->s_sge.sg_list = wqe->sg_list + 1; 794 qp->s_sge.num_sge = wqe->wr.num_sge; 795 qp->s_sge.total_len = wqe->length; 796 qp->s_len = wqe->length; 797 req->isge = 0; 798 req->clear_tail = req->setup_head; 799 req->flow_idx = req->setup_head; 800 req->state = TID_REQUEST_ACTIVE; 801 } 802 } else if (delta == 0) { 803 /* Re-send a request */ 804 req->cur_seg = 0; 805 req->comp_seg = 0; 806 req->ack_pending = 0; 807 req->flow_idx = req->clear_tail; 808 req->state = TID_REQUEST_RESEND; 809 } 810 req->s_next_psn = qp->s_psn; 811 /* Read one segment at a time */ 812 len = min_t(u32, req->seg_len, 813 wqe->length - req->seg_len * req->cur_seg); 814 delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, 815 &bth1, &bth2, 816 &len); 817 if (delta <= 0) { 818 /* Wait for TID space */ 819 goto bail; 820 } 821 if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) 822 qp->s_lsn++; 823 hwords += delta; 824 ss = &wpriv->ss; 825 /* Check if this is the last segment */ 826 if (req->cur_seg >= req->total_segs && 827 ++qp->s_cur == qp->s_size) 828 qp->s_cur = 0; 829 break; 830 831 case IB_WR_ATOMIC_CMP_AND_SWP: 832 case IB_WR_ATOMIC_FETCH_AND_ADD: 833 /* 834 * Don't allow more operations to be started 835 * than the QP limits allow. 
836 */ 837 if (qp->s_num_rd_atomic >= 838 qp->s_max_rd_atomic) { 839 qp->s_flags |= RVT_S_WAIT_RDMAR; 840 goto bail; 841 } 842 qp->s_num_rd_atomic++; 843 fallthrough; 844 case IB_WR_OPFN: 845 if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) 846 qp->s_lsn++; 847 if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || 848 wqe->wr.opcode == IB_WR_OPFN) { 849 qp->s_state = OP(COMPARE_SWAP); 850 put_ib_ateth_swap(wqe->atomic_wr.swap, 851 &ohdr->u.atomic_eth); 852 put_ib_ateth_compare(wqe->atomic_wr.compare_add, 853 &ohdr->u.atomic_eth); 854 } else { 855 qp->s_state = OP(FETCH_ADD); 856 put_ib_ateth_swap(wqe->atomic_wr.compare_add, 857 &ohdr->u.atomic_eth); 858 put_ib_ateth_compare(0, &ohdr->u.atomic_eth); 859 } 860 put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr, 861 &ohdr->u.atomic_eth); 862 ohdr->u.atomic_eth.rkey = cpu_to_be32( 863 wqe->atomic_wr.rkey); 864 hwords += sizeof(struct ib_atomic_eth) / sizeof(u32); 865 ss = NULL; 866 len = 0; 867 bth2 |= IB_BTH_REQ_ACK; 868 if (++qp->s_cur == qp->s_size) 869 qp->s_cur = 0; 870 break; 871 872 default: 873 goto bail; 874 } 875 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) { 876 qp->s_sge.sge = wqe->sg_list[0]; 877 qp->s_sge.sg_list = wqe->sg_list + 1; 878 qp->s_sge.num_sge = wqe->wr.num_sge; 879 qp->s_sge.total_len = wqe->length; 880 qp->s_len = wqe->length; 881 } 882 if (newreq) { 883 qp->s_tail++; 884 if (qp->s_tail >= qp->s_size) 885 qp->s_tail = 0; 886 } 887 if (wqe->wr.opcode == IB_WR_RDMA_READ || 888 wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) 889 qp->s_psn = wqe->lpsn + 1; 890 else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) 891 qp->s_psn = req->s_next_psn; 892 else 893 qp->s_psn++; 894 break; 895 896 case OP(RDMA_READ_RESPONSE_FIRST): 897 /* 898 * qp->s_state is normally set to the opcode of the 899 * last packet constructed for new requests and therefore 900 * is never set to RDMA read response. 901 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing 902 * thread to indicate a SEND needs to be restarted from an 903 * earlier PSN without interfering with the sending thread. 904 * See restart_rc(). 905 */ 906 qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); 907 fallthrough; 908 case OP(SEND_FIRST): 909 qp->s_state = OP(SEND_MIDDLE); 910 fallthrough; 911 case OP(SEND_MIDDLE): 912 bth2 = mask_psn(qp->s_psn++); 913 ss = &qp->s_sge; 914 len = qp->s_len; 915 if (len > pmtu) { 916 len = pmtu; 917 middle = HFI1_CAP_IS_KSET(SDMA_AHG); 918 break; 919 } 920 if (wqe->wr.opcode == IB_WR_SEND) { 921 qp->s_state = OP(SEND_LAST); 922 } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { 923 qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); 924 /* Immediate data comes after the BTH */ 925 ohdr->u.imm_data = wqe->wr.ex.imm_data; 926 hwords += 1; 927 } else { 928 qp->s_state = OP(SEND_LAST_WITH_INVALIDATE); 929 /* invalidate data comes after the BTH */ 930 ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey); 931 hwords += 1; 932 } 933 if (wqe->wr.send_flags & IB_SEND_SOLICITED) 934 bth0 |= IB_BTH_SOLICITED; 935 bth2 |= IB_BTH_REQ_ACK; 936 qp->s_cur++; 937 if (qp->s_cur >= qp->s_size) 938 qp->s_cur = 0; 939 break; 940 941 case OP(RDMA_READ_RESPONSE_LAST): 942 /* 943 * qp->s_state is normally set to the opcode of the 944 * last packet constructed for new requests and therefore 945 * is never set to RDMA read response. 946 * RDMA_READ_RESPONSE_LAST is used by the ACK processing 947 * thread to indicate a RDMA write needs to be restarted from 948 * an earlier PSN without interfering with the sending thread. 949 * See restart_rc(). 
950 */ 951 qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); 952 fallthrough; 953 case OP(RDMA_WRITE_FIRST): 954 qp->s_state = OP(RDMA_WRITE_MIDDLE); 955 fallthrough; 956 case OP(RDMA_WRITE_MIDDLE): 957 bth2 = mask_psn(qp->s_psn++); 958 ss = &qp->s_sge; 959 len = qp->s_len; 960 if (len > pmtu) { 961 len = pmtu; 962 middle = HFI1_CAP_IS_KSET(SDMA_AHG); 963 break; 964 } 965 if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { 966 qp->s_state = OP(RDMA_WRITE_LAST); 967 } else { 968 qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); 969 /* Immediate data comes after the BTH */ 970 ohdr->u.imm_data = wqe->wr.ex.imm_data; 971 hwords += 1; 972 if (wqe->wr.send_flags & IB_SEND_SOLICITED) 973 bth0 |= IB_BTH_SOLICITED; 974 } 975 bth2 |= IB_BTH_REQ_ACK; 976 qp->s_cur++; 977 if (qp->s_cur >= qp->s_size) 978 qp->s_cur = 0; 979 break; 980 981 case OP(RDMA_READ_RESPONSE_MIDDLE): 982 /* 983 * qp->s_state is normally set to the opcode of the 984 * last packet constructed for new requests and therefore 985 * is never set to RDMA read response. 986 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing 987 * thread to indicate a RDMA read needs to be restarted from 988 * an earlier PSN without interfering with the sending thread. 989 * See restart_rc(). 990 */ 991 len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu; 992 put_ib_reth_vaddr( 993 wqe->rdma_wr.remote_addr + len, 994 &ohdr->u.rc.reth); 995 ohdr->u.rc.reth.rkey = 996 cpu_to_be32(wqe->rdma_wr.rkey); 997 ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len); 998 qp->s_state = OP(RDMA_READ_REQUEST); 999 hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); 1000 bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK; 1001 qp->s_psn = wqe->lpsn + 1; 1002 ss = NULL; 1003 len = 0; 1004 qp->s_cur++; 1005 if (qp->s_cur == qp->s_size) 1006 qp->s_cur = 0; 1007 break; 1008 1009 case TID_OP(WRITE_RESP): 1010 /* 1011 * This value for s_state is used for restarting a TID RDMA 1012 * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE 1013 * for more). 1014 */ 1015 req = wqe_to_tid_req(wqe); 1016 req->state = TID_REQUEST_RESEND; 1017 rcu_read_lock(); 1018 remote = rcu_dereference(priv->tid_rdma.remote); 1019 req->comp_seg = delta_psn(qp->s_psn, wqe->psn); 1020 len = wqe->length - (req->comp_seg * remote->max_len); 1021 rcu_read_unlock(); 1022 1023 bth2 = mask_psn(qp->s_psn); 1024 hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1, 1025 &bth2, &len); 1026 qp->s_psn = wqe->lpsn + 1; 1027 ss = NULL; 1028 qp->s_state = TID_OP(WRITE_REQ); 1029 priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1; 1030 priv->s_tid_cur = qp->s_cur; 1031 if (++qp->s_cur == qp->s_size) 1032 qp->s_cur = 0; 1033 trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode, 1034 wqe->psn, wqe->lpsn, req); 1035 break; 1036 1037 case TID_OP(READ_RESP): 1038 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) 1039 goto bail; 1040 /* This is used to restart a TID read request */ 1041 req = wqe_to_tid_req(wqe); 1042 wpriv = wqe->priv; 1043 /* 1044 * Back down. The field qp->s_psn has been set to the psn with 1045 * which the request should be restart. It's OK to use division 1046 * as this is on the retry path. 1047 */ 1048 req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps; 1049 1050 /* 1051 * The following function need to be redefined to return the 1052 * status to make sure that we find the flow. At the same 1053 * time, we can use the req->state change to check if the 1054 * call succeeds or not. 
1055 */ 1056 req->state = TID_REQUEST_RESEND; 1057 hfi1_tid_rdma_restart_req(qp, wqe, &bth2); 1058 if (req->state != TID_REQUEST_ACTIVE) { 1059 /* 1060 * Failed to find the flow. Release all allocated tid 1061 * resources. 1062 */ 1063 hfi1_kern_exp_rcv_clear_all(req); 1064 hfi1_kern_clear_hw_flow(priv->rcd, qp); 1065 1066 hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR); 1067 goto bail; 1068 } 1069 req->state = TID_REQUEST_RESEND; 1070 len = min_t(u32, req->seg_len, 1071 wqe->length - req->seg_len * req->cur_seg); 1072 flow = &req->flows[req->flow_idx]; 1073 len -= flow->sent; 1074 req->s_next_psn = flow->flow_state.ib_lpsn + 1; 1075 delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1, 1076 &bth2, &len); 1077 if (delta <= 0) { 1078 /* Wait for TID space */ 1079 goto bail; 1080 } 1081 hwords += delta; 1082 ss = &wpriv->ss; 1083 /* Check if this is the last segment */ 1084 if (req->cur_seg >= req->total_segs && 1085 ++qp->s_cur == qp->s_size) 1086 qp->s_cur = 0; 1087 qp->s_psn = req->s_next_psn; 1088 trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode, 1089 wqe->psn, wqe->lpsn, req); 1090 break; 1091 case TID_OP(READ_REQ): 1092 req = wqe_to_tid_req(wqe); 1093 delta = cmp_psn(qp->s_psn, wqe->psn); 1094 /* 1095 * If the current WR is not TID RDMA READ, or this is the start 1096 * of a new request, we need to change the qp->s_state so that 1097 * the request can be set up properly. 1098 */ 1099 if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 || 1100 qp->s_cur == qp->s_tail) { 1101 qp->s_state = OP(RDMA_READ_REQUEST); 1102 if (delta == 0 || qp->s_cur == qp->s_tail) 1103 goto check_s_state; 1104 else 1105 goto bail; 1106 } 1107 1108 /* Rate limiting */ 1109 if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) { 1110 qp->s_flags |= RVT_S_WAIT_RDMAR; 1111 goto bail; 1112 } 1113 1114 wpriv = wqe->priv; 1115 /* Read one segment at a time */ 1116 len = min_t(u32, req->seg_len, 1117 wqe->length - req->seg_len * req->cur_seg); 1118 delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1, 1119 &bth2, &len); 1120 if (delta <= 0) { 1121 /* Wait for TID space */ 1122 goto bail; 1123 } 1124 hwords += delta; 1125 ss = &wpriv->ss; 1126 /* Check if this is the last segment */ 1127 if (req->cur_seg >= req->total_segs && 1128 ++qp->s_cur == qp->s_size) 1129 qp->s_cur = 0; 1130 qp->s_psn = req->s_next_psn; 1131 trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode, 1132 wqe->psn, wqe->lpsn, req); 1133 break; 1134 } 1135 qp->s_sending_hpsn = bth2; 1136 delta = delta_psn(bth2, wqe->psn); 1137 if (delta && delta % HFI1_PSN_CREDIT == 0 && 1138 wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) 1139 bth2 |= IB_BTH_REQ_ACK; 1140 if (qp->s_flags & RVT_S_SEND_ONE) { 1141 qp->s_flags &= ~RVT_S_SEND_ONE; 1142 qp->s_flags |= RVT_S_WAIT_ACK; 1143 bth2 |= IB_BTH_REQ_ACK; 1144 } 1145 qp->s_len -= len; 1146 ps->s_txreq->hdr_dwords = hwords; 1147 ps->s_txreq->sde = priv->s_sde; 1148 ps->s_txreq->ss = ss; 1149 ps->s_txreq->s_cur_size = len; 1150 hfi1_make_ruc_header( 1151 qp, 1152 ohdr, 1153 bth0 | (qp->s_state << 24), 1154 bth1, 1155 bth2, 1156 middle, 1157 ps); 1158 return 1; 1159 1160done_free_tx: 1161 hfi1_put_txreq(ps->s_txreq); 1162 ps->s_txreq = NULL; 1163 return 1; 1164 1165bail: 1166 hfi1_put_txreq(ps->s_txreq); 1167 1168bail_no_tx: 1169 ps->s_txreq = NULL; 1170 qp->s_flags &= ~RVT_S_BUSY; 1171 /* 1172 * If we didn't get a txreq, the QP will be woken up later to try 1173 * again. Set the flags to indicate which work item to wake 1174 * up. 
1175 */ 1176 iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB); 1177 return 0; 1178} 1179 1180static inline void hfi1_make_bth_aeth(struct rvt_qp *qp, 1181 struct ib_other_headers *ohdr, 1182 u32 bth0, u32 bth1) 1183{ 1184 if (qp->r_nak_state) 1185 ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) | 1186 (qp->r_nak_state << 1187 IB_AETH_CREDIT_SHIFT)); 1188 else 1189 ohdr->u.aeth = rvt_compute_aeth(qp); 1190 1191 ohdr->bth[0] = cpu_to_be32(bth0); 1192 ohdr->bth[1] = cpu_to_be32(bth1 | qp->remote_qpn); 1193 ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn)); 1194} 1195 1196static inline void hfi1_queue_rc_ack(struct hfi1_packet *packet, bool is_fecn) 1197{ 1198 struct rvt_qp *qp = packet->qp; 1199 struct hfi1_ibport *ibp; 1200 unsigned long flags; 1201 1202 spin_lock_irqsave(&qp->s_lock, flags); 1203 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) 1204 goto unlock; 1205 ibp = rcd_to_iport(packet->rcd); 1206 this_cpu_inc(*ibp->rvp.rc_qacks); 1207 qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING; 1208 qp->s_nak_state = qp->r_nak_state; 1209 qp->s_ack_psn = qp->r_ack_psn; 1210 if (is_fecn) 1211 qp->s_flags |= RVT_S_ECN; 1212 1213 /* Schedule the send tasklet. */ 1214 hfi1_schedule_send(qp); 1215unlock: 1216 spin_unlock_irqrestore(&qp->s_lock, flags); 1217} 1218 1219static inline void hfi1_make_rc_ack_9B(struct hfi1_packet *packet, 1220 struct hfi1_opa_header *opa_hdr, 1221 u8 sc5, bool is_fecn, 1222 u64 *pbc_flags, u32 *hwords, 1223 u32 *nwords) 1224{ 1225 struct rvt_qp *qp = packet->qp; 1226 struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); 1227 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); 1228 struct ib_header *hdr = &opa_hdr->ibh; 1229 struct ib_other_headers *ohdr; 1230 u16 lrh0 = HFI1_LRH_BTH; 1231 u16 pkey; 1232 u32 bth0, bth1; 1233 1234 opa_hdr->hdr_type = HFI1_PKT_TYPE_9B; 1235 ohdr = &hdr->u.oth; 1236 /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */ 1237 *hwords = 6; 1238 1239 if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) { 1240 *hwords += hfi1_make_grh(ibp, &hdr->u.l.grh, 1241 rdma_ah_read_grh(&qp->remote_ah_attr), 1242 *hwords - 2, SIZE_OF_CRC); 1243 ohdr = &hdr->u.l.oth; 1244 lrh0 = HFI1_LRH_GRH; 1245 } 1246 /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ 1247 *pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT); 1248 1249 /* read pkey_index w/o lock (its atomic) */ 1250 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); 1251 1252 lrh0 |= (sc5 & IB_SC_MASK) << IB_SC_SHIFT | 1253 (rdma_ah_get_sl(&qp->remote_ah_attr) & IB_SL_MASK) << 1254 IB_SL_SHIFT; 1255 1256 hfi1_make_ib_hdr(hdr, lrh0, *hwords + SIZE_OF_CRC, 1257 opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 9B), 1258 ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr)); 1259 1260 bth0 = pkey | (OP(ACKNOWLEDGE) << 24); 1261 if (qp->s_mig_state == IB_MIG_MIGRATED) 1262 bth0 |= IB_BTH_MIG_REQ; 1263 bth1 = (!!is_fecn) << IB_BECN_SHIFT; 1264 /* 1265 * Inline ACKs go out without the use of the Verbs send engine, so 1266 * we need to set the STL Verbs Extended bit here 1267 */ 1268 bth1 |= HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT; 1269 hfi1_make_bth_aeth(qp, ohdr, bth0, bth1); 1270} 1271 1272static inline void hfi1_make_rc_ack_16B(struct hfi1_packet *packet, 1273 struct hfi1_opa_header *opa_hdr, 1274 u8 sc5, bool is_fecn, 1275 u64 *pbc_flags, u32 *hwords, 1276 u32 *nwords) 1277{ 1278 struct rvt_qp *qp = packet->qp; 1279 struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); 1280 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); 1281 struct hfi1_16b_header *hdr = 
&opa_hdr->opah; 1282 struct ib_other_headers *ohdr; 1283 u32 bth0, bth1 = 0; 1284 u16 len, pkey; 1285 bool becn = is_fecn; 1286 u8 l4 = OPA_16B_L4_IB_LOCAL; 1287 u8 extra_bytes; 1288 1289 opa_hdr->hdr_type = HFI1_PKT_TYPE_16B; 1290 ohdr = &hdr->u.oth; 1291 /* header size in 32-bit words 16B LRH+BTH+AETH = (16+12+4)/4 */ 1292 *hwords = 8; 1293 extra_bytes = hfi1_get_16b_padding(*hwords << 2, 0); 1294 *nwords = SIZE_OF_CRC + ((extra_bytes + SIZE_OF_LT) >> 2); 1295 1296 if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) && 1297 hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))) { 1298 *hwords += hfi1_make_grh(ibp, &hdr->u.l.grh, 1299 rdma_ah_read_grh(&qp->remote_ah_attr), 1300 *hwords - 4, *nwords); 1301 ohdr = &hdr->u.l.oth; 1302 l4 = OPA_16B_L4_IB_GLOBAL; 1303 } 1304 *pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC; 1305 1306 /* read pkey_index w/o lock (its atomic) */ 1307 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); 1308 1309 /* Convert dwords to flits */ 1310 len = (*hwords + *nwords) >> 1; 1311 1312 hfi1_make_16b_hdr(hdr, ppd->lid | 1313 (rdma_ah_get_path_bits(&qp->remote_ah_attr) & 1314 ((1 << ppd->lmc) - 1)), 1315 opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 1316 16B), len, pkey, becn, 0, l4, sc5); 1317 1318 bth0 = pkey | (OP(ACKNOWLEDGE) << 24); 1319 bth0 |= extra_bytes << 20; 1320 if (qp->s_mig_state == IB_MIG_MIGRATED) 1321 bth1 = OPA_BTH_MIG_REQ; 1322 hfi1_make_bth_aeth(qp, ohdr, bth0, bth1); 1323} 1324 1325typedef void (*hfi1_make_rc_ack)(struct hfi1_packet *packet, 1326 struct hfi1_opa_header *opa_hdr, 1327 u8 sc5, bool is_fecn, 1328 u64 *pbc_flags, u32 *hwords, 1329 u32 *nwords); 1330 1331/* We support only two types - 9B and 16B for now */ 1332static const hfi1_make_rc_ack hfi1_make_rc_ack_tbl[2] = { 1333 [HFI1_PKT_TYPE_9B] = &hfi1_make_rc_ack_9B, 1334 [HFI1_PKT_TYPE_16B] = &hfi1_make_rc_ack_16B 1335}; 1336 1337/* 1338 * hfi1_send_rc_ack - Construct an ACK packet and send it 1339 * 1340 * This is called from hfi1_rc_rcv() and handle_receive_interrupt(). 1341 * Note that RDMA reads and atomics are handled in the 1342 * send side QP state and send engine. 1343 */ 1344void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn) 1345{ 1346 struct hfi1_ctxtdata *rcd = packet->rcd; 1347 struct rvt_qp *qp = packet->qp; 1348 struct hfi1_ibport *ibp = rcd_to_iport(rcd); 1349 struct hfi1_qp_priv *priv = qp->priv; 1350 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); 1351 u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)]; 1352 u64 pbc, pbc_flags = 0; 1353 u32 hwords = 0; 1354 u32 nwords = 0; 1355 u32 plen; 1356 struct pio_buf *pbuf; 1357 struct hfi1_opa_header opa_hdr; 1358 1359 /* clear the defer count */ 1360 qp->r_adefered = 0; 1361 1362 /* Don't send ACK or NAK if a RDMA read or atomic is pending. 
*/ 1363 if (qp->s_flags & RVT_S_RESP_PENDING) { 1364 hfi1_queue_rc_ack(packet, is_fecn); 1365 return; 1366 } 1367 1368 /* Ensure s_rdma_ack_cnt changes are committed */ 1369 if (qp->s_rdma_ack_cnt) { 1370 hfi1_queue_rc_ack(packet, is_fecn); 1371 return; 1372 } 1373 1374 /* Don't try to send ACKs if the link isn't ACTIVE */ 1375 if (driver_lstate(ppd) != IB_PORT_ACTIVE) 1376 return; 1377 1378 /* Make the appropriate header */ 1379 hfi1_make_rc_ack_tbl[priv->hdr_type](packet, &opa_hdr, sc5, is_fecn, 1380 &pbc_flags, &hwords, &nwords); 1381 1382 plen = 2 /* PBC */ + hwords + nwords; 1383 pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, 1384 sc_to_vlt(ppd->dd, sc5), plen); 1385 pbuf = sc_buffer_alloc(rcd->sc, plen, NULL, NULL); 1386 if (IS_ERR_OR_NULL(pbuf)) { 1387 /* 1388 * We have no room to send at the moment. Pass 1389 * responsibility for sending the ACK to the send engine 1390 * so that when enough buffer space becomes available, 1391 * the ACK is sent ahead of other outgoing packets. 1392 */ 1393 hfi1_queue_rc_ack(packet, is_fecn); 1394 return; 1395 } 1396 trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device), 1397 &opa_hdr, ib_is_sc5(sc5)); 1398 1399 /* write the pbc and data */ 1400 ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, 1401 (priv->hdr_type == HFI1_PKT_TYPE_9B ? 1402 (void *)&opa_hdr.ibh : 1403 (void *)&opa_hdr.opah), hwords); 1404 return; 1405} 1406 1407/** 1408 * update_num_rd_atomic - update the qp->s_num_rd_atomic 1409 * @qp: the QP 1410 * @psn: the packet sequence number to restart at 1411 * @wqe: the wqe 1412 * 1413 * This is called from reset_psn() to update qp->s_num_rd_atomic 1414 * for the current wqe. 1415 * Called at interrupt level with the QP s_lock held. 1416 */ 1417static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn, 1418 struct rvt_swqe *wqe) 1419{ 1420 u32 opcode = wqe->wr.opcode; 1421 1422 if (opcode == IB_WR_RDMA_READ || 1423 opcode == IB_WR_ATOMIC_CMP_AND_SWP || 1424 opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { 1425 qp->s_num_rd_atomic++; 1426 } else if (opcode == IB_WR_TID_RDMA_READ) { 1427 struct tid_rdma_request *req = wqe_to_tid_req(wqe); 1428 struct hfi1_qp_priv *priv = qp->priv; 1429 1430 if (cmp_psn(psn, wqe->lpsn) <= 0) { 1431 u32 cur_seg; 1432 1433 cur_seg = (psn - wqe->psn) / priv->pkts_ps; 1434 req->ack_pending = cur_seg - req->comp_seg; 1435 priv->pending_tid_r_segs += req->ack_pending; 1436 qp->s_num_rd_atomic += req->ack_pending; 1437 trace_hfi1_tid_req_update_num_rd_atomic(qp, 0, 1438 wqe->wr.opcode, 1439 wqe->psn, 1440 wqe->lpsn, 1441 req); 1442 } else { 1443 priv->pending_tid_r_segs += req->total_segs; 1444 qp->s_num_rd_atomic += req->total_segs; 1445 } 1446 } 1447} 1448 1449/** 1450 * reset_psn - reset the QP state to send starting from PSN 1451 * @qp: the QP 1452 * @psn: the packet sequence number to restart at 1453 * 1454 * This is called from hfi1_rc_rcv() to process an incoming RC ACK 1455 * for the given QP. 1456 * Called at interrupt level with the QP s_lock held. 1457 */ 1458static void reset_psn(struct rvt_qp *qp, u32 psn) 1459{ 1460 u32 n = qp->s_acked; 1461 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n); 1462 u32 opcode; 1463 struct hfi1_qp_priv *priv = qp->priv; 1464 1465 lockdep_assert_held(&qp->s_lock); 1466 qp->s_cur = n; 1467 priv->pending_tid_r_segs = 0; 1468 priv->pending_tid_w_resp = 0; 1469 qp->s_num_rd_atomic = 0; 1470 1471 /* 1472 * If we are starting the request from the beginning, 1473 * let the normal send code handle initialization. 
1474 */ 1475 if (cmp_psn(psn, wqe->psn) <= 0) { 1476 qp->s_state = OP(SEND_LAST); 1477 goto done; 1478 } 1479 update_num_rd_atomic(qp, psn, wqe); 1480 1481 /* Find the work request opcode corresponding to the given PSN. */ 1482 for (;;) { 1483 int diff; 1484 1485 if (++n == qp->s_size) 1486 n = 0; 1487 if (n == qp->s_tail) 1488 break; 1489 wqe = rvt_get_swqe_ptr(qp, n); 1490 diff = cmp_psn(psn, wqe->psn); 1491 if (diff < 0) { 1492 /* Point wqe back to the previous one*/ 1493 wqe = rvt_get_swqe_ptr(qp, qp->s_cur); 1494 break; 1495 } 1496 qp->s_cur = n; 1497 /* 1498 * If we are starting the request from the beginning, 1499 * let the normal send code handle initialization. 1500 */ 1501 if (diff == 0) { 1502 qp->s_state = OP(SEND_LAST); 1503 goto done; 1504 } 1505 1506 update_num_rd_atomic(qp, psn, wqe); 1507 } 1508 opcode = wqe->wr.opcode; 1509 1510 /* 1511 * Set the state to restart in the middle of a request. 1512 * Don't change the s_sge, s_cur_sge, or s_cur_size. 1513 * See hfi1_make_rc_req(). 1514 */ 1515 switch (opcode) { 1516 case IB_WR_SEND: 1517 case IB_WR_SEND_WITH_IMM: 1518 qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); 1519 break; 1520 1521 case IB_WR_RDMA_WRITE: 1522 case IB_WR_RDMA_WRITE_WITH_IMM: 1523 qp->s_state = OP(RDMA_READ_RESPONSE_LAST); 1524 break; 1525 1526 case IB_WR_TID_RDMA_WRITE: 1527 qp->s_state = TID_OP(WRITE_RESP); 1528 break; 1529 1530 case IB_WR_RDMA_READ: 1531 qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); 1532 break; 1533 1534 case IB_WR_TID_RDMA_READ: 1535 qp->s_state = TID_OP(READ_RESP); 1536 break; 1537 1538 default: 1539 /* 1540 * This case shouldn't happen since its only 1541 * one PSN per req. 1542 */ 1543 qp->s_state = OP(SEND_LAST); 1544 } 1545done: 1546 priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK; 1547 qp->s_psn = psn; 1548 /* 1549 * Set RVT_S_WAIT_PSN as rc_complete() may start the timer 1550 * asynchronously before the send engine can get scheduled. 1551 * Doing it in hfi1_make_rc_req() is too late. 1552 */ 1553 if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) && 1554 (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) 1555 qp->s_flags |= RVT_S_WAIT_PSN; 1556 qp->s_flags &= ~HFI1_S_AHG_VALID; 1557 trace_hfi1_sender_reset_psn(qp); 1558} 1559 1560/* 1561 * Back up requester to resend the last un-ACKed request. 1562 * The QP r_lock and s_lock should be held and interrupts disabled. 
1563 */ 1564void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) 1565{ 1566 struct hfi1_qp_priv *priv = qp->priv; 1567 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 1568 struct hfi1_ibport *ibp; 1569 1570 lockdep_assert_held(&qp->r_lock); 1571 lockdep_assert_held(&qp->s_lock); 1572 trace_hfi1_sender_restart_rc(qp); 1573 if (qp->s_retry == 0) { 1574 if (qp->s_mig_state == IB_MIG_ARMED) { 1575 hfi1_migrate_qp(qp); 1576 qp->s_retry = qp->s_retry_cnt; 1577 } else if (qp->s_last == qp->s_acked) { 1578 /* 1579 * We need special handling for the OPFN request WQEs as 1580 * they are not allowed to generate real user errors 1581 */ 1582 if (wqe->wr.opcode == IB_WR_OPFN) { 1583 struct hfi1_ibport *ibp = 1584 to_iport(qp->ibqp.device, qp->port_num); 1585 /* 1586 * Call opfn_conn_reply() with capcode and 1587 * remaining data as 0 to close out the 1588 * current request 1589 */ 1590 opfn_conn_reply(qp, priv->opfn.curr); 1591 wqe = do_rc_completion(qp, wqe, ibp); 1592 qp->s_flags &= ~RVT_S_WAIT_ACK; 1593 } else { 1594 trace_hfi1_tid_write_sender_restart_rc(qp, 0); 1595 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 1596 struct tid_rdma_request *req; 1597 1598 req = wqe_to_tid_req(wqe); 1599 hfi1_kern_exp_rcv_clear_all(req); 1600 hfi1_kern_clear_hw_flow(priv->rcd, qp); 1601 } 1602 1603 hfi1_trdma_send_complete(qp, wqe, 1604 IB_WC_RETRY_EXC_ERR); 1605 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); 1606 } 1607 return; 1608 } else { /* need to handle delayed completion */ 1609 return; 1610 } 1611 } else { 1612 qp->s_retry--; 1613 } 1614 1615 ibp = to_iport(qp->ibqp.device, qp->port_num); 1616 if (wqe->wr.opcode == IB_WR_RDMA_READ || 1617 wqe->wr.opcode == IB_WR_TID_RDMA_READ) 1618 ibp->rvp.n_rc_resends++; 1619 else 1620 ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); 1621 1622 qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | 1623 RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN | 1624 RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP); 1625 if (wait) 1626 qp->s_flags |= RVT_S_SEND_ONE; 1627 reset_psn(qp, psn); 1628} 1629 1630/* 1631 * Set qp->s_sending_psn to the next PSN after the given one. 1632 * This would be psn+1 except when RDMA reads or TID RDMA ops 1633 * are present. 1634 */ 1635static void reset_sending_psn(struct rvt_qp *qp, u32 psn) 1636{ 1637 struct rvt_swqe *wqe; 1638 u32 n = qp->s_last; 1639 1640 lockdep_assert_held(&qp->s_lock); 1641 /* Find the work request corresponding to the given PSN. */ 1642 for (;;) { 1643 wqe = rvt_get_swqe_ptr(qp, n); 1644 if (cmp_psn(psn, wqe->lpsn) <= 0) { 1645 if (wqe->wr.opcode == IB_WR_RDMA_READ || 1646 wqe->wr.opcode == IB_WR_TID_RDMA_READ || 1647 wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) 1648 qp->s_sending_psn = wqe->lpsn + 1; 1649 else 1650 qp->s_sending_psn = psn + 1; 1651 break; 1652 } 1653 if (++n == qp->s_size) 1654 n = 0; 1655 if (n == qp->s_tail) 1656 break; 1657 } 1658} 1659 1660/** 1661 * hfi1_rc_verbs_aborted - handle abort status 1662 * @qp: the QP 1663 * @opah: the opa header 1664 * 1665 * This code modifies both ACK bit in BTH[2] 1666 * and the s_flags to go into send one mode. 1667 * 1668 * This serves to throttle the send engine to only 1669 * send a single packet in the likely case the 1670 * a link has gone down. 
1671 */ 1672void hfi1_rc_verbs_aborted(struct rvt_qp *qp, struct hfi1_opa_header *opah) 1673{ 1674 struct ib_other_headers *ohdr = hfi1_get_rc_ohdr(opah); 1675 u8 opcode = ib_bth_get_opcode(ohdr); 1676 u32 psn; 1677 1678 /* ignore responses */ 1679 if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) && 1680 opcode <= OP(ATOMIC_ACKNOWLEDGE)) || 1681 opcode == TID_OP(READ_RESP) || 1682 opcode == TID_OP(WRITE_RESP)) 1683 return; 1684 1685 psn = ib_bth_get_psn(ohdr) | IB_BTH_REQ_ACK; 1686 ohdr->bth[2] = cpu_to_be32(psn); 1687 qp->s_flags |= RVT_S_SEND_ONE; 1688} 1689 1690/* 1691 * This should be called with the QP s_lock held and interrupts disabled. 1692 */ 1693void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) 1694{ 1695 struct ib_other_headers *ohdr; 1696 struct hfi1_qp_priv *priv = qp->priv; 1697 struct rvt_swqe *wqe; 1698 u32 opcode, head, tail; 1699 u32 psn; 1700 struct tid_rdma_request *req; 1701 1702 lockdep_assert_held(&qp->s_lock); 1703 if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK)) 1704 return; 1705 1706 ohdr = hfi1_get_rc_ohdr(opah); 1707 opcode = ib_bth_get_opcode(ohdr); 1708 if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) && 1709 opcode <= OP(ATOMIC_ACKNOWLEDGE)) || 1710 opcode == TID_OP(READ_RESP) || 1711 opcode == TID_OP(WRITE_RESP)) { 1712 WARN_ON(!qp->s_rdma_ack_cnt); 1713 qp->s_rdma_ack_cnt--; 1714 return; 1715 } 1716 1717 psn = ib_bth_get_psn(ohdr); 1718 /* 1719 * Don't attempt to reset the sending PSN for packets in the 1720 * KDETH PSN space since the PSN does not match anything. 1721 */ 1722 if (opcode != TID_OP(WRITE_DATA) && 1723 opcode != TID_OP(WRITE_DATA_LAST) && 1724 opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC)) 1725 reset_sending_psn(qp, psn); 1726 1727 /* Handle TID RDMA WRITE packets differently */ 1728 if (opcode >= TID_OP(WRITE_REQ) && 1729 opcode <= TID_OP(WRITE_DATA_LAST)) { 1730 head = priv->s_tid_head; 1731 tail = priv->s_tid_cur; 1732 /* 1733 * s_tid_cur is set to s_tid_head in the case, where 1734 * a new TID RDMA request is being started and all 1735 * previous ones have been completed. 1736 * Therefore, we need to do a secondary check in order 1737 * to properly determine whether we should start the 1738 * RC timer. 1739 */ 1740 wqe = rvt_get_swqe_ptr(qp, tail); 1741 req = wqe_to_tid_req(wqe); 1742 if (head == tail && req->comp_seg < req->total_segs) { 1743 if (tail == 0) 1744 tail = qp->s_size - 1; 1745 else 1746 tail -= 1; 1747 } 1748 } else { 1749 head = qp->s_tail; 1750 tail = qp->s_acked; 1751 } 1752 1753 /* 1754 * Start timer after a packet requesting an ACK has been sent and 1755 * there are still requests that haven't been acked. 1756 */ 1757 if ((psn & IB_BTH_REQ_ACK) && tail != head && 1758 opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) && 1759 opcode != TID_OP(RESYNC) && 1760 !(qp->s_flags & 1761 (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && 1762 (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { 1763 if (opcode == TID_OP(READ_REQ)) 1764 rvt_add_retry_timer_ext(qp, priv->timeout_shift); 1765 else 1766 rvt_add_retry_timer(qp); 1767 } 1768 1769 /* Start TID RDMA ACK timer */ 1770 if ((opcode == TID_OP(WRITE_DATA) || 1771 opcode == TID_OP(WRITE_DATA_LAST) || 1772 opcode == TID_OP(RESYNC)) && 1773 (psn & IB_BTH_REQ_ACK) && 1774 !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) && 1775 (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { 1776 /* 1777 * The TID RDMA ACK packet could be received before this 1778 * function is called. 
Therefore, add the timer only if TID 1779 * RDMA ACK packets are actually pending. 1780 */ 1781 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 1782 req = wqe_to_tid_req(wqe); 1783 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && 1784 req->ack_seg < req->cur_seg) 1785 hfi1_add_tid_retry_timer(qp); 1786 } 1787 1788 while (qp->s_last != qp->s_acked) { 1789 wqe = rvt_get_swqe_ptr(qp, qp->s_last); 1790 if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 && 1791 cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) 1792 break; 1793 trdma_clean_swqe(qp, wqe); 1794 trace_hfi1_qp_send_completion(qp, wqe, qp->s_last); 1795 rvt_qp_complete_swqe(qp, 1796 wqe, 1797 ib_hfi1_wc_opcode[wqe->wr.opcode], 1798 IB_WC_SUCCESS); 1799 } 1800 /* 1801 * If we were waiting for sends to complete before re-sending, 1802 * and they are now complete, restart sending. 1803 */ 1804 trace_hfi1_sendcomplete(qp, psn); 1805 if (qp->s_flags & RVT_S_WAIT_PSN && 1806 cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { 1807 qp->s_flags &= ~RVT_S_WAIT_PSN; 1808 qp->s_sending_psn = qp->s_psn; 1809 qp->s_sending_hpsn = qp->s_psn - 1; 1810 hfi1_schedule_send(qp); 1811 } 1812} 1813 1814static inline void update_last_psn(struct rvt_qp *qp, u32 psn) 1815{ 1816 qp->s_last_psn = psn; 1817} 1818 1819/* 1820 * Generate a SWQE completion. 1821 * This is similar to hfi1_send_complete but has to check to be sure 1822 * that the SGEs are not being referenced if the SWQE is being resent. 1823 */ 1824struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, 1825 struct rvt_swqe *wqe, 1826 struct hfi1_ibport *ibp) 1827{ 1828 struct hfi1_qp_priv *priv = qp->priv; 1829 1830 lockdep_assert_held(&qp->s_lock); 1831 /* 1832 * Don't decrement refcount and don't generate a 1833 * completion if the SWQE is being resent until the send 1834 * is finished. 1835 */ 1836 trace_hfi1_rc_completion(qp, wqe->lpsn); 1837 if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 || 1838 cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { 1839 trdma_clean_swqe(qp, wqe); 1840 trace_hfi1_qp_send_completion(qp, wqe, qp->s_last); 1841 rvt_qp_complete_swqe(qp, 1842 wqe, 1843 ib_hfi1_wc_opcode[wqe->wr.opcode], 1844 IB_WC_SUCCESS); 1845 } else { 1846 struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); 1847 1848 this_cpu_inc(*ibp->rvp.rc_delayed_comp); 1849 /* 1850 * If send progress not running attempt to progress 1851 * SDMA queue. 1852 */ 1853 if (ppd->dd->flags & HFI1_HAS_SEND_DMA) { 1854 struct sdma_engine *engine; 1855 u8 sl = rdma_ah_get_sl(&qp->remote_ah_attr); 1856 u8 sc5; 1857 1858 /* For now use sc to find engine */ 1859 sc5 = ibp->sl_to_sc[sl]; 1860 engine = qp_to_sdma_engine(qp, sc5); 1861 sdma_engine_progress_schedule(engine); 1862 } 1863 } 1864 1865 qp->s_retry = qp->s_retry_cnt; 1866 /* 1867 * Don't update the last PSN if the request being completed is 1868 * a TID RDMA WRITE request. 1869 * Completion of the TID RDMA WRITE requests are done by the 1870 * TID RDMA ACKs and as such could be for a request that has 1871 * already been ACKed as far as the IB state machine is 1872 * concerned. 1873 */ 1874 if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) 1875 update_last_psn(qp, wqe->lpsn); 1876 1877 /* 1878 * If we are completing a request which is in the process of 1879 * being resent, we can stop re-sending it since we know the 1880 * responder has already seen it. 
1881 */ 1882 if (qp->s_acked == qp->s_cur) { 1883 if (++qp->s_cur >= qp->s_size) 1884 qp->s_cur = 0; 1885 qp->s_acked = qp->s_cur; 1886 wqe = rvt_get_swqe_ptr(qp, qp->s_cur); 1887 if (qp->s_acked != qp->s_tail) { 1888 qp->s_state = OP(SEND_LAST); 1889 qp->s_psn = wqe->psn; 1890 } 1891 } else { 1892 if (++qp->s_acked >= qp->s_size) 1893 qp->s_acked = 0; 1894 if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur) 1895 qp->s_draining = 0; 1896 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 1897 } 1898 if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) { 1899 priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK; 1900 hfi1_schedule_send(qp); 1901 } 1902 return wqe; 1903} 1904 1905static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd) 1906{ 1907 /* Retry this request. */ 1908 if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { 1909 qp->r_flags |= RVT_R_RDMAR_SEQ; 1910 hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); 1911 if (list_empty(&qp->rspwait)) { 1912 qp->r_flags |= RVT_R_RSP_SEND; 1913 rvt_get_qp(qp); 1914 list_add_tail(&qp->rspwait, &rcd->qp_wait_list); 1915 } 1916 } 1917} 1918 1919/** 1920 * update_qp_retry_state - Update qp retry state. 1921 * @qp: the QP 1922 * @psn: the packet sequence number of the TID RDMA WRITE RESP. 1923 * @spsn: The start psn for the given TID RDMA WRITE swqe. 1924 * @lpsn: The last psn for the given TID RDMA WRITE swqe. 1925 * 1926 * This function is called to update the qp retry state upon 1927 * receiving a TID WRITE RESP after the qp is scheduled to retry 1928 * a request. 1929 */ 1930static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn, 1931 u32 lpsn) 1932{ 1933 struct hfi1_qp_priv *qpriv = qp->priv; 1934 1935 qp->s_psn = psn + 1; 1936 /* 1937 * If this is the first TID RDMA WRITE RESP packet for the current 1938 * request, change the s_state so that the retry will be processed 1939 * correctly. Similarly, if this is the last TID RDMA WRITE RESP 1940 * packet, change the s_state and advance the s_cur. 1941 */ 1942 if (cmp_psn(psn, lpsn) >= 0) { 1943 qp->s_cur = qpriv->s_tid_cur + 1; 1944 if (qp->s_cur >= qp->s_size) 1945 qp->s_cur = 0; 1946 qp->s_state = TID_OP(WRITE_REQ); 1947 } else if (!cmp_psn(psn, spsn)) { 1948 qp->s_cur = qpriv->s_tid_cur; 1949 qp->s_state = TID_OP(WRITE_RESP); 1950 } 1951} 1952 1953/* 1954 * do_rc_ack - process an incoming RC ACK 1955 * @qp: the QP the ACK came in on 1956 * @psn: the packet sequence number of the ACK 1957 * @opcode: the opcode of the request that resulted in the ACK 1958 * 1959 * This is called from rc_rcv_resp() to process an incoming RC ACK 1960 * for the given QP. 1961 * May be called at interrupt level, with the QP s_lock held. 1962 * Returns 1 if OK, 0 if current operation should be aborted (NAK). 1963 */ 1964int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, 1965 u64 val, struct hfi1_ctxtdata *rcd) 1966{ 1967 struct hfi1_ibport *ibp; 1968 enum ib_wc_status status; 1969 struct hfi1_qp_priv *qpriv = qp->priv; 1970 struct rvt_swqe *wqe; 1971 int ret = 0; 1972 u32 ack_psn; 1973 int diff; 1974 struct rvt_dev_info *rdi; 1975 1976 lockdep_assert_held(&qp->s_lock); 1977 /* 1978 * Note that NAKs implicitly ACK outstanding SEND and RDMA write 1979 * requests and implicitly NAK RDMA read and atomic requests issued 1980 * before the NAK'ed request. The MSN won't include the NAK'ed 1981 * request but will include an ACK'ed request(s). 
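 *
 * The AETH syndrome (aeth >> IB_AETH_NAK_SHIFT) selects the cases
 * handled below: 0 is an ACK, 1 an RNR NAK, 3 a NAK and 2 is
 * reserved.  For anything other than a plain ACK the PSN in the
 * packet names the NAK'ed request itself, so ack_psn is backed up
 * by one before the send queue is walked.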
1982 */ 1983 ack_psn = psn; 1984 if (aeth >> IB_AETH_NAK_SHIFT) 1985 ack_psn--; 1986 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 1987 ibp = rcd_to_iport(rcd); 1988 1989 /* 1990 * The MSN might be for a later WQE than the PSN indicates so 1991 * only complete WQEs that the PSN finishes. 1992 */ 1993 while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) { 1994 /* 1995 * RDMA_READ_RESPONSE_ONLY is a special case since 1996 * we want to generate completion events for everything 1997 * before the RDMA read, copy the data, then generate 1998 * the completion for the read. 1999 */ 2000 if (wqe->wr.opcode == IB_WR_RDMA_READ && 2001 opcode == OP(RDMA_READ_RESPONSE_ONLY) && 2002 diff == 0) { 2003 ret = 1; 2004 goto bail_stop; 2005 } 2006 /* 2007 * If this request is a RDMA read or atomic, and the ACK is 2008 * for a later operation, this ACK NAKs the RDMA read or 2009 * atomic. In other words, only a RDMA_READ_LAST or ONLY 2010 * can ACK a RDMA read and likewise for atomic ops. Note 2011 * that the NAK case can only happen if relaxed ordering is 2012 * used and requests are sent after an RDMA read or atomic 2013 * is sent but before the response is received. 2014 */ 2015 if ((wqe->wr.opcode == IB_WR_RDMA_READ && 2016 (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) || 2017 (wqe->wr.opcode == IB_WR_TID_RDMA_READ && 2018 (opcode != TID_OP(READ_RESP) || diff != 0)) || 2019 ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || 2020 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && 2021 (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) || 2022 (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && 2023 (delta_psn(psn, qp->s_last_psn) != 1))) { 2024 set_restart_qp(qp, rcd); 2025 /* 2026 * No need to process the ACK/NAK since we are 2027 * restarting an earlier request. 2028 */ 2029 goto bail_stop; 2030 } 2031 if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || 2032 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { 2033 u64 *vaddr = wqe->sg_list[0].vaddr; 2034 *vaddr = val; 2035 } 2036 if (wqe->wr.opcode == IB_WR_OPFN) 2037 opfn_conn_reply(qp, val); 2038 2039 if (qp->s_num_rd_atomic && 2040 (wqe->wr.opcode == IB_WR_RDMA_READ || 2041 wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || 2042 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { 2043 qp->s_num_rd_atomic--; 2044 /* Restart sending task if fence is complete */ 2045 if ((qp->s_flags & RVT_S_WAIT_FENCE) && 2046 !qp->s_num_rd_atomic) { 2047 qp->s_flags &= ~(RVT_S_WAIT_FENCE | 2048 RVT_S_WAIT_ACK); 2049 hfi1_schedule_send(qp); 2050 } else if (qp->s_flags & RVT_S_WAIT_RDMAR) { 2051 qp->s_flags &= ~(RVT_S_WAIT_RDMAR | 2052 RVT_S_WAIT_ACK); 2053 hfi1_schedule_send(qp); 2054 } 2055 } 2056 2057 /* 2058 * TID RDMA WRITE requests will be completed by the TID RDMA 2059 * ACK packet handler (see tid_rdma.c). 
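 * Hence the loop below breaks out at such a WQE instead of calling
 * do_rc_completion(), leaving s_acked pointing at the TID RDMA
 * WRITE request until that handler retires it.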
2060 */ 2061 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) 2062 break; 2063 2064 wqe = do_rc_completion(qp, wqe, ibp); 2065 if (qp->s_acked == qp->s_tail) 2066 break; 2067 } 2068 2069 trace_hfi1_rc_ack_do(qp, aeth, psn, wqe); 2070 trace_hfi1_sender_do_rc_ack(qp); 2071 switch (aeth >> IB_AETH_NAK_SHIFT) { 2072 case 0: /* ACK */ 2073 this_cpu_inc(*ibp->rvp.rc_acks); 2074 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) { 2075 if (wqe_to_tid_req(wqe)->ack_pending) 2076 rvt_mod_retry_timer_ext(qp, 2077 qpriv->timeout_shift); 2078 else 2079 rvt_stop_rc_timers(qp); 2080 } else if (qp->s_acked != qp->s_tail) { 2081 struct rvt_swqe *__w = NULL; 2082 2083 if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID) 2084 __w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur); 2085 2086 /* 2087 * Stop timers if we've received all of the TID RDMA 2088 * WRITE * responses. 2089 */ 2090 if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE && 2091 opcode == TID_OP(WRITE_RESP)) { 2092 /* 2093 * Normally, the loop above would correctly 2094 * process all WQEs from s_acked onward and 2095 * either complete them or check for correct 2096 * PSN sequencing. 2097 * However, for TID RDMA, due to pipelining, 2098 * the response may not be for the request at 2099 * s_acked so the above look would just be 2100 * skipped. This does not allow for checking 2101 * the PSN sequencing. It has to be done 2102 * separately. 2103 */ 2104 if (cmp_psn(psn, qp->s_last_psn + 1)) { 2105 set_restart_qp(qp, rcd); 2106 goto bail_stop; 2107 } 2108 /* 2109 * If the psn is being resent, stop the 2110 * resending. 2111 */ 2112 if (qp->s_cur != qp->s_tail && 2113 cmp_psn(qp->s_psn, psn) <= 0) 2114 update_qp_retry_state(qp, psn, 2115 __w->psn, 2116 __w->lpsn); 2117 else if (--qpriv->pending_tid_w_resp) 2118 rvt_mod_retry_timer(qp); 2119 else 2120 rvt_stop_rc_timers(qp); 2121 } else { 2122 /* 2123 * We are expecting more ACKs so 2124 * mod the retry timer. 2125 */ 2126 rvt_mod_retry_timer(qp); 2127 /* 2128 * We can stop re-sending the earlier packets 2129 * and continue with the next packet the 2130 * receiver wants. 2131 */ 2132 if (cmp_psn(qp->s_psn, psn) <= 0) 2133 reset_psn(qp, psn + 1); 2134 } 2135 } else { 2136 /* No more acks - kill all timers */ 2137 rvt_stop_rc_timers(qp); 2138 if (cmp_psn(qp->s_psn, psn) <= 0) { 2139 qp->s_state = OP(SEND_LAST); 2140 qp->s_psn = psn + 1; 2141 } 2142 } 2143 if (qp->s_flags & RVT_S_WAIT_ACK) { 2144 qp->s_flags &= ~RVT_S_WAIT_ACK; 2145 hfi1_schedule_send(qp); 2146 } 2147 rvt_get_credit(qp, aeth); 2148 qp->s_rnr_retry = qp->s_rnr_retry_cnt; 2149 qp->s_retry = qp->s_retry_cnt; 2150 /* 2151 * If the current request is a TID RDMA WRITE request and the 2152 * response is not a TID RDMA WRITE RESP packet, s_last_psn 2153 * can't be advanced. 2154 */ 2155 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && 2156 opcode != TID_OP(WRITE_RESP) && 2157 cmp_psn(psn, wqe->psn) >= 0) 2158 return 1; 2159 update_last_psn(qp, psn); 2160 return 1; 2161 2162 case 1: /* RNR NAK */ 2163 ibp->rvp.n_rnr_naks++; 2164 if (qp->s_acked == qp->s_tail) 2165 goto bail_stop; 2166 if (qp->s_flags & RVT_S_WAIT_RNR) 2167 goto bail_stop; 2168 rdi = ib_to_rvt(qp->ibqp.device); 2169 if (!(rdi->post_parms[wqe->wr.opcode].flags & 2170 RVT_OPERATION_IGN_RNR_CNT)) { 2171 if (qp->s_rnr_retry == 0) { 2172 status = IB_WC_RNR_RETRY_EXC_ERR; 2173 goto class_b; 2174 } 2175 if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0) 2176 qp->s_rnr_retry--; 2177 } 2178 2179 /* 2180 * The last valid PSN is the previous PSN. 
For TID RDMA WRITE 2181 * request, s_last_psn should be incremented only when a TID 2182 * RDMA WRITE RESP is received to avoid skipping lost TID RDMA 2183 * WRITE RESP packets. 2184 */ 2185 if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { 2186 reset_psn(qp, qp->s_last_psn + 1); 2187 } else { 2188 update_last_psn(qp, psn - 1); 2189 reset_psn(qp, psn); 2190 } 2191 2192 ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); 2193 qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK); 2194 rvt_stop_rc_timers(qp); 2195 rvt_add_rnr_timer(qp, aeth); 2196 return 0; 2197 2198 case 3: /* NAK */ 2199 if (qp->s_acked == qp->s_tail) 2200 goto bail_stop; 2201 /* The last valid PSN is the previous PSN. */ 2202 update_last_psn(qp, psn - 1); 2203 switch ((aeth >> IB_AETH_CREDIT_SHIFT) & 2204 IB_AETH_CREDIT_MASK) { 2205 case 0: /* PSN sequence error */ 2206 ibp->rvp.n_seq_naks++; 2207 /* 2208 * Back up to the responder's expected PSN. 2209 * Note that we might get a NAK in the middle of an 2210 * RDMA READ response which terminates the RDMA 2211 * READ. 2212 */ 2213 hfi1_restart_rc(qp, psn, 0); 2214 hfi1_schedule_send(qp); 2215 break; 2216 2217 case 1: /* Invalid Request */ 2218 status = IB_WC_REM_INV_REQ_ERR; 2219 ibp->rvp.n_other_naks++; 2220 goto class_b; 2221 2222 case 2: /* Remote Access Error */ 2223 status = IB_WC_REM_ACCESS_ERR; 2224 ibp->rvp.n_other_naks++; 2225 goto class_b; 2226 2227 case 3: /* Remote Operation Error */ 2228 status = IB_WC_REM_OP_ERR; 2229 ibp->rvp.n_other_naks++; 2230class_b: 2231 if (qp->s_last == qp->s_acked) { 2232 if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) 2233 hfi1_kern_read_tid_flow_free(qp); 2234 2235 hfi1_trdma_send_complete(qp, wqe, status); 2236 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); 2237 } 2238 break; 2239 2240 default: 2241 /* Ignore other reserved NAK error codes */ 2242 goto reserved; 2243 } 2244 qp->s_retry = qp->s_retry_cnt; 2245 qp->s_rnr_retry = qp->s_rnr_retry_cnt; 2246 goto bail_stop; 2247 2248 default: /* 2: reserved */ 2249reserved: 2250 /* Ignore reserved NAK codes. */ 2251 goto bail_stop; 2252 } 2253 /* cannot be reached */ 2254bail_stop: 2255 rvt_stop_rc_timers(qp); 2256 return ret; 2257} 2258 2259/* 2260 * We have seen an out of sequence RDMA read middle or last packet. 2261 * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE. 2262 */ 2263static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn, 2264 struct hfi1_ctxtdata *rcd) 2265{ 2266 struct rvt_swqe *wqe; 2267 2268 lockdep_assert_held(&qp->s_lock); 2269 /* Remove QP from retry timer */ 2270 rvt_stop_rc_timers(qp); 2271 2272 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 2273 2274 while (cmp_psn(psn, wqe->lpsn) > 0) { 2275 if (wqe->wr.opcode == IB_WR_RDMA_READ || 2276 wqe->wr.opcode == IB_WR_TID_RDMA_READ || 2277 wqe->wr.opcode == IB_WR_TID_RDMA_WRITE || 2278 wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || 2279 wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) 2280 break; 2281 wqe = do_rc_completion(qp, wqe, ibp); 2282 } 2283 2284 ibp->rvp.n_rdma_seq++; 2285 qp->r_flags |= RVT_R_RDMAR_SEQ; 2286 hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); 2287 if (list_empty(&qp->rspwait)) { 2288 qp->r_flags |= RVT_R_RSP_SEND; 2289 rvt_get_qp(qp); 2290 list_add_tail(&qp->rspwait, &rcd->qp_wait_list); 2291 } 2292} 2293 2294/** 2295 * rc_rcv_resp - process an incoming RC response packet 2296 * @packet: data packet information 2297 * 2298 * This is called from hfi1_rc_rcv() to process an incoming RC response 2299 * packet for the given QP. 2300 * Called at interrupt level. 
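 *
 * A response is acted on only when its PSN lies strictly between
 * s_last_psn and s_next_psn; older PSNs are duplicates (a "ghost"
 * ACK may still update the credit count) and newer ones are
 * invalid, and both are dropped.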
2301 */ 2302static void rc_rcv_resp(struct hfi1_packet *packet) 2303{ 2304 struct hfi1_ctxtdata *rcd = packet->rcd; 2305 void *data = packet->payload; 2306 u32 tlen = packet->tlen; 2307 struct rvt_qp *qp = packet->qp; 2308 struct hfi1_ibport *ibp; 2309 struct ib_other_headers *ohdr = packet->ohdr; 2310 struct rvt_swqe *wqe; 2311 enum ib_wc_status status; 2312 unsigned long flags; 2313 int diff; 2314 u64 val; 2315 u32 aeth; 2316 u32 psn = ib_bth_get_psn(packet->ohdr); 2317 u32 pmtu = qp->pmtu; 2318 u16 hdrsize = packet->hlen; 2319 u8 opcode = packet->opcode; 2320 u8 pad = packet->pad; 2321 u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2); 2322 2323 spin_lock_irqsave(&qp->s_lock, flags); 2324 trace_hfi1_ack(qp, psn); 2325 2326 /* Ignore invalid responses. */ 2327 if (cmp_psn(psn, READ_ONCE(qp->s_next_psn)) >= 0) 2328 goto ack_done; 2329 2330 /* Ignore duplicate responses. */ 2331 diff = cmp_psn(psn, qp->s_last_psn); 2332 if (unlikely(diff <= 0)) { 2333 /* Update credits for "ghost" ACKs */ 2334 if (diff == 0 && opcode == OP(ACKNOWLEDGE)) { 2335 aeth = be32_to_cpu(ohdr->u.aeth); 2336 if ((aeth >> IB_AETH_NAK_SHIFT) == 0) 2337 rvt_get_credit(qp, aeth); 2338 } 2339 goto ack_done; 2340 } 2341 2342 /* 2343 * Skip everything other than the PSN we expect, if we are waiting 2344 * for a reply to a restarted RDMA read or atomic op. 2345 */ 2346 if (qp->r_flags & RVT_R_RDMAR_SEQ) { 2347 if (cmp_psn(psn, qp->s_last_psn + 1) != 0) 2348 goto ack_done; 2349 qp->r_flags &= ~RVT_R_RDMAR_SEQ; 2350 } 2351 2352 if (unlikely(qp->s_acked == qp->s_tail)) 2353 goto ack_done; 2354 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 2355 status = IB_WC_SUCCESS; 2356 2357 switch (opcode) { 2358 case OP(ACKNOWLEDGE): 2359 case OP(ATOMIC_ACKNOWLEDGE): 2360 case OP(RDMA_READ_RESPONSE_FIRST): 2361 aeth = be32_to_cpu(ohdr->u.aeth); 2362 if (opcode == OP(ATOMIC_ACKNOWLEDGE)) 2363 val = ib_u64_get(&ohdr->u.at.atomic_ack_eth); 2364 else 2365 val = 0; 2366 if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) || 2367 opcode != OP(RDMA_READ_RESPONSE_FIRST)) 2368 goto ack_done; 2369 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 2370 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) 2371 goto ack_op_err; 2372 /* 2373 * If this is a response to a resent RDMA read, we 2374 * have to be careful to copy the data to the right 2375 * location. 2376 */ 2377 qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, 2378 wqe, psn, pmtu); 2379 goto read_middle; 2380 2381 case OP(RDMA_READ_RESPONSE_MIDDLE): 2382 /* no AETH, no ACK */ 2383 if (unlikely(cmp_psn(psn, qp->s_last_psn + 1))) 2384 goto ack_seq_err; 2385 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) 2386 goto ack_op_err; 2387read_middle: 2388 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) 2389 goto ack_len_err; 2390 if (unlikely(pmtu >= qp->s_rdma_read_len)) 2391 goto ack_len_err; 2392 2393 /* 2394 * We got a response so update the timeout. 2395 * 4.096 usec. * (1 << qp->timeout) 2396 */ 2397 rvt_mod_retry_timer(qp); 2398 if (qp->s_flags & RVT_S_WAIT_ACK) { 2399 qp->s_flags &= ~RVT_S_WAIT_ACK; 2400 hfi1_schedule_send(qp); 2401 } 2402 2403 if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE)) 2404 qp->s_retry = qp->s_retry_cnt; 2405 2406 /* 2407 * Update the RDMA receive state but do the copy w/o 2408 * holding the locks and blocking interrupts. 
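 * The bookkeeping (s_rdma_read_len, update_last_psn()) is finished
 * while the lock is still held; only the rvt_copy_sge() of the
 * pmtu-sized payload runs after the unlock, so interrupts are not
 * blocked for the duration of the data copy.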
2409 */ 2410 qp->s_rdma_read_len -= pmtu; 2411 update_last_psn(qp, psn); 2412 spin_unlock_irqrestore(&qp->s_lock, flags); 2413 rvt_copy_sge(qp, &qp->s_rdma_read_sge, 2414 data, pmtu, false, false); 2415 goto bail; 2416 2417 case OP(RDMA_READ_RESPONSE_ONLY): 2418 aeth = be32_to_cpu(ohdr->u.aeth); 2419 if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) 2420 goto ack_done; 2421 /* 2422 * Check that the data size is >= 0 && <= pmtu. 2423 * Remember to account for ICRC (4). 2424 */ 2425 if (unlikely(tlen < (hdrsize + extra_bytes))) 2426 goto ack_len_err; 2427 /* 2428 * If this is a response to a resent RDMA read, we 2429 * have to be careful to copy the data to the right 2430 * location. 2431 */ 2432 wqe = rvt_get_swqe_ptr(qp, qp->s_acked); 2433 qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, 2434 wqe, psn, pmtu); 2435 goto read_last; 2436 2437 case OP(RDMA_READ_RESPONSE_LAST): 2438 /* ACKs READ req. */ 2439 if (unlikely(cmp_psn(psn, qp->s_last_psn + 1))) 2440 goto ack_seq_err; 2441 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) 2442 goto ack_op_err; 2443 /* 2444 * Check that the data size is >= 1 && <= pmtu. 2445 * Remember to account for ICRC (4). 2446 */ 2447 if (unlikely(tlen <= (hdrsize + extra_bytes))) 2448 goto ack_len_err; 2449read_last: 2450 tlen -= hdrsize + extra_bytes; 2451 if (unlikely(tlen != qp->s_rdma_read_len)) 2452 goto ack_len_err; 2453 aeth = be32_to_cpu(ohdr->u.aeth); 2454 rvt_copy_sge(qp, &qp->s_rdma_read_sge, 2455 data, tlen, false, false); 2456 WARN_ON(qp->s_rdma_read_sge.num_sge); 2457 (void)do_rc_ack(qp, aeth, psn, 2458 OP(RDMA_READ_RESPONSE_LAST), 0, rcd); 2459 goto ack_done; 2460 } 2461 2462ack_op_err: 2463 status = IB_WC_LOC_QP_OP_ERR; 2464 goto ack_err; 2465 2466ack_seq_err: 2467 ibp = rcd_to_iport(rcd); 2468 rdma_seq_err(qp, ibp, psn, rcd); 2469 goto ack_done; 2470 2471ack_len_err: 2472 status = IB_WC_LOC_LEN_ERR; 2473ack_err: 2474 if (qp->s_last == qp->s_acked) { 2475 rvt_send_complete(qp, wqe, status); 2476 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); 2477 } 2478ack_done: 2479 spin_unlock_irqrestore(&qp->s_lock, flags); 2480bail: 2481 return; 2482} 2483 2484static inline void rc_cancel_ack(struct rvt_qp *qp) 2485{ 2486 qp->r_adefered = 0; 2487 if (list_empty(&qp->rspwait)) 2488 return; 2489 list_del_init(&qp->rspwait); 2490 qp->r_flags &= ~RVT_R_RSP_NAK; 2491 rvt_put_qp(qp); 2492} 2493 2494/** 2495 * rc_rcv_error - process an incoming duplicate or error RC packet 2496 * @ohdr: the other headers for this packet 2497 * @data: the packet data 2498 * @qp: the QP for this packet 2499 * @opcode: the opcode for this packet 2500 * @psn: the packet sequence number for this packet 2501 * @diff: the difference between the PSN and the expected PSN 2502 * @rcd: the receive context 2503 * 2504 * This is called from hfi1_rc_rcv() to process an unexpected 2505 * incoming RC packet for the given QP. 2506 * Called at interrupt level. 2507 * Return 1 if no more processing is needed; otherwise return 0 to 2508 * schedule a response to be sent. 2509 */ 2510static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, 2511 struct rvt_qp *qp, u32 opcode, u32 psn, 2512 int diff, struct hfi1_ctxtdata *rcd) 2513{ 2514 struct hfi1_ibport *ibp = rcd_to_iport(rcd); 2515 struct rvt_ack_entry *e; 2516 unsigned long flags; 2517 u8 prev; 2518 u8 mra; /* most recent ACK */ 2519 bool old_req; 2520 2521 trace_hfi1_rcv_error(qp, psn); 2522 if (diff > 0) { 2523 /* 2524 * Packet sequence error. 2525 * A NAK will ACK earlier sends and RDMA writes. 
2526 * Don't queue the NAK if we already sent one. 2527 */ 2528 if (!qp->r_nak_state) { 2529 ibp->rvp.n_rc_seqnak++; 2530 qp->r_nak_state = IB_NAK_PSN_ERROR; 2531 /* Use the expected PSN. */ 2532 qp->r_ack_psn = qp->r_psn; 2533 /* 2534 * Wait to send the sequence NAK until all packets 2535 * in the receive queue have been processed. 2536 * Otherwise, we end up propagating congestion. 2537 */ 2538 rc_defered_ack(rcd, qp); 2539 } 2540 goto done; 2541 } 2542 2543 /* 2544 * Handle a duplicate request. Don't re-execute SEND, RDMA 2545 * write or atomic op. Don't NAK errors, just silently drop 2546 * the duplicate request. Note that r_sge, r_len, and 2547 * r_rcv_len may be in use so don't modify them. 2548 * 2549 * We are supposed to ACK the earliest duplicate PSN but we 2550 * can coalesce an outstanding duplicate ACK. We have to 2551 * send the earliest so that RDMA reads can be restarted at 2552 * the requester's expected PSN. 2553 * 2554 * First, find where this duplicate PSN falls within the 2555 * ACKs previously sent. 2556 * old_req is true if there is an older response that is scheduled 2557 * to be sent before sending this one. 2558 */ 2559 e = NULL; 2560 old_req = true; 2561 ibp->rvp.n_rc_dupreq++; 2562 2563 spin_lock_irqsave(&qp->s_lock, flags); 2564 2565 e = find_prev_entry(qp, psn, &prev, &mra, &old_req); 2566 2567 switch (opcode) { 2568 case OP(RDMA_READ_REQUEST): { 2569 struct ib_reth *reth; 2570 u32 offset; 2571 u32 len; 2572 2573 /* 2574 * If we didn't find the RDMA read request in the ack queue, 2575 * we can ignore this request. 2576 */ 2577 if (!e || e->opcode != OP(RDMA_READ_REQUEST)) 2578 goto unlock_done; 2579 /* RETH comes after BTH */ 2580 reth = &ohdr->u.rc.reth; 2581 /* 2582 * Address range must be a subset of the original 2583 * request and start on pmtu boundaries. 2584 * We reuse the old ack_queue slot since the requester 2585 * should not back up and request an earlier PSN for the 2586 * same request. 2587 */ 2588 offset = delta_psn(psn, e->psn) * qp->pmtu; 2589 len = be32_to_cpu(reth->length); 2590 if (unlikely(offset + len != e->rdma_sge.sge_length)) 2591 goto unlock_done; 2592 release_rdma_sge_mr(e); 2593 if (len != 0) { 2594 u32 rkey = be32_to_cpu(reth->rkey); 2595 u64 vaddr = get_ib_reth_vaddr(reth); 2596 int ok; 2597 2598 ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, 2599 IB_ACCESS_REMOTE_READ); 2600 if (unlikely(!ok)) 2601 goto unlock_done; 2602 } else { 2603 e->rdma_sge.vaddr = NULL; 2604 e->rdma_sge.length = 0; 2605 e->rdma_sge.sge_length = 0; 2606 } 2607 e->psn = psn; 2608 if (old_req) 2609 goto unlock_done; 2610 if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) 2611 qp->s_acked_ack_queue = prev; 2612 qp->s_tail_ack_queue = prev; 2613 break; 2614 } 2615 2616 case OP(COMPARE_SWAP): 2617 case OP(FETCH_ADD): { 2618 /* 2619 * If we didn't find the atomic request in the ack queue 2620 * or the send engine is already backed up to send an 2621 * earlier entry, we can ignore this request. 2622 */ 2623 if (!e || e->opcode != (u8)opcode || old_req) 2624 goto unlock_done; 2625 if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) 2626 qp->s_acked_ack_queue = prev; 2627 qp->s_tail_ack_queue = prev; 2628 break; 2629 } 2630 2631 default: 2632 /* 2633 * Ignore this operation if it doesn't request an ACK 2634 * or an earlier RDMA read or atomic is going to be resent. 2635 */ 2636 if (!(psn & IB_BTH_REQ_ACK) || old_req) 2637 goto unlock_done; 2638 /* 2639 * Resend the most recent ACK if this request is 2640 * after all the previous RDMA reads and atomics. 
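 * "mra" is the most-recent-ACK slot reported by find_prev_entry();
 * if it equals r_head_ack_queue there is no RDMA read or atomic
 * left to replay and a bare ACK is sent, otherwise
 * s_tail_ack_queue is backed up to that slot so its response is
 * regenerated.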
2641 */ 2642 if (mra == qp->r_head_ack_queue) { 2643 spin_unlock_irqrestore(&qp->s_lock, flags); 2644 qp->r_nak_state = 0; 2645 qp->r_ack_psn = qp->r_psn - 1; 2646 goto send_ack; 2647 } 2648 2649 /* 2650 * Resend the RDMA read or atomic op which 2651 * ACKs this duplicate request. 2652 */ 2653 if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) 2654 qp->s_acked_ack_queue = mra; 2655 qp->s_tail_ack_queue = mra; 2656 break; 2657 } 2658 qp->s_ack_state = OP(ACKNOWLEDGE); 2659 qp->s_flags |= RVT_S_RESP_PENDING; 2660 qp->r_nak_state = 0; 2661 hfi1_schedule_send(qp); 2662 2663unlock_done: 2664 spin_unlock_irqrestore(&qp->s_lock, flags); 2665done: 2666 return 1; 2667 2668send_ack: 2669 return 0; 2670} 2671 2672static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, 2673 u32 lqpn, u32 rqpn, u8 svc_type) 2674{ 2675 struct opa_hfi1_cong_log_event_internal *cc_event; 2676 unsigned long flags; 2677 2678 if (sl >= OPA_MAX_SLS) 2679 return; 2680 2681 spin_lock_irqsave(&ppd->cc_log_lock, flags); 2682 2683 ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8); 2684 ppd->threshold_event_counter++; 2685 2686 cc_event = &ppd->cc_events[ppd->cc_log_idx++]; 2687 if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS) 2688 ppd->cc_log_idx = 0; 2689 cc_event->lqpn = lqpn & RVT_QPN_MASK; 2690 cc_event->rqpn = rqpn & RVT_QPN_MASK; 2691 cc_event->sl = sl; 2692 cc_event->svc_type = svc_type; 2693 cc_event->rlid = rlid; 2694 /* keep timestamp in units of 1.024 usec */ 2695 cc_event->timestamp = ktime_get_ns() / 1024; 2696 2697 spin_unlock_irqrestore(&ppd->cc_log_lock, flags); 2698} 2699 2700void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn, 2701 u32 rqpn, u8 svc_type) 2702{ 2703 struct cca_timer *cca_timer; 2704 u16 ccti, ccti_incr, ccti_timer, ccti_limit; 2705 u8 trigger_threshold; 2706 struct cc_state *cc_state; 2707 unsigned long flags; 2708 2709 if (sl >= OPA_MAX_SLS) 2710 return; 2711 2712 cc_state = get_cc_state(ppd); 2713 2714 if (!cc_state) 2715 return; 2716 2717 /* 2718 * 1) increase CCTI (for this SL) 2719 * 2) select IPG (i.e., call set_link_ipg()) 2720 * 3) start timer 2721 */ 2722 ccti_limit = cc_state->cct.ccti_limit; 2723 ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase; 2724 ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer; 2725 trigger_threshold = 2726 cc_state->cong_setting.entries[sl].trigger_threshold; 2727 2728 spin_lock_irqsave(&ppd->cca_timer_lock, flags); 2729 2730 cca_timer = &ppd->cca_timer[sl]; 2731 if (cca_timer->ccti < ccti_limit) { 2732 if (cca_timer->ccti + ccti_incr <= ccti_limit) 2733 cca_timer->ccti += ccti_incr; 2734 else 2735 cca_timer->ccti = ccti_limit; 2736 set_link_ipg(ppd); 2737 } 2738 2739 ccti = cca_timer->ccti; 2740 2741 if (!hrtimer_active(&cca_timer->hrtimer)) { 2742 /* ccti_timer is in units of 1.024 usec */ 2743 unsigned long nsec = 1024 * ccti_timer; 2744 2745 hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec), 2746 HRTIMER_MODE_REL_PINNED); 2747 } 2748 2749 spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); 2750 2751 if ((trigger_threshold != 0) && (ccti >= trigger_threshold)) 2752 log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type); 2753} 2754 2755/** 2756 * hfi1_rc_rcv - process an incoming RC packet 2757 * @packet: data packet information 2758 * 2759 * This is called from qp_rcv() to process an incoming RC packet 2760 * for the given QP. 2761 * May be called at interrupt level. 
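 *
 * Responses (RDMA_READ_RESPONSE_FIRST through ATOMIC_ACKNOWLEDGE)
 * are handed off to rc_rcv_resp(); for requests, the 24-bit PSN
 * delta against r_psn routes duplicates and sequence errors to
 * rc_rcv_error() before the per-opcode processing below.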
2762 */ 2763void hfi1_rc_rcv(struct hfi1_packet *packet) 2764{ 2765 struct hfi1_ctxtdata *rcd = packet->rcd; 2766 void *data = packet->payload; 2767 u32 tlen = packet->tlen; 2768 struct rvt_qp *qp = packet->qp; 2769 struct hfi1_qp_priv *qpriv = qp->priv; 2770 struct hfi1_ibport *ibp = rcd_to_iport(rcd); 2771 struct ib_other_headers *ohdr = packet->ohdr; 2772 u32 opcode = packet->opcode; 2773 u32 hdrsize = packet->hlen; 2774 u32 psn = ib_bth_get_psn(packet->ohdr); 2775 u32 pad = packet->pad; 2776 struct ib_wc wc; 2777 u32 pmtu = qp->pmtu; 2778 int diff; 2779 struct ib_reth *reth; 2780 unsigned long flags; 2781 int ret; 2782 bool copy_last = false, fecn; 2783 u32 rkey; 2784 u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2); 2785 2786 lockdep_assert_held(&qp->r_lock); 2787 2788 if (hfi1_ruc_check_hdr(ibp, packet)) 2789 return; 2790 2791 fecn = process_ecn(qp, packet); 2792 opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1])); 2793 2794 /* 2795 * Process responses (ACKs) before anything else. Note that the 2796 * packet sequence number will be for something in the send work 2797 * queue rather than the expected receive packet sequence number. 2798 * In other words, this QP is the requester. 2799 */ 2800 if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && 2801 opcode <= OP(ATOMIC_ACKNOWLEDGE)) { 2802 rc_rcv_resp(packet); 2803 return; 2804 } 2805 2806 /* Compute 24 bits worth of difference. */ 2807 diff = delta_psn(psn, qp->r_psn); 2808 if (unlikely(diff)) { 2809 if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd)) 2810 return; 2811 goto send_ack; 2812 } 2813 2814 /* Check for opcode sequence errors. */ 2815 switch (qp->r_state) { 2816 case OP(SEND_FIRST): 2817 case OP(SEND_MIDDLE): 2818 if (opcode == OP(SEND_MIDDLE) || 2819 opcode == OP(SEND_LAST) || 2820 opcode == OP(SEND_LAST_WITH_IMMEDIATE) || 2821 opcode == OP(SEND_LAST_WITH_INVALIDATE)) 2822 break; 2823 goto nack_inv; 2824 2825 case OP(RDMA_WRITE_FIRST): 2826 case OP(RDMA_WRITE_MIDDLE): 2827 if (opcode == OP(RDMA_WRITE_MIDDLE) || 2828 opcode == OP(RDMA_WRITE_LAST) || 2829 opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) 2830 break; 2831 goto nack_inv; 2832 2833 default: 2834 if (opcode == OP(SEND_MIDDLE) || 2835 opcode == OP(SEND_LAST) || 2836 opcode == OP(SEND_LAST_WITH_IMMEDIATE) || 2837 opcode == OP(SEND_LAST_WITH_INVALIDATE) || 2838 opcode == OP(RDMA_WRITE_MIDDLE) || 2839 opcode == OP(RDMA_WRITE_LAST) || 2840 opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) 2841 goto nack_inv; 2842 /* 2843 * Note that it is up to the requester to not send a new 2844 * RDMA read or atomic operation before receiving an ACK 2845 * for the previous operation. 2846 */ 2847 break; 2848 } 2849 2850 if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) 2851 rvt_comm_est(qp); 2852 2853 /* OK, process the packet. */ 2854 switch (opcode) { 2855 case OP(SEND_FIRST): 2856 ret = rvt_get_rwqe(qp, false); 2857 if (ret < 0) 2858 goto nack_op_err; 2859 if (!ret) 2860 goto rnr_nak; 2861 qp->r_rcv_len = 0; 2862 fallthrough; 2863 case OP(SEND_MIDDLE): 2864 case OP(RDMA_WRITE_MIDDLE): 2865send_middle: 2866 /* Check for invalid length PMTU or posted rwqe len. 
*/ 2867 /* 2868 * There will be no padding for 9B packet but 16B packets 2869 * will come in with some padding since we always add 2870 * CRC and LT bytes which will need to be flit aligned 2871 */ 2872 if (unlikely(tlen != (hdrsize + pmtu + extra_bytes))) 2873 goto nack_inv; 2874 qp->r_rcv_len += pmtu; 2875 if (unlikely(qp->r_rcv_len > qp->r_len)) 2876 goto nack_inv; 2877 rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); 2878 break; 2879 2880 case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): 2881 /* consume RWQE */ 2882 ret = rvt_get_rwqe(qp, true); 2883 if (ret < 0) 2884 goto nack_op_err; 2885 if (!ret) 2886 goto rnr_nak; 2887 goto send_last_imm; 2888 2889 case OP(SEND_ONLY): 2890 case OP(SEND_ONLY_WITH_IMMEDIATE): 2891 case OP(SEND_ONLY_WITH_INVALIDATE): 2892 ret = rvt_get_rwqe(qp, false); 2893 if (ret < 0) 2894 goto nack_op_err; 2895 if (!ret) 2896 goto rnr_nak; 2897 qp->r_rcv_len = 0; 2898 if (opcode == OP(SEND_ONLY)) 2899 goto no_immediate_data; 2900 if (opcode == OP(SEND_ONLY_WITH_INVALIDATE)) 2901 goto send_last_inv; 2902 fallthrough; /* for SEND_ONLY_WITH_IMMEDIATE */ 2903 case OP(SEND_LAST_WITH_IMMEDIATE): 2904send_last_imm: 2905 wc.ex.imm_data = ohdr->u.imm_data; 2906 wc.wc_flags = IB_WC_WITH_IMM; 2907 goto send_last; 2908 case OP(SEND_LAST_WITH_INVALIDATE): 2909send_last_inv: 2910 rkey = be32_to_cpu(ohdr->u.ieth); 2911 if (rvt_invalidate_rkey(qp, rkey)) 2912 goto no_immediate_data; 2913 wc.ex.invalidate_rkey = rkey; 2914 wc.wc_flags = IB_WC_WITH_INVALIDATE; 2915 goto send_last; 2916 case OP(RDMA_WRITE_LAST): 2917 copy_last = rvt_is_user_qp(qp); 2918 fallthrough; 2919 case OP(SEND_LAST): 2920no_immediate_data: 2921 wc.wc_flags = 0; 2922 wc.ex.imm_data = 0; 2923send_last: 2924 /* Check for invalid length. */ 2925 /* LAST len should be >= 1 */ 2926 if (unlikely(tlen < (hdrsize + extra_bytes))) 2927 goto nack_inv; 2928 /* Don't count the CRC(and padding and LT byte for 16B). */ 2929 tlen -= (hdrsize + extra_bytes); 2930 wc.byte_len = tlen + qp->r_rcv_len; 2931 if (unlikely(wc.byte_len > qp->r_len)) 2932 goto nack_inv; 2933 rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, copy_last); 2934 rvt_put_ss(&qp->r_sge); 2935 qp->r_msn++; 2936 if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) 2937 break; 2938 wc.wr_id = qp->r_wr_id; 2939 wc.status = IB_WC_SUCCESS; 2940 if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || 2941 opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) 2942 wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; 2943 else 2944 wc.opcode = IB_WC_RECV; 2945 wc.qp = &qp->ibqp; 2946 wc.src_qp = qp->remote_qpn; 2947 wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX; 2948 /* 2949 * It seems that IB mandates the presence of an SL in a 2950 * work completion only for the UD transport (see section 2951 * 11.4.2 of IBTA Vol. 1). 2952 * 2953 * However, the way the SL is chosen below is consistent 2954 * with the way that IB/qib works and is trying avoid 2955 * introducing incompatibilities. 2956 * 2957 * See also OPA Vol. 1, section 9.7.6, and table 9-17. 2958 */ 2959 wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); 2960 /* zero fields that are N/A */ 2961 wc.vendor_err = 0; 2962 wc.pkey_index = 0; 2963 wc.dlid_path_bits = 0; 2964 wc.port_num = 0; 2965 /* Signal completion event if the solicited bit is set. 
*/ 2966 rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr)); 2967 break; 2968 2969 case OP(RDMA_WRITE_ONLY): 2970 copy_last = rvt_is_user_qp(qp); 2971 fallthrough; 2972 case OP(RDMA_WRITE_FIRST): 2973 case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): 2974 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) 2975 goto nack_inv; 2976 /* consume RWQE */ 2977 reth = &ohdr->u.rc.reth; 2978 qp->r_len = be32_to_cpu(reth->length); 2979 qp->r_rcv_len = 0; 2980 qp->r_sge.sg_list = NULL; 2981 if (qp->r_len != 0) { 2982 u32 rkey = be32_to_cpu(reth->rkey); 2983 u64 vaddr = get_ib_reth_vaddr(reth); 2984 int ok; 2985 2986 /* Check rkey & NAK */ 2987 ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr, 2988 rkey, IB_ACCESS_REMOTE_WRITE); 2989 if (unlikely(!ok)) 2990 goto nack_acc; 2991 qp->r_sge.num_sge = 1; 2992 } else { 2993 qp->r_sge.num_sge = 0; 2994 qp->r_sge.sge.mr = NULL; 2995 qp->r_sge.sge.vaddr = NULL; 2996 qp->r_sge.sge.length = 0; 2997 qp->r_sge.sge.sge_length = 0; 2998 } 2999 if (opcode == OP(RDMA_WRITE_FIRST)) 3000 goto send_middle; 3001 else if (opcode == OP(RDMA_WRITE_ONLY)) 3002 goto no_immediate_data; 3003 ret = rvt_get_rwqe(qp, true); 3004 if (ret < 0) 3005 goto nack_op_err; 3006 if (!ret) { 3007 /* peer will send again */ 3008 rvt_put_ss(&qp->r_sge); 3009 goto rnr_nak; 3010 } 3011 wc.ex.imm_data = ohdr->u.rc.imm_data; 3012 wc.wc_flags = IB_WC_WITH_IMM; 3013 goto send_last; 3014 3015 case OP(RDMA_READ_REQUEST): { 3016 struct rvt_ack_entry *e; 3017 u32 len; 3018 u8 next; 3019 3020 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) 3021 goto nack_inv; 3022 next = qp->r_head_ack_queue + 1; 3023 /* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */ 3024 if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3025 next = 0; 3026 spin_lock_irqsave(&qp->s_lock, flags); 3027 if (unlikely(next == qp->s_acked_ack_queue)) { 3028 if (!qp->s_ack_queue[next].sent) 3029 goto nack_inv_unlck; 3030 update_ack_queue(qp, next); 3031 } 3032 e = &qp->s_ack_queue[qp->r_head_ack_queue]; 3033 release_rdma_sge_mr(e); 3034 reth = &ohdr->u.rc.reth; 3035 len = be32_to_cpu(reth->length); 3036 if (len) { 3037 u32 rkey = be32_to_cpu(reth->rkey); 3038 u64 vaddr = get_ib_reth_vaddr(reth); 3039 int ok; 3040 3041 /* Check rkey & NAK */ 3042 ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, 3043 rkey, IB_ACCESS_REMOTE_READ); 3044 if (unlikely(!ok)) 3045 goto nack_acc_unlck; 3046 /* 3047 * Update the next expected PSN. We add 1 later 3048 * below, so only add the remainder here. 3049 */ 3050 qp->r_psn += rvt_div_mtu(qp, len - 1); 3051 } else { 3052 e->rdma_sge.mr = NULL; 3053 e->rdma_sge.vaddr = NULL; 3054 e->rdma_sge.length = 0; 3055 e->rdma_sge.sge_length = 0; 3056 } 3057 e->opcode = opcode; 3058 e->sent = 0; 3059 e->psn = psn; 3060 e->lpsn = qp->r_psn; 3061 /* 3062 * We need to increment the MSN here instead of when we 3063 * finish sending the result since a duplicate request would 3064 * increment it more than once. 3065 */ 3066 qp->r_msn++; 3067 qp->r_psn++; 3068 qp->r_state = opcode; 3069 qp->r_nak_state = 0; 3070 qp->r_head_ack_queue = next; 3071 qpriv->r_tid_alloc = qp->r_head_ack_queue; 3072 3073 /* Schedule the send engine. 
*/ 3074 qp->s_flags |= RVT_S_RESP_PENDING; 3075 if (fecn) 3076 qp->s_flags |= RVT_S_ECN; 3077 hfi1_schedule_send(qp); 3078 3079 spin_unlock_irqrestore(&qp->s_lock, flags); 3080 return; 3081 } 3082 3083 case OP(COMPARE_SWAP): 3084 case OP(FETCH_ADD): { 3085 struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth; 3086 u64 vaddr = get_ib_ateth_vaddr(ateth); 3087 bool opfn = opcode == OP(COMPARE_SWAP) && 3088 vaddr == HFI1_VERBS_E_ATOMIC_VADDR; 3089 struct rvt_ack_entry *e; 3090 atomic64_t *maddr; 3091 u64 sdata; 3092 u32 rkey; 3093 u8 next; 3094 3095 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) && 3096 !opfn)) 3097 goto nack_inv; 3098 next = qp->r_head_ack_queue + 1; 3099 if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) 3100 next = 0; 3101 spin_lock_irqsave(&qp->s_lock, flags); 3102 if (unlikely(next == qp->s_acked_ack_queue)) { 3103 if (!qp->s_ack_queue[next].sent) 3104 goto nack_inv_unlck; 3105 update_ack_queue(qp, next); 3106 } 3107 e = &qp->s_ack_queue[qp->r_head_ack_queue]; 3108 release_rdma_sge_mr(e); 3109 /* Process OPFN special virtual address */ 3110 if (opfn) { 3111 opfn_conn_response(qp, e, ateth); 3112 goto ack; 3113 } 3114 if (unlikely(vaddr & (sizeof(u64) - 1))) 3115 goto nack_inv_unlck; 3116 rkey = be32_to_cpu(ateth->rkey); 3117 /* Check rkey & NAK */ 3118 if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), 3119 vaddr, rkey, 3120 IB_ACCESS_REMOTE_ATOMIC))) 3121 goto nack_acc_unlck; 3122 /* Perform atomic OP and save result. */ 3123 maddr = (atomic64_t *)qp->r_sge.sge.vaddr; 3124 sdata = get_ib_ateth_swap(ateth); 3125 e->atomic_data = (opcode == OP(FETCH_ADD)) ? 3126 (u64)atomic64_add_return(sdata, maddr) - sdata : 3127 (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, 3128 get_ib_ateth_compare(ateth), 3129 sdata); 3130 rvt_put_mr(qp->r_sge.sge.mr); 3131 qp->r_sge.num_sge = 0; 3132ack: 3133 e->opcode = opcode; 3134 e->sent = 0; 3135 e->psn = psn; 3136 e->lpsn = psn; 3137 qp->r_msn++; 3138 qp->r_psn++; 3139 qp->r_state = opcode; 3140 qp->r_nak_state = 0; 3141 qp->r_head_ack_queue = next; 3142 qpriv->r_tid_alloc = qp->r_head_ack_queue; 3143 3144 /* Schedule the send engine. */ 3145 qp->s_flags |= RVT_S_RESP_PENDING; 3146 if (fecn) 3147 qp->s_flags |= RVT_S_ECN; 3148 hfi1_schedule_send(qp); 3149 3150 spin_unlock_irqrestore(&qp->s_lock, flags); 3151 return; 3152 } 3153 3154 default: 3155 /* NAK unknown opcodes. */ 3156 goto nack_inv; 3157 } 3158 qp->r_psn++; 3159 qp->r_state = opcode; 3160 qp->r_ack_psn = psn; 3161 qp->r_nak_state = 0; 3162 /* Send an ACK if requested or required. 
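 * ACKs are coalesced: one is sent immediately when this is the only
 * packet in the interrupt batch (numpkt == 0), when a FECN was
 * seen, or when HFI1_PSN_CREDIT acknowledgements have already been
 * deferred; otherwise r_adefered is bumped and the ACK is queued
 * via rc_defered_ack().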
*/ 3163 if (psn & IB_BTH_REQ_ACK || fecn) { 3164 if (packet->numpkt == 0 || fecn || 3165 qp->r_adefered >= HFI1_PSN_CREDIT) { 3166 rc_cancel_ack(qp); 3167 goto send_ack; 3168 } 3169 qp->r_adefered++; 3170 rc_defered_ack(rcd, qp); 3171 } 3172 return; 3173 3174rnr_nak: 3175 qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK; 3176 qp->r_ack_psn = qp->r_psn; 3177 /* Queue RNR NAK for later */ 3178 rc_defered_ack(rcd, qp); 3179 return; 3180 3181nack_op_err: 3182 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 3183 qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; 3184 qp->r_ack_psn = qp->r_psn; 3185 /* Queue NAK for later */ 3186 rc_defered_ack(rcd, qp); 3187 return; 3188 3189nack_inv_unlck: 3190 spin_unlock_irqrestore(&qp->s_lock, flags); 3191nack_inv: 3192 rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); 3193 qp->r_nak_state = IB_NAK_INVALID_REQUEST; 3194 qp->r_ack_psn = qp->r_psn; 3195 /* Queue NAK for later */ 3196 rc_defered_ack(rcd, qp); 3197 return; 3198 3199nack_acc_unlck: 3200 spin_unlock_irqrestore(&qp->s_lock, flags); 3201nack_acc: 3202 rvt_rc_error(qp, IB_WC_LOC_PROT_ERR); 3203 qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; 3204 qp->r_ack_psn = qp->r_psn; 3205send_ack: 3206 hfi1_send_rc_ack(packet, fecn); 3207} 3208 3209void hfi1_rc_hdrerr( 3210 struct hfi1_ctxtdata *rcd, 3211 struct hfi1_packet *packet, 3212 struct rvt_qp *qp) 3213{ 3214 struct hfi1_ibport *ibp = rcd_to_iport(rcd); 3215 int diff; 3216 u32 opcode; 3217 u32 psn; 3218 3219 if (hfi1_ruc_check_hdr(ibp, packet)) 3220 return; 3221 3222 psn = ib_bth_get_psn(packet->ohdr); 3223 opcode = ib_bth_get_opcode(packet->ohdr); 3224 3225 /* Only deal with RDMA Writes for now */ 3226 if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) { 3227 diff = delta_psn(psn, qp->r_psn); 3228 if (!qp->r_nak_state && diff >= 0) { 3229 ibp->rvp.n_rc_seqnak++; 3230 qp->r_nak_state = IB_NAK_PSN_ERROR; 3231 /* Use the expected PSN. */ 3232 qp->r_ack_psn = qp->r_psn; 3233 /* 3234 * Wait to send the sequence 3235 * NAK until all packets 3236 * in the receive queue have 3237 * been processed. 3238 * Otherwise, we end up 3239 * propagating congestion. 3240 */ 3241 rc_defered_ack(rcd, qp); 3242 } /* Out of sequence NAK */ 3243 } /* QP Request NAKs */ 3244}